import os import random from pathlib import Path from typing import Callable, List, Tuple, Optional, Iterable, Dict, Union from typing_extensions import TypedDict, Unpack, Required, NotRequired import numpy as np import pandas as pd import torch import torchvision.transforms.functional as TF import wandb from torch.utils.data import Dataset import torch.nn as nn def format_mrn(mrn): return str(mrn).strip().zfill(20) class CedarsDatasetTypeAnnotations(TypedDict, total=False): """A dummy class used to make IDE autocomplete and tooltips work properly with how we pass **kwargs through in subclasses of CedarsDataset.""" data_path: Required[Union[Path, str]] manifest_path: Required[Union[Path, str]] split: NotRequired[str] labels: NotRequired[Iterable[str]] extra_inputs: NotRequired[Iterable[str]] update_manifest_func: NotRequired[Callable[[pd.DataFrame], pd.DataFrame]] subsample: NotRequired[Union[Path, str]] augmentations: NotRequired[Union[Iterable[Callable[[torch.Tensor], torch.Tensor]], Callable[[dict], dict], nn.Module]] apply_augmentations_to: NotRequired[Iterable[str]] verify_existing: NotRequired[bool] drop_na_labels: NotRequired[bool] verbose: NotRequired[bool] class CedarsDataset(Dataset): """ Generic parent class for several differnet kinds of common datasets we use here at Cedars CVAIR. Expects to be used in a scenario where you have a big folder full of input examples (videos, ecgs, 3d arrays, images, etc.) and a big CSV that contains metadata and labels for those examples, called a 'manifest'. Args: data_path: Path to a directory full of files you want the dataset to load from. manifest_path: Path to a CSV or Parquet file containing the names, labels, and/or metadata of your files. split: Optional. Allows user to select which split of the manifest to use, assuming the presence of a categorical 'split' column. Defaults to None, meaning that the entire manifest is used by default. extra_inputs: Optional. A list of column names in the manifest that contain additional inputs to the model. Defaults to None. labels: Optional. Name(s) of column(s) in your manifest which contain training labels, in the order you want them returned. If set to None, the dataset will not return any labels, only filenames and inputs. Defaults to None. update_manifest_func: Optional. Allows user to pass in a function to preprocess the manifest after it is loaded, but before the dataset does anything to it. subsample: Optional. A number indicating how many examples to randomly subsample from the manifest. Defaults to None. verbose: Whether to print out progress statements when initializing. Defaults to True. augmentations: Optional. Can be a list of augmentation functions which take in a tensor and return a tensor, a single custom augmentation function which takes in a dict and returns a dict, or a single nn.Module. Defaults to None. apply_augmentations_to: Optional. A list of strings indicating which batch elements to apply augmentations to. Defaults to ("primary_input"). """ def __init__( self, data_path, manifest_path=None, split=None, labels=None, extra_inputs=None, update_manifest_func=None, subsample=None, augmentations=None, apply_augmentations_to=("primary_input",), verify_existing=True, drop_na_labels=True, verbose=True, ): self.data_path = Path(data_path) self.augmentations = augmentations self.apply_augmentations_to = apply_augmentations_to self.extra_inputs = extra_inputs self.labels = labels if isinstance(self.augmentations, nn.Module): self.augmentations = [self.augmentations] if (self.labels is None) and verbose: print( "No label column names were provided, only filenames and inputs will be returned." ) if (self.labels is not None) and isinstance(self.labels, str): self.labels = [self.labels] if (self.extra_inputs is not None) and isinstance(self.extra_inputs, str): self.extra_inputs = [self.extra_inputs] # Read manifest file if manifest_path is not None: self.manifest_path = Path(manifest_path) else: self.manifest_path = self.data_path / "manifest.csv" if self.manifest_path.exists(): if self.manifest_path.suffix == ".csv": self.manifest = pd.read_csv(self.manifest_path, low_memory=False) elif self.manifest_path.suffix == ".parquet": self.manifest = pd.read_parquet(self.manifest_path) else: self.manifest = pd.DataFrame( { "filename": os.listdir(self.data_path), } ) # do manifest processing that's specific to a given task (different from update_manifest_func, # exists as a method overridden in child classes) self.manifest = self.process_manifest(self.manifest) # Apply user-provided update function to manifest if update_manifest_func is not None: self.manifest = update_manifest_func(self, self.manifest) # Usually set to "train", "val", or "test". If set to None, the entire manifest is used. if split is not None: self.manifest = self.manifest[self.manifest["split"] == split] if verbose: print( f"Manifest loaded. \nSplit: {split}\nLength: {len(self.manifest):,}" ) # Make sure all files actually exist. This can be disabled for efficiency if # you have an especially large dataset if verify_existing and "filename" in self.manifest: old_len = len(self.manifest) existing_files = os.listdir(self.data_path) self.manifest = self.manifest[ self.manifest["filename"].isin(existing_files) ] new_len = len(self.manifest) if verbose: print( f"{old_len - new_len} files in the manifest are missing from {self.data_path}." ) elif (not verify_existing) and verbose: print( f"self.verify_existing is set to False, so it's possible for the manifest to contain filenames which are not present in {data_path}" ) # Option to subsample dataset for doing smaller, faster runs if subsample is not None: if isinstance(subsample, int): self.manifest = self.manifest.sample(n=subsample) else: self.manifest = self.manifest.sample(frac=subsample) if verbose: print(f"{subsample} examples subsampled.") # Make sure that there are no NAN labels if (self.labels is not None) and drop_na_labels: old_len = len(self.manifest) self.manifest = self.manifest.dropna(subset=self.labels) new_len = len(self.manifest) if verbose: print( f"{old_len - new_len} examples contained NaN value(s) in their labels and were dropped." ) elif (self.labels is not None) and (not drop_na_labels): print( "drop_na_labels is set to False, so it's possible for the manifest to contain NaN-valued labels." ) # Save manifest to weights and biases run directory if wandb.run is not None: run_data_path = Path(wandb.run.dir).parent / "data" if not run_data_path.is_dir(): run_data_path.mkdir() save_name = "manifest.csv" if split is not None: save_name = f"{split}_{save_name}" self.manifest.to_csv(run_data_path / save_name) if verbose: print(f"Copy of manifest saved to {run_data_path}") def __len__(self) -> int: return len(self.manifest) def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: output = {} row = self.manifest.iloc[index] if "filename" in row: output["filename"] = row["filename"] if self.labels is not None: output["labels"] = torch.FloatTensor(row[self.labels]) file_results = self.read_file(self.data_path / output["filename"], row) if isinstance(file_results, dict): output.update(file_results) else: output["primary_input"] = file_results if self.extra_inputs is not None: output["extra_inputs"] = row["extra_inputs"] if self.augmentations is not None: output = self.augment(output) return output def process_manifest(self, manifest: pd.DataFrame) -> pd.DataFrame: if "mrn" in manifest.columns: manifest["mrn"] = manifest["mrn"].apply(format_mrn) if "study_date" in manifest.columns: manifest["study_date"] = pd.to_datetime(manifest["study_date"]) if "dob" in manifest.columns: manifest["dob"] = pd.to_datetime( manifest["dob"], infer_datetime_format=True, errors="coerce" ) if ("study_date" in manifest.columns) and ("dob" in manifest.columns): manifest["study_age"] = ( manifest["study_date"] - manifest["dob"] ) / np.timedelta64(1, "Y") return manifest def augment(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: if isinstance(self.augmentations, Iterable): # would use torch.stack here for cleanliness, but it seems that torchvision # transforms v1's claims about supporting "arbitrary leading dimensions" is # hogwash. they only support up to 4D. so we have to concatenate along the # channel dimension, then apply the augmentations, then split along the channel # dimension. augmentable_inputs = torch.cat( [output_dict[key] for key in self.apply_augmentations_to], dim=0 ) # (C*N, T, H, W) for aug in self.augmentations: augmentable_inputs = aug(augmentable_inputs) place = 0 for i, key in enumerate(self.apply_augmentations_to): n_channels = output_dict[key].shape[0] output_dict[key] = augmentable_inputs[place:place+n_channels] place += n_channels elif isinstance(self.augmentations, Callable): output_dict = self.augmentations(output_dict) else: raise Exception( "self.augmentations must be either an Iterable of augmentations or a single custom augmentation function." ) return output_dict def read_file(self, filepath: Path, row: Optional[pd.Series] = None) -> torch.Tensor: raise NotImplementedError class ECGDataset(CedarsDataset): def __init__( self, # CedarsDataset params data_path: Union[Path, str], manifest_path: Union[Path, str] = None, split: str = None, labels: Union[List[str], str] = None, update_manifest_func: Callable = None, subsample: float = None, verbose: bool = True, verify_existing: bool = True, drop_na_labels: bool = True, # ECGoDataset params leads: List[str] = None, random_lead: bool = False, # New parameter for random lead selection data_length: int = 5000, **kwargs, ): """ Args: leads: List[str] -- which leads you want passed to the model. Defaults to all 12. """ super().__init__( data_path=data_path, manifest_path=manifest_path, split=split, labels=labels, update_manifest_func=update_manifest_func, subsample=subsample, verbose=verbose, verify_existing=verify_existing, drop_na_labels=drop_na_labels, **kwargs, ) self.lead_order = [ "I", "II", "III", "aVR", "aVL", "aVF", "V1", "V2", "V3", "V4", "V5", "V6", ] self.leads = leads if self.leads is None: self.leads = self.lead_order if isinstance(self.leads, str): self.leads = [self.leads] if "first_lead_only" in kwargs: raise ( Exception( '"first_lead_only" has been deprecated. Please pass leads=["I"] \ instead if you would like to train on only the first lead.' ) ) self.random_lead = random_lead # Storing the random_lead attribute self.data_length = data_length def read_file(self, filepath, row=None): # ECGs are usually stored as .npy files. file = np.load(filepath) if file.shape[0] != 12: file = file.T file = torch.tensor(file).float() # Slice the data to the specified length file = file[:, :self.data_length] if self.random_lead: lead_idx = random.choice(range(12)) file = file[lead_idx:lead_idx+1] # Select the random lead else: channels = [self.lead_order.index(lead) for lead in self.leads] file = file[channels] # Final shape should ideally be NumLeadsxTime(or NumLeadsxTime depending on the resolution of the ECG) return file class ECGSingleLeadDataset(CedarsDataset): def __init__( self, # CedarsDataset params data_path: Union[Path, str], manifest_path: Union[Path, str] = None, labels: Union[List[str], str] = None, update_manifest_func: Callable = None, subsample: float = None, verbose: bool = True, verify_existing: bool = True, drop_na_labels: bool = True, **kwargs, ): """ Args: leads: List[str] -- which leads you want passed to the model. Defaults to all 12. """ super().__init__( data_path=data_path, manifest_path=manifest_path, labels=labels, update_manifest_func=update_manifest_func, subsample=subsample, verbose=verbose, verify_existing=verify_existing, drop_na_labels=drop_na_labels, **kwargs, ) def read_file(self, filepath, row=None): # ECGs are usually stored as .npy files. try: file = np.load(filepath) except Exception as e: print(filepath) print(e) file = torch.tensor(file).float().unsqueeze(0) # Final shape should ideally be NumLeadsxTime(or NumLeadsxTime depending on the resolution of the ECG) return file