#!/usr/bin/env python # coding: utf-8 # %% import numpy as np import os from pathlib import Path from tqdm import tqdm from utils.ecg_utils import ( remove_baseline_wander, wavelet_denoise_signal, plot_12_lead_ecg, ) # %% def calculate_means_stds(npy_directory, n): npy_directory = Path(npy_directory) filelist = os.listdir(npy_directory) np.random.shuffle(filelist) full_batch = np.zeros((n, 5000, 12)) count = 0 for i, npy_filename in enumerate(tqdm(filelist[:n])): npy_filepath = npy_directory / npy_filename ekg_numpy_array = np.load(npy_filepath) if ekg_numpy_array.shape[0] != 5000: continue full_batch[count] = ekg_numpy_array count += 1 full_batch = full_batch[:count] # Trim the array to remove unused entries ecg_means = np.mean(full_batch, axis=(0, 1)) ecg_stds = np.std(full_batch, axis=(0, 1)) if ecg_means.shape[0] == ecg_stds.shape[0] == 12: print('Shape of mean and std for ECG normalization are correct!') return ecg_means, ecg_stds # %% # run the function on a list of filenames. def ecg_denoising( raw_directory=raw_directory, output_directory=output_directory, ecg_means = ecg_means, ecg_stds = ecg_stds ): filelist = os.listdir(raw_directory) for i, filename in enumerate(tqdm(filelist[:n])): # Signal processing raw_directory = Path(raw_directory) ecg_filepath = raw_directory / filename ecg_numpy_array = np.load(ecg_filepath) # 1. Wandering baseline removal ecg_numpy_array = remove_baseline_wander( ecg_numpy_array, sampling_frequency=sampling_frequency ) # Discrete wavelet transform denoising for lead in range(12): ecg_numpy_array[:, lead] = wavelet_denoise_signal(ecg_numpy_array[:, lead]) # Lead-wise normalization with precomputed means and standard deviations ecg_numpy_array = (ecg_numpy_array - ecg_means) / ecg_stds np.save(output_directory / filename, ecg_numpy_array) return True # %% def segmentation(data_path, manifest_path, output_path, length, steps): manifest = pd.read_csv(manifest_path) data = [] print('Staring segmenting ECG......') for index in tqdm(range(manifest.shape[0])): mrn = manifest['MRN'].iloc[index] #MRN as column name for medical record number filename = manifest['filename'].iloc[index] #filename as column name for ecg filename k = manifest['TEST_RSLT'].iloc[index] #TEST_RSLT as column name for potassium level ecg_array = np.load(os.path.join(data_path, filename)) ecg_array = ecg_array[:, 0] # assume lead I is the first lead in npy file # Loop through every second as start point: for start in range(0, len(ecg_array), 500*steps): end = start + 500 * length # 500 points for each seconds if start >= 0 and end <= len(ecg_array): sample = ecg_array[start:end] if len(sample) == 500 * length: data.append({'mrn': mrn, 'original_filename': filename, 'ecg': sample, 'label': k}) else: print(f'Different sample size for {filename}: {len(sample)}') df = pd.DataFrame(data) df['filename'] = None if not os.path.exists(output_path): os.makedirs(output_path) print('Saving segmented ECG......') for index, row in df.iterrows(): original_filename = row['original_filename'] ecg_array = row['ecg'] new_file_name = f"{original_filename.rsplit('.', 1)[0]}_{index+1}.npy" new_file_path = os.path.join(output_path, new_file_name) df.at[index, 'filename'] = new_file_name np.save(new_file_path, ecg_array) df.drop(columns=['ecg'], inplace=True) df.to_csv(f'{output_path}/{length}seconds_length_{steps}seconds_step_ecg_manifest.csv') # %% if __name__ == "__main__": npy_directory = "ecg folder for entire database" n = 100000 # the number of ecg for calculating mean and std print('Calculating ECG means and stds........') ecg_means, ecg_stds = calculate_means_stds(npy_directory, n) raw_directory = "path/to/raw_data_directory" #raw ecg folder for target task output_directory = "path/to/output_directory" #output ecg folder for target task print('Denoising and Normalizing ECGs........') ecg_denoising(raw_directory, output_directory, ecg_means, ecg_stds) data_path = output_directory # Output directory from the above step manifest_path = "/path/to/manifest.csv" # Manifest file path output_path = "path/to/output_path" # Output path for segmented ECGs length = 5 # Length of each segment in seconds steps = 1 # Number of seconds step process_ecg(data_path, manifest_path, output_path, length, steps)