gaitsetpy.dataset.physionet
PhysioNet VGRF Dataset Loader. Maintainer: @aharshit123456
This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader. The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.
Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
'''
PhysioNet VGRF Dataset Loader.
Maintainer: @aharshit123456

This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader.
The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with
Parkinson's disease and healthy controls.

Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
'''

import os
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from glob import glob
import requests
from tqdm import tqdm
import zipfile
from ..core.base_classes import BaseDatasetLoader
from .utils import sliding_window


class PhysioNetLoader(BaseDatasetLoader):
    """
    PhysioNet VGRF dataset loader class.

    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's
    disease and healthy controls.
    """

    def __init__(self):
        """Register the loader as "physionet" and set up its metadata and label lists."""
        super().__init__(
            name="physionet",
            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
        )
        self.metadata = {
            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
                        'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
            'sampling_frequency': 100,  # 100 Hz sampling frequency
            'subjects': {
                'Co': 'Control subjects',
                'Pt': 'Parkinson\'s disease patients'
            },
            'window_size': 600,  # 6 seconds at 100 Hz
            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
        }
        # Filled by load_data(): one entry per successfully loaded file.
        self.labels = []
        self.subject_types = []

    def _download_physionet_data(self, data_dir: str) -> str:
        """
        Download PhysioNet dataset if not already present.

        The dataset is considered present as soon as the target directory exists
        and contains at least one entry. Download failures for individual files
        are reported on stdout and do not abort the remaining downloads.

        Args:
            data_dir: Directory to store the dataset

        Returns:
            Path to the downloaded/existing dataset directory
        """
        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")

        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
            print(f"PhysioNet dataset already exists at: {dataset_path}")
            return dataset_path

        os.makedirs(dataset_path, exist_ok=True)

        # Download the dataset files
        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"

        # Get list of files (basic file names based on the reference)
        file_patterns = [
            # Control subjects - Ga prefix
            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
            "GaCo22_01.txt", "GaCo22_10.txt",

            # Parkinson's patients - Ga prefix
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],

            # Control subjects - Ju prefix
            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],

            # Parkinson's patients - Ju prefix
            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],

            # Control subjects - Si prefix
            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],

            # Parkinson's patients - Si prefix
            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
        ]

        print(f"Downloading PhysioNet dataset to {dataset_path}")
        for filename in tqdm(file_patterns, desc="Downloading files"):
            file_url = base_url + filename
            file_path = os.path.join(dataset_path, filename)

            if os.path.exists(file_path):
                continue

            # Stream into a temporary ".part" file and rename on success so an
            # interrupted transfer never leaves a truncated file that a later
            # run would mistake for a complete download.
            partial_path = file_path + ".part"
            try:
                # The timeout keeps a stalled connection from hanging forever;
                # the context manager releases the streamed connection.
                with requests.get(file_url, stream=True, timeout=30) as response:
                    if response.status_code != 200:
                        print(f"Could not download {filename} (status: {response.status_code})")
                        continue
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, file_path)
            except Exception as e:
                print(f"Error downloading {filename}: {e}")
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        return dataset_path

    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Load PhysioNet VGRF dataset from the specified directory.

        Args:
            data_dir: Directory to store/find the dataset
            **kwargs: Optional settings. ``prefixes`` (str or iterable of str)
                selects which file-name prefixes are loaded; it defaults to
                ``('Ga',)``. Pass e.g. ``('Ga', 'Ju', 'Si')`` to also load the
                ``Ju*``/``Si*`` files that the downloader fetches.

        Returns:
            Tuple of (data_list, names_list)
        """
        # Download dataset if needed
        dataset_path = self._download_physionet_data(data_dir)

        prefixes = kwargs.get('prefixes', ('Ga',))
        if isinstance(prefixes, str):
            prefixes = (prefixes,)

        # A set removes duplicates when two prefixes match the same file.
        filepaths = sorted({
            path
            for prefix in prefixes
            for path in glob(os.path.join(dataset_path, f"{prefix}*.txt"))
        })

        # Column 0: time, columns 1-16: VGRF sensors, columns 17-18: extra columns.
        column_names = (
            ['time']
            + [f'VGRF_L{i}' for i in range(1, 9)]
            + [f'VGRF_R{i}' for i in range(1, 9)]
            + ['sensor_17', 'sensor_18']
        )

        physionet_data = []
        physionet_names = []
        self.labels = []
        self.subject_types = []

        for filepath in filepaths:
            filename = os.path.basename(filepath)

            # Extract subject type from filename
            if 'Co' in filename:
                subject_type = 'Control'
                label = 'Co'
            elif 'Pt' in filename:
                subject_type = 'Patient'
                label = 'Pt'
            else:
                continue  # Skip files that don't match expected pattern

            try:
                # Read the file - PhysioNet files are tab-delimited with variable columns
                df = pd.read_csv(filepath, delimiter='\t', header=None)

                # Handle variable number of columns (limit to 19 columns max)
                n_cols = min(df.shape[1], len(column_names))
                df = df.iloc[:, :n_cols]
                df.columns = column_names[:n_cols]

                # Set time as index
                df = df.set_index('time')

                # Add subject metadata
                df['subject_type'] = subject_type
                df['label'] = label

                physionet_data.append(df)
                physionet_names.append(filename)
                self.labels.append(label)
                self.subject_types.append(subject_type)

            except Exception as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f"Error loading {filename}: {e}")
                continue

        # Store loaded data
        self.data = physionet_data
        self.names = physionet_names

        # Convert numpy scalars to plain str/int so the printed dict stays readable.
        kinds, counts = np.unique(self.subject_types, return_counts=True)
        distribution = {str(kind): int(count) for kind, count in zip(kinds, counts)}

        print(f"Loaded {len(physionet_data)} PhysioNet files")
        print(f"Subject distribution: {distribution}")

        return physionet_data, physionet_names

    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str],
                               window_size: int = 600, step_size: int = 100) -> List[Dict]:
        """
        Create sliding windows from the PhysioNet dataset.

        DataFrames without sensor columns or with fewer rows than ``window_size``
        are skipped. A sensor column whose windowing raises is reported on stdout
        and left out of that file's result.

        Args:
            data: List of DataFrames containing PhysioNet data
            names: List of names corresponding to the data
            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
            step_size: Step size for the sliding window (default: 100)

        Returns:
            List of dictionaries containing sliding windows for each DataFrame
        """
        windows_data = []

        for idx, df in enumerate(data):
            # Remove metadata columns for windowing
            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
            df_sensors = df[sensor_columns]

            if df_sensors.empty or len(df_sensors) < window_size:
                continue

            windows = []

            # Create windows for each sensor
            for col in sensor_columns:
                try:
                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
                    windows.append({"name": col, "data": window_data})
                except Exception as e:
                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
                    continue

            if windows:
                windows_data.append({
                    "name": names[idx],
                    "windows": windows,
                    "metadata": {
                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
                        "window_size": window_size,
                        "step_size": step_size,
                        # The window count is taken from the first sensor that was windowed.
                        "num_windows": len(windows[0]["data"])
                    }
                })

        return windows_data

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported file formats for PhysioNet dataset.

        Returns:
            List of supported file extensions
        """
        return ['.txt']

    def get_sensor_info(self) -> Dict[str, object]:
        """
        Get information about sensors in the dataset.

        Returns:
            Dictionary with the sensor names (a copy of the metadata list), the
            sampling frequency and the default window size
        """
        return {
            # Copy so callers cannot mutate the loader's metadata through the result.
            'sensors': list(self.metadata['sensors']),
            'sampling_frequency': self.metadata['sampling_frequency'],
            'window_size': self.metadata['window_size']
        }

    def get_subject_info(self) -> Dict[str, str]:
        """
        Get information about subjects in the dataset.

        Returns:
            Dictionary containing subject information
        """
        return self.metadata['subjects']

    def get_labels(self) -> List[str]:
        """
        Get labels for loaded data.

        Returns:
            List of labels corresponding to loaded data
        """
        return self.labels

    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Filter loaded data by subject type.

        Args:
            subject_type: 'Control' or 'Patient'

        Returns:
            Tuple of (filtered_data, filtered_names)

        Raises:
            ValueError: If no data has been loaded yet
        """
        if not self.data:
            raise ValueError("No data loaded. Call load_data() first.")

        filtered_data = []
        filtered_names = []

        for i, df in enumerate(self.data):
            # The subject type is constant per file, so the first row is representative.
            if df['subject_type'].iloc[0] == subject_type:
                filtered_data.append(df)
                filtered_names.append(self.names[i])

        return filtered_data, filtered_names


# Legacy function for backward compatibility
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Legacy function to load PhysioNet data.

    Args:
        data_dir: Directory in which the dataset is stored (downloaded if missing)

    Returns:
        Tuple of (data_list, names_list)
    """
    loader = PhysioNetLoader()
    return loader.load_data(data_dir)


def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Legacy function to create sliding windows from PhysioNet data.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries
    """
    loader = PhysioNetLoader()
    return loader.create_sliding_windows(data, names, window_size, step_size)
class PhysioNetLoader(BaseDatasetLoader):
    """
    PhysioNet VGRF dataset loader class.

    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's
    disease and healthy controls.
    """

    def __init__(self):
        """Register the loader as "physionet" and set up its metadata and label lists."""
        super().__init__(
            name="physionet",
            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
        )
        self.metadata = {
            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
                        'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
            'sampling_frequency': 100,  # 100 Hz sampling frequency
            'subjects': {
                'Co': 'Control subjects',
                'Pt': 'Parkinson\'s disease patients'
            },
            'window_size': 600,  # 6 seconds at 100 Hz
            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
        }
        # Filled by load_data(): one entry per successfully loaded file.
        self.labels = []
        self.subject_types = []

    def _download_physionet_data(self, data_dir: str) -> str:
        """
        Download PhysioNet dataset if not already present.

        The dataset is considered present as soon as the target directory exists
        and contains at least one entry. Download failures for individual files
        are reported on stdout and do not abort the remaining downloads.

        Args:
            data_dir: Directory to store the dataset

        Returns:
            Path to the downloaded/existing dataset directory
        """
        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")

        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
            print(f"PhysioNet dataset already exists at: {dataset_path}")
            return dataset_path

        os.makedirs(dataset_path, exist_ok=True)

        # Download the dataset files
        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"

        # Get list of files (basic file names based on the reference)
        file_patterns = [
            # Control subjects - Ga prefix
            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
            "GaCo22_01.txt", "GaCo22_10.txt",

            # Parkinson's patients - Ga prefix
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],

            # Control subjects - Ju prefix
            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],

            # Parkinson's patients - Ju prefix
            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],

            # Control subjects - Si prefix
            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],

            # Parkinson's patients - Si prefix
            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
        ]

        print(f"Downloading PhysioNet dataset to {dataset_path}")
        for filename in tqdm(file_patterns, desc="Downloading files"):
            file_url = base_url + filename
            file_path = os.path.join(dataset_path, filename)

            if os.path.exists(file_path):
                continue

            # Stream into a temporary ".part" file and rename on success so an
            # interrupted transfer never leaves a truncated file that a later
            # run would mistake for a complete download.
            partial_path = file_path + ".part"
            try:
                # The timeout keeps a stalled connection from hanging forever;
                # the context manager releases the streamed connection.
                with requests.get(file_url, stream=True, timeout=30) as response:
                    if response.status_code != 200:
                        print(f"Could not download {filename} (status: {response.status_code})")
                        continue
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, file_path)
            except Exception as e:
                print(f"Error downloading {filename}: {e}")
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        return dataset_path

    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Load PhysioNet VGRF dataset from the specified directory.

        Args:
            data_dir: Directory to store/find the dataset
            **kwargs: Optional settings. ``prefixes`` (str or iterable of str)
                selects which file-name prefixes are loaded; it defaults to
                ``('Ga',)``. Pass e.g. ``('Ga', 'Ju', 'Si')`` to also load the
                ``Ju*``/``Si*`` files that the downloader fetches.

        Returns:
            Tuple of (data_list, names_list)
        """
        # Download dataset if needed
        dataset_path = self._download_physionet_data(data_dir)

        prefixes = kwargs.get('prefixes', ('Ga',))
        if isinstance(prefixes, str):
            prefixes = (prefixes,)

        # A set removes duplicates when two prefixes match the same file.
        filepaths = sorted({
            path
            for prefix in prefixes
            for path in glob(os.path.join(dataset_path, f"{prefix}*.txt"))
        })

        # Column 0: time, columns 1-16: VGRF sensors, columns 17-18: extra columns.
        column_names = (
            ['time']
            + [f'VGRF_L{i}' for i in range(1, 9)]
            + [f'VGRF_R{i}' for i in range(1, 9)]
            + ['sensor_17', 'sensor_18']
        )

        physionet_data = []
        physionet_names = []
        self.labels = []
        self.subject_types = []

        for filepath in filepaths:
            filename = os.path.basename(filepath)

            # Extract subject type from filename
            if 'Co' in filename:
                subject_type = 'Control'
                label = 'Co'
            elif 'Pt' in filename:
                subject_type = 'Patient'
                label = 'Pt'
            else:
                continue  # Skip files that don't match expected pattern

            try:
                # Read the file - PhysioNet files are tab-delimited with variable columns
                df = pd.read_csv(filepath, delimiter='\t', header=None)

                # Handle variable number of columns (limit to 19 columns max)
                n_cols = min(df.shape[1], len(column_names))
                df = df.iloc[:, :n_cols]
                df.columns = column_names[:n_cols]

                # Set time as index
                df = df.set_index('time')

                # Add subject metadata
                df['subject_type'] = subject_type
                df['label'] = label

                physionet_data.append(df)
                physionet_names.append(filename)
                self.labels.append(label)
                self.subject_types.append(subject_type)

            except Exception as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f"Error loading {filename}: {e}")
                continue

        # Store loaded data
        self.data = physionet_data
        self.names = physionet_names

        # Convert numpy scalars to plain str/int so the printed dict stays readable.
        kinds, counts = np.unique(self.subject_types, return_counts=True)
        distribution = {str(kind): int(count) for kind, count in zip(kinds, counts)}

        print(f"Loaded {len(physionet_data)} PhysioNet files")
        print(f"Subject distribution: {distribution}")

        return physionet_data, physionet_names

    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str],
                               window_size: int = 600, step_size: int = 100) -> List[Dict]:
        """
        Create sliding windows from the PhysioNet dataset.

        DataFrames without sensor columns or with fewer rows than ``window_size``
        are skipped. A sensor column whose windowing raises is reported on stdout
        and left out of that file's result.

        Args:
            data: List of DataFrames containing PhysioNet data
            names: List of names corresponding to the data
            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
            step_size: Step size for the sliding window (default: 100)

        Returns:
            List of dictionaries containing sliding windows for each DataFrame
        """
        windows_data = []

        for idx, df in enumerate(data):
            # Remove metadata columns for windowing
            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
            df_sensors = df[sensor_columns]

            if df_sensors.empty or len(df_sensors) < window_size:
                continue

            windows = []

            # Create windows for each sensor
            for col in sensor_columns:
                try:
                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
                    windows.append({"name": col, "data": window_data})
                except Exception as e:
                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
                    continue

            if windows:
                windows_data.append({
                    "name": names[idx],
                    "windows": windows,
                    "metadata": {
                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
                        "window_size": window_size,
                        "step_size": step_size,
                        # The window count is taken from the first sensor that was windowed.
                        "num_windows": len(windows[0]["data"])
                    }
                })

        return windows_data

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported file formats for PhysioNet dataset.

        Returns:
            List of supported file extensions
        """
        return ['.txt']

    def get_sensor_info(self) -> Dict[str, object]:
        """
        Get information about sensors in the dataset.

        Returns:
            Dictionary with the sensor names (a copy of the metadata list), the
            sampling frequency and the default window size
        """
        return {
            # Copy so callers cannot mutate the loader's metadata through the result.
            'sensors': list(self.metadata['sensors']),
            'sampling_frequency': self.metadata['sampling_frequency'],
            'window_size': self.metadata['window_size']
        }

    def get_subject_info(self) -> Dict[str, str]:
        """
        Get information about subjects in the dataset.

        Returns:
            Dictionary containing subject information
        """
        return self.metadata['subjects']

    def get_labels(self) -> List[str]:
        """
        Get labels for loaded data.

        Returns:
            List of labels corresponding to loaded data
        """
        return self.labels

    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Filter loaded data by subject type.

        Args:
            subject_type: 'Control' or 'Patient'

        Returns:
            Tuple of (filtered_data, filtered_names)

        Raises:
            ValueError: If no data has been loaded yet
        """
        if not self.data:
            raise ValueError("No data loaded. Call load_data() first.")

        filtered_data = []
        filtered_names = []

        for i, df in enumerate(self.data):
            # The subject type is constant per file, so the first row is representative.
            if df['subject_type'].iloc[0] == subject_type:
                filtered_data.append(df)
                filtered_names.append(self.names[i])

        return filtered_data, filtered_names
PhysioNet VGRF dataset loader class.
This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset. The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.
def __init__(self):
    """Register the loader as "physionet" and populate its static metadata."""
    super().__init__(
        name="physionet",
        description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
    )

    # Eight sensors per foot: VGRF_L1..VGRF_L8 followed by VGRF_R1..VGRF_R8.
    sensor_names = [f'VGRF_{side}{number}' for side in ('L', 'R') for number in range(1, 9)]
    subject_groups = {
        'Co': 'Control subjects',
        'Pt': "Parkinson's disease patients",
    }

    self.metadata = {
        'sensors': sensor_names,
        'sampling_frequency': 100,  # Hz
        'subjects': subject_groups,
        'window_size': 600,  # 6 seconds at 100 Hz
        'url': 'https://physionet.org/files/gaitpdb/1.0.0/',
    }

    # Both lists are refilled by load_data(), one entry per loaded file.
    self.labels = []
    self.subject_types = []
Initialize the dataset loader.
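Takes no arguments: the dataset name ("physionet") and its description are fixed and passed to the base-class initializer, after which the loader's metadata, labels and subject_types are initialized.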
119 def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]: 120 """ 121 Load PhysioNet VGRF dataset from the specified directory. 122 123 Args: 124 data_dir: Directory to store/find the dataset 125 **kwargs: Additional arguments (unused for PhysioNet) 126 127 Returns: 128 Tuple of (data_list, names_list) 129 """ 130 # Download dataset if needed 131 dataset_path = self._download_physionet_data(data_dir) 132 133 physionet_data = [] 134 physionet_names = [] 135 self.labels = [] 136 self.subject_types = [] 137 138 # Load all available files 139 for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))): 140 filename = os.path.basename(filepath) 141 142 # Extract subject type from filename 143 if 'Co' in filename: 144 subject_type = 'Control' 145 label = 'Co' 146 elif 'Pt' in filename: 147 subject_type = 'Patient' 148 label = 'Pt' 149 else: 150 continue # Skip files that don't match expected pattern 151 152 try: 153 # Read the file - PhysioNet files are tab-delimited with variable columns 154 # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist 155 df = pd.read_csv(filepath, delimiter='\t', header=None) 156 157 # Handle variable number of columns 158 n_cols = min(df.shape[1], 19) # Limit to 19 columns max 159 df = df.iloc[:, :n_cols] 160 161 # Create column names 162 col_names = ['time'] 163 for i in range(1, n_cols): 164 if i <= 8: 165 col_names.append(f'VGRF_L{i}') 166 elif i <= 16: 167 col_names.append(f'VGRF_R{i-8}') 168 else: 169 col_names.append(f'sensor_{i}') 170 171 df.columns = col_names 172 173 # Set time as index 174 df = df.set_index('time') 175 176 # Add subject metadata 177 df['subject_type'] = subject_type 178 df['label'] = label 179 180 physionet_data.append(df) 181 physionet_names.append(filename) 182 self.labels.append(label) 183 self.subject_types.append(subject_type) 184 185 except Exception as e: 186 print(f"Error loading {filename}: {e}") 187 continue 188 189 # Store loaded data 190 
self.data = physionet_data 191 self.names = physionet_names 192 193 print(f"Loaded {len(physionet_data)} PhysioNet files") 194 print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}") 195 196 return physionet_data, physionet_names
Load PhysioNet VGRF dataset from the specified directory.
Args: data_dir: Directory to store/find the dataset **kwargs: Additional arguments (unused for PhysioNet)
Returns: Tuple of (data_list, names_list)
def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str],
                           window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Create sliding windows from the PhysioNet dataset.

    Only columns whose name starts with ``VGRF_`` or ``sensor_`` are windowed.
    A DataFrame is skipped when it has no such columns or fewer rows than
    ``window_size``; a column whose windowing raises is reported on stdout and
    left out of that file's result.

    Args:
        data: List of DataFrames containing PhysioNet data
        names: List of names corresponding to the data
        window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
        step_size: Step size for the sliding window (default: 100)

    Returns:
        List of dictionaries containing sliding windows for each DataFrame
    """
    signal_prefixes = ('VGRF_', 'sensor_')
    results = []

    for position, frame in enumerate(data):
        # Drop the metadata columns; only the signal columns are windowed.
        signal_cols = [name for name in frame.columns if name.startswith(signal_prefixes)]
        signals = frame[signal_cols]

        if signals.empty or len(signals) < window_size:
            continue

        per_sensor = []
        for sensor in signal_cols:
            try:
                chunks = sliding_window(signals[sensor].values, window_size, step_size)
            except Exception as e:
                print(f"Error creating windows for {sensor} in {names[position]}: {e}")
                continue
            per_sensor.append({"name": sensor, "data": chunks})

        if not per_sensor:
            continue

        results.append({
            "name": names[position],
            "windows": per_sensor,
            "metadata": {
                "subject_type": frame['subject_type'].iloc[0] if 'subject_type' in frame.columns else 'Unknown',
                "label": frame['label'].iloc[0] if 'label' in frame.columns else 'Unknown',
                "window_size": window_size,
                "step_size": step_size,
                # Count taken from the first sensor that was windowed successfully.
                "num_windows": len(per_sensor[0]["data"]),
            },
        })

    return results
Create sliding windows from the PhysioNet dataset.
Args: data: List of DataFrames containing PhysioNet data names: List of names corresponding to the data window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz) step_size: Step size for the sliding window (default: 100)
Returns: List of dictionaries containing sliding windows for each DataFrame
def get_supported_formats(self) -> List[str]:
    """
    Report the file extensions this loader reads.

    Returns:
        List of supported file extensions (only ``.txt``)
    """
    extensions = ['.txt']
    return extensions
Get list of supported file formats for PhysioNet dataset.
Returns: List of supported file extensions
257 def get_sensor_info(self) -> Dict[str, List[str]]: 258 """ 259 Get information about sensors in the dataset. 260 261 Returns: 262 Dictionary containing sensor information 263 """ 264 return { 265 'sensors': self.metadata['sensors'], 266 'sampling_frequency': self.metadata['sampling_frequency'], 267 'window_size': self.metadata['window_size'] 268 }
Get information about sensors in the dataset.
Returns: Dictionary containing sensor information
270 def get_subject_info(self) -> Dict[str, str]: 271 """ 272 Get information about subjects in the dataset. 273 274 Returns: 275 Dictionary containing subject information 276 """ 277 return self.metadata['subjects']
Get information about subjects in the dataset.
Returns: Dictionary containing subject information
279 def get_labels(self) -> List[str]: 280 """ 281 Get labels for loaded data. 282 283 Returns: 284 List of labels corresponding to loaded data 285 """ 286 return self.labels
Get labels for loaded data.
Returns: List of labels corresponding to loaded data
288 def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]: 289 """ 290 Filter loaded data by subject type. 291 292 Args: 293 subject_type: 'Control' or 'Patient' 294 295 Returns: 296 Tuple of (filtered_data, filtered_names) 297 """ 298 if not self.data: 299 raise ValueError("No data loaded. Call load_data() first.") 300 301 filtered_data = [] 302 filtered_names = [] 303 304 for i, df in enumerate(self.data): 305 if df['subject_type'].iloc[0] == subject_type: 306 filtered_data.append(df) 307 filtered_names.append(self.names[i]) 308 309 return filtered_data, filtered_names
Filter loaded data by subject type.
Args: subject_type: 'Control' or 'Patient'
Returns: Tuple of (filtered_data, filtered_names)
Inherited Members
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Legacy wrapper: load PhysioNet data through a fresh PhysioNetLoader.

    Args:
        data_dir: Directory passed on to ``PhysioNetLoader.load_data``

    Returns:
        Tuple of (data_list, names_list)
    """
    return PhysioNetLoader().load_data(data_dir)
Legacy function to load PhysioNet data.
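Args: data_dir: Directory in which the dataset is stored; the files are downloaded into its physionet_gaitpdb subdirectory if that subdirectory is missing or empty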
Returns: Tuple of (data_list, names_list)
def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Legacy wrapper: window PhysioNet data through a fresh PhysioNetLoader.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries
    """
    return PhysioNetLoader().create_sliding_windows(data, names, window_size, step_size)
Legacy function to create sliding windows from PhysioNet data.
Args: data: List of DataFrames names: List of names window_size: Size of sliding window step_size: Step size for sliding window
Returns: List of sliding window dictionaries