gaitsetpy.dataset.physionet

PhysioNet VGRF Dataset Loader. Maintainer: @aharshit123456

This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader. The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.

Dataset source: https://physionet.org/content/gaitpdb/1.0.0/

  1'''
  2PhysioNet VGRF Dataset Loader.
  3Maintainer: @aharshit123456
  4
  5This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader.
  6The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with 
  7Parkinson's disease and healthy controls.
  8
  9Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
 10'''
 11
 12import os
 13import pandas as pd
 14import numpy as np
 15from typing import List, Dict, Tuple, Optional
 16from glob import glob
 17import requests
 18from tqdm import tqdm
 19import zipfile
 20from ..core.base_classes import BaseDatasetLoader
 21from .utils import sliding_window
 22
 23
 24class PhysioNetLoader(BaseDatasetLoader):
 25    """
 26    PhysioNet VGRF dataset loader class.
 27    
 28    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
 29    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's 
 30    disease and healthy controls.
 31    """
 32    
 33    def __init__(self):
 34        super().__init__(
 35            name="physionet",
 36            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
 37        )
 38        self.metadata = {
 39            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
 40                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
 41            'sampling_frequency': 100,  # 100 Hz sampling frequency
 42            'subjects': {
 43                'Co': 'Control subjects',
 44                'Pt': 'Parkinson\'s disease patients'
 45            },
 46            'window_size': 600,  # 6 seconds at 100 Hz
 47            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
 48        }
 49        self.labels = []
 50        self.subject_types = []
 51    
 52    def _download_physionet_data(self, data_dir: str) -> str:
 53        """
 54        Download PhysioNet dataset if not already present.
 55        
 56        Args:
 57            data_dir: Directory to store the dataset
 58            
 59        Returns:
 60            Path to the downloaded/existing dataset directory
 61        """
 62        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")
 63        
 64        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
 65            print(f"PhysioNet dataset already exists at: {dataset_path}")
 66            return dataset_path
 67        
 68        os.makedirs(dataset_path, exist_ok=True)
 69        
 70        # Download the dataset files
 71        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"
 72        
 73        # Get list of files (basic file names based on the reference)
 74        file_patterns = [
 75            # Control subjects - Ga prefix
 76            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
 77            "GaCo22_01.txt", "GaCo22_10.txt",
 78            
 79            # Parkinson's patients - Ga prefix
 80            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
 81            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
 82            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],
 83            
 84            # Control subjects - Ju prefix
 85            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],
 86            
 87            # Parkinson's patients - Ju prefix
 88            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],
 89            
 90            # Control subjects - Si prefix
 91            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],
 92            
 93            # Parkinson's patients - Si prefix
 94            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
 95        ]
 96        
 97        print(f"Downloading PhysioNet dataset to {dataset_path}")
 98        for filename in tqdm(file_patterns, desc="Downloading files"):
 99            file_url = base_url + filename
100            file_path = os.path.join(dataset_path, filename)
101            
102            if os.path.exists(file_path):
103                continue
104                
105            try:
106                response = requests.get(file_url, stream=True)
107                if response.status_code == 200:
108                    with open(file_path, 'wb') as f:
109                        for chunk in response.iter_content(chunk_size=8192):
110                            f.write(chunk)
111                else:
112                    print(f"Could not download {filename} (status: {response.status_code})")
113            except Exception as e:
114                print(f"Error downloading {filename}: {e}")
115        
116        return dataset_path
117    
118    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
119        """
120        Load PhysioNet VGRF dataset from the specified directory.
121        
122        Args:
123            data_dir: Directory to store/find the dataset
124            **kwargs: Additional arguments (unused for PhysioNet)
125            
126        Returns:
127            Tuple of (data_list, names_list)
128        """
129        # Download dataset if needed
130        dataset_path = self._download_physionet_data(data_dir)
131        
132        physionet_data = []
133        physionet_names = []
134        self.labels = []
135        self.subject_types = []
136        
137        # Load all available files
138        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
139            filename = os.path.basename(filepath)
140            
141            # Extract subject type from filename
142            if 'Co' in filename:
143                subject_type = 'Control'
144                label = 'Co'
145            elif 'Pt' in filename:
146                subject_type = 'Patient'
147                label = 'Pt'
148            else:
149                continue  # Skip files that don't match expected pattern
150            
151            try:
152                # Read the file - PhysioNet files are tab-delimited with variable columns
153                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
154                df = pd.read_csv(filepath, delimiter='\t', header=None)
155                
156                # Handle variable number of columns
157                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
158                df = df.iloc[:, :n_cols]
159                
160                # Create column names
161                col_names = ['time']
162                for i in range(1, n_cols):
163                    if i <= 8:
164                        col_names.append(f'VGRF_L{i}')
165                    elif i <= 16:
166                        col_names.append(f'VGRF_R{i-8}')
167                    else:
168                        col_names.append(f'sensor_{i}')
169                
170                df.columns = col_names
171                
172                # Set time as index
173                df = df.set_index('time')
174                
175                # Add subject metadata
176                df['subject_type'] = subject_type
177                df['label'] = label
178                
179                physionet_data.append(df)
180                physionet_names.append(filename)
181                self.labels.append(label)
182                self.subject_types.append(subject_type)
183                
184            except Exception as e:
185                print(f"Error loading {filename}: {e}")
186                continue
187        
188        # Store loaded data
189        self.data = physionet_data
190        self.names = physionet_names
191        
192        print(f"Loaded {len(physionet_data)} PhysioNet files")
193        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
194        
195        return physionet_data, physionet_names
196    
197    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
198                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
199        """
200        Create sliding windows from the PhysioNet dataset.
201        
202        Args:
203            data: List of DataFrames containing PhysioNet data
204            names: List of names corresponding to the data
205            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
206            step_size: Step size for the sliding window (default: 100)
207            
208        Returns:
209            List of dictionaries containing sliding windows for each DataFrame
210        """
211        windows_data = []
212        
213        for idx, df in enumerate(data):
214            # Remove metadata columns for windowing
215            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
216            df_sensors = df[sensor_columns]
217            
218            if df_sensors.empty or len(df_sensors) < window_size:
219                continue
220                
221            windows = []
222            
223            # Create windows for each sensor
224            for col in sensor_columns:
225                try:
226                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
227                    windows.append({"name": col, "data": window_data})
228                except Exception as e:
229                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
230                    continue
231            
232            if windows:
233                windows_data.append({
234                    "name": names[idx],
235                    "windows": windows,
236                    "metadata": {
237                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
238                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
239                        "window_size": window_size,
240                        "step_size": step_size,
241                        "num_windows": len(windows[0]["data"]) if windows else 0
242                    }
243                })
244        
245        return windows_data
246    
247    def get_supported_formats(self) -> List[str]:
248        """
249        Get list of supported file formats for PhysioNet dataset.
250        
251        Returns:
252            List of supported file extensions
253        """
254        return ['.txt']
255    
256    def get_sensor_info(self) -> Dict[str, List[str]]:
257        """
258        Get information about sensors in the dataset.
259        
260        Returns:
261            Dictionary containing sensor information
262        """
263        return {
264            'sensors': self.metadata['sensors'],
265            'sampling_frequency': self.metadata['sampling_frequency'],
266            'window_size': self.metadata['window_size']
267        }
268    
269    def get_subject_info(self) -> Dict[str, str]:
270        """
271        Get information about subjects in the dataset.
272        
273        Returns:
274            Dictionary containing subject information
275        """
276        return self.metadata['subjects']
277    
278    def get_labels(self) -> List[str]:
279        """
280        Get labels for loaded data.
281        
282        Returns:
283            List of labels corresponding to loaded data
284        """
285        return self.labels
286    
287    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
288        """
289        Filter loaded data by subject type.
290        
291        Args:
292            subject_type: 'Control' or 'Patient'
293            
294        Returns:
295            Tuple of (filtered_data, filtered_names)
296        """
297        if not self.data:
298            raise ValueError("No data loaded. Call load_data() first.")
299        
300        filtered_data = []
301        filtered_names = []
302        
303        for i, df in enumerate(self.data):
304            if df['subject_type'].iloc[0] == subject_type:
305                filtered_data.append(df)
306                filtered_names.append(self.names[i])
307        
308        return filtered_data, filtered_names
309
310
311# Legacy function for backward compatibility
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Load PhysioNet data through a freshly created PhysioNetLoader.

    Function-style entry point kept for callers that do not use the loader
    class directly.

    Args:
        data_dir: Directory handed to PhysioNetLoader.load_data

    Returns:
        Tuple of (data_list, names_list) as returned by PhysioNetLoader.load_data
    """
    loaded = PhysioNetLoader().load_data(data_dir)
    return loaded
324
325
def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                           window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Build sliding windows from PhysioNet data via a freshly created PhysioNetLoader.

    Function-style entry point kept for callers that do not use the loader
    class directly.

    Args:
        data: List of DataFrames
        names: List of names, one per DataFrame
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries as returned by
        PhysioNetLoader.create_sliding_windows
    """
    windowed = PhysioNetLoader().create_sliding_windows(
        data,
        names,
        window_size,
        step_size,
    )
    return windowed
class PhysioNetLoader(gaitsetpy.core.base_classes.BaseDatasetLoader):
 25class PhysioNetLoader(BaseDatasetLoader):
 26    """
 27    PhysioNet VGRF dataset loader class.
 28    
 29    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
 30    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's 
 31    disease and healthy controls.
 32    """
 33    
 34    def __init__(self):
 35        super().__init__(
 36            name="physionet",
 37            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
 38        )
 39        self.metadata = {
 40            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
 41                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
 42            'sampling_frequency': 100,  # 100 Hz sampling frequency
 43            'subjects': {
 44                'Co': 'Control subjects',
 45                'Pt': 'Parkinson\'s disease patients'
 46            },
 47            'window_size': 600,  # 6 seconds at 100 Hz
 48            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
 49        }
 50        self.labels = []
 51        self.subject_types = []
 52    
 53    def _download_physionet_data(self, data_dir: str) -> str:
 54        """
 55        Download PhysioNet dataset if not already present.
 56        
 57        Args:
 58            data_dir: Directory to store the dataset
 59            
 60        Returns:
 61            Path to the downloaded/existing dataset directory
 62        """
 63        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")
 64        
 65        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
 66            print(f"PhysioNet dataset already exists at: {dataset_path}")
 67            return dataset_path
 68        
 69        os.makedirs(dataset_path, exist_ok=True)
 70        
 71        # Download the dataset files
 72        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"
 73        
 74        # Get list of files (basic file names based on the reference)
 75        file_patterns = [
 76            # Control subjects - Ga prefix
 77            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
 78            "GaCo22_01.txt", "GaCo22_10.txt",
 79            
 80            # Parkinson's patients - Ga prefix
 81            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
 82            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
 83            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],
 84            
 85            # Control subjects - Ju prefix
 86            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],
 87            
 88            # Parkinson's patients - Ju prefix
 89            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],
 90            
 91            # Control subjects - Si prefix
 92            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],
 93            
 94            # Parkinson's patients - Si prefix
 95            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
 96        ]
 97        
 98        print(f"Downloading PhysioNet dataset to {dataset_path}")
 99        for filename in tqdm(file_patterns, desc="Downloading files"):
100            file_url = base_url + filename
101            file_path = os.path.join(dataset_path, filename)
102            
103            if os.path.exists(file_path):
104                continue
105                
106            try:
107                response = requests.get(file_url, stream=True)
108                if response.status_code == 200:
109                    with open(file_path, 'wb') as f:
110                        for chunk in response.iter_content(chunk_size=8192):
111                            f.write(chunk)
112                else:
113                    print(f"Could not download {filename} (status: {response.status_code})")
114            except Exception as e:
115                print(f"Error downloading {filename}: {e}")
116        
117        return dataset_path
118    
119    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
120        """
121        Load PhysioNet VGRF dataset from the specified directory.
122        
123        Args:
124            data_dir: Directory to store/find the dataset
125            **kwargs: Additional arguments (unused for PhysioNet)
126            
127        Returns:
128            Tuple of (data_list, names_list)
129        """
130        # Download dataset if needed
131        dataset_path = self._download_physionet_data(data_dir)
132        
133        physionet_data = []
134        physionet_names = []
135        self.labels = []
136        self.subject_types = []
137        
138        # Load all available files
139        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
140            filename = os.path.basename(filepath)
141            
142            # Extract subject type from filename
143            if 'Co' in filename:
144                subject_type = 'Control'
145                label = 'Co'
146            elif 'Pt' in filename:
147                subject_type = 'Patient'
148                label = 'Pt'
149            else:
150                continue  # Skip files that don't match expected pattern
151            
152            try:
153                # Read the file - PhysioNet files are tab-delimited with variable columns
154                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
155                df = pd.read_csv(filepath, delimiter='\t', header=None)
156                
157                # Handle variable number of columns
158                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
159                df = df.iloc[:, :n_cols]
160                
161                # Create column names
162                col_names = ['time']
163                for i in range(1, n_cols):
164                    if i <= 8:
165                        col_names.append(f'VGRF_L{i}')
166                    elif i <= 16:
167                        col_names.append(f'VGRF_R{i-8}')
168                    else:
169                        col_names.append(f'sensor_{i}')
170                
171                df.columns = col_names
172                
173                # Set time as index
174                df = df.set_index('time')
175                
176                # Add subject metadata
177                df['subject_type'] = subject_type
178                df['label'] = label
179                
180                physionet_data.append(df)
181                physionet_names.append(filename)
182                self.labels.append(label)
183                self.subject_types.append(subject_type)
184                
185            except Exception as e:
186                print(f"Error loading {filename}: {e}")
187                continue
188        
189        # Store loaded data
190        self.data = physionet_data
191        self.names = physionet_names
192        
193        print(f"Loaded {len(physionet_data)} PhysioNet files")
194        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
195        
196        return physionet_data, physionet_names
197    
198    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
199                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
200        """
201        Create sliding windows from the PhysioNet dataset.
202        
203        Args:
204            data: List of DataFrames containing PhysioNet data
205            names: List of names corresponding to the data
206            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
207            step_size: Step size for the sliding window (default: 100)
208            
209        Returns:
210            List of dictionaries containing sliding windows for each DataFrame
211        """
212        windows_data = []
213        
214        for idx, df in enumerate(data):
215            # Remove metadata columns for windowing
216            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
217            df_sensors = df[sensor_columns]
218            
219            if df_sensors.empty or len(df_sensors) < window_size:
220                continue
221                
222            windows = []
223            
224            # Create windows for each sensor
225            for col in sensor_columns:
226                try:
227                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
228                    windows.append({"name": col, "data": window_data})
229                except Exception as e:
230                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
231                    continue
232            
233            if windows:
234                windows_data.append({
235                    "name": names[idx],
236                    "windows": windows,
237                    "metadata": {
238                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
239                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
240                        "window_size": window_size,
241                        "step_size": step_size,
242                        "num_windows": len(windows[0]["data"]) if windows else 0
243                    }
244                })
245        
246        return windows_data
247    
248    def get_supported_formats(self) -> List[str]:
249        """
250        Get list of supported file formats for PhysioNet dataset.
251        
252        Returns:
253            List of supported file extensions
254        """
255        return ['.txt']
256    
257    def get_sensor_info(self) -> Dict[str, List[str]]:
258        """
259        Get information about sensors in the dataset.
260        
261        Returns:
262            Dictionary containing sensor information
263        """
264        return {
265            'sensors': self.metadata['sensors'],
266            'sampling_frequency': self.metadata['sampling_frequency'],
267            'window_size': self.metadata['window_size']
268        }
269    
270    def get_subject_info(self) -> Dict[str, str]:
271        """
272        Get information about subjects in the dataset.
273        
274        Returns:
275            Dictionary containing subject information
276        """
277        return self.metadata['subjects']
278    
279    def get_labels(self) -> List[str]:
280        """
281        Get labels for loaded data.
282        
283        Returns:
284            List of labels corresponding to loaded data
285        """
286        return self.labels
287    
288    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
289        """
290        Filter loaded data by subject type.
291        
292        Args:
293            subject_type: 'Control' or 'Patient'
294            
295        Returns:
296            Tuple of (filtered_data, filtered_names)
297        """
298        if not self.data:
299            raise ValueError("No data loaded. Call load_data() first.")
300        
301        filtered_data = []
302        filtered_names = []
303        
304        for i, df in enumerate(self.data):
305            if df['subject_type'].iloc[0] == subject_type:
306                filtered_data.append(df)
307                filtered_names.append(self.names[i])
308        
309        return filtered_data, filtered_names

PhysioNet VGRF dataset loader class.

This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset. The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.

PhysioNetLoader()
34    def __init__(self):
35        super().__init__(
36            name="physionet",
37            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls"
38        )
39        self.metadata = {
40            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
41                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
42            'sampling_frequency': 100,  # 100 Hz sampling frequency
43            'subjects': {
44                'Co': 'Control subjects',
45                'Pt': 'Parkinson\'s disease patients'
46            },
47            'window_size': 600,  # 6 seconds at 100 Hz
48            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
49        }
50        self.labels = []
51        self.subject_types = []

Initialize the dataset loader.

Args: name: Name of the dataset; description: Description of the dataset

metadata
labels
subject_types
def load_data( self, data_dir: str, **kwargs) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
119    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
120        """
121        Load PhysioNet VGRF dataset from the specified directory.
122        
123        Args:
124            data_dir: Directory to store/find the dataset
125            **kwargs: Additional arguments (unused for PhysioNet)
126            
127        Returns:
128            Tuple of (data_list, names_list)
129        """
130        # Download dataset if needed
131        dataset_path = self._download_physionet_data(data_dir)
132        
133        physionet_data = []
134        physionet_names = []
135        self.labels = []
136        self.subject_types = []
137        
138        # Load all available files
139        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
140            filename = os.path.basename(filepath)
141            
142            # Extract subject type from filename
143            if 'Co' in filename:
144                subject_type = 'Control'
145                label = 'Co'
146            elif 'Pt' in filename:
147                subject_type = 'Patient'
148                label = 'Pt'
149            else:
150                continue  # Skip files that don't match expected pattern
151            
152            try:
153                # Read the file - PhysioNet files are tab-delimited with variable columns
154                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
155                df = pd.read_csv(filepath, delimiter='\t', header=None)
156                
157                # Handle variable number of columns
158                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
159                df = df.iloc[:, :n_cols]
160                
161                # Create column names
162                col_names = ['time']
163                for i in range(1, n_cols):
164                    if i <= 8:
165                        col_names.append(f'VGRF_L{i}')
166                    elif i <= 16:
167                        col_names.append(f'VGRF_R{i-8}')
168                    else:
169                        col_names.append(f'sensor_{i}')
170                
171                df.columns = col_names
172                
173                # Set time as index
174                df = df.set_index('time')
175                
176                # Add subject metadata
177                df['subject_type'] = subject_type
178                df['label'] = label
179                
180                physionet_data.append(df)
181                physionet_names.append(filename)
182                self.labels.append(label)
183                self.subject_types.append(subject_type)
184                
185            except Exception as e:
186                print(f"Error loading {filename}: {e}")
187                continue
188        
189        # Store loaded data
190        self.data = physionet_data
191        self.names = physionet_names
192        
193        print(f"Loaded {len(physionet_data)} PhysioNet files")
194        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
195        
196        return physionet_data, physionet_names

Load PhysioNet VGRF dataset from the specified directory.

Args: data_dir: Directory to store/find the dataset; **kwargs: Additional arguments (unused for PhysioNet)

Returns: Tuple of (data_list, names_list)

def create_sliding_windows( self, data: List[pandas.core.frame.DataFrame], names: List[str], window_size: int = 600, step_size: int = 100) -> List[Dict]:
198    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
199                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
200        """
201        Create sliding windows from the PhysioNet dataset.
202        
203        Args:
204            data: List of DataFrames containing PhysioNet data
205            names: List of names corresponding to the data
206            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
207            step_size: Step size for the sliding window (default: 100)
208            
209        Returns:
210            List of dictionaries containing sliding windows for each DataFrame
211        """
212        windows_data = []
213        
214        for idx, df in enumerate(data):
215            # Remove metadata columns for windowing
216            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
217            df_sensors = df[sensor_columns]
218            
219            if df_sensors.empty or len(df_sensors) < window_size:
220                continue
221                
222            windows = []
223            
224            # Create windows for each sensor
225            for col in sensor_columns:
226                try:
227                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
228                    windows.append({"name": col, "data": window_data})
229                except Exception as e:
230                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
231                    continue
232            
233            if windows:
234                windows_data.append({
235                    "name": names[idx],
236                    "windows": windows,
237                    "metadata": {
238                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
239                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
240                        "window_size": window_size,
241                        "step_size": step_size,
242                        "num_windows": len(windows[0]["data"]) if windows else 0
243                    }
244                })
245        
246        return windows_data

Create sliding windows from the PhysioNet dataset.

Args: data: List of DataFrames containing PhysioNet data; names: List of names corresponding to the data; window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz); step_size: Step size for the sliding window (default: 100)

Returns: List of dictionaries containing sliding windows for each DataFrame

def get_supported_formats(self) -> List[str]:
248    def get_supported_formats(self) -> List[str]:
249        """
250        Get list of supported file formats for PhysioNet dataset.
251        
252        Returns:
253            List of supported file extensions
254        """
255        return ['.txt']

Get list of supported file formats for PhysioNet dataset.

Returns: List of supported file extensions

def get_sensor_info(self) -> Dict[str, List[str]]:
257    def get_sensor_info(self) -> Dict[str, List[str]]:
258        """
259        Get information about sensors in the dataset.
260        
261        Returns:
262            Dictionary containing sensor information
263        """
264        return {
265            'sensors': self.metadata['sensors'],
266            'sampling_frequency': self.metadata['sampling_frequency'],
267            'window_size': self.metadata['window_size']
268        }

Get information about sensors in the dataset.

Returns: Dictionary containing sensor information

def get_subject_info(self) -> Dict[str, str]:
270    def get_subject_info(self) -> Dict[str, str]:
271        """
272        Get information about subjects in the dataset.
273        
274        Returns:
275            Dictionary containing subject information
276        """
277        return self.metadata['subjects']

Get information about subjects in the dataset.

Returns: Dictionary containing subject information

def get_labels(self) -> List[str]:
279    def get_labels(self) -> List[str]:
280        """
281        Get labels for loaded data.
282        
283        Returns:
284            List of labels corresponding to loaded data
285        """
286        return self.labels

Get labels for loaded data.

Returns: List of labels corresponding to loaded data

def filter_by_subject_type( self, subject_type: str) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
288    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
289        """
290        Filter loaded data by subject type.
291        
292        Args:
293            subject_type: 'Control' or 'Patient'
294            
295        Returns:
296            Tuple of (filtered_data, filtered_names)
297        """
298        if not self.data:
299            raise ValueError("No data loaded. Call load_data() first.")
300        
301        filtered_data = []
302        filtered_names = []
303        
304        for i, df in enumerate(self.data):
305            if df['subject_type'].iloc[0] == subject_type:
306                filtered_data.append(df)
307                filtered_names.append(self.names[i])
308        
309        return filtered_data, filtered_names

Filter loaded data by subject type.

Args: subject_type: 'Control' or 'Patient'

Returns: Tuple of (filtered_data, filtered_names)

def load_physionet_data(data_dir: str) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Legacy entry point: load PhysioNet data through a fresh PhysioNetLoader.

    Args:
        data_dir: Directory containing the dataset

    Returns:
        Whatever ``PhysioNetLoader.load_data`` returns for ``data_dir``,
        i.e. a tuple of (data_list, names_list)
    """
    # A new loader is created per call; nothing is kept between calls.
    return PhysioNetLoader().load_data(data_dir)

Legacy function to load PhysioNet data.

Args: data_dir: Directory containing the dataset

Returns: Tuple of (data_list, names_list)

def create_physionet_windows( data: List[pandas.core.frame.DataFrame], names: List[str], window_size: int = 600, step_size: int = 100) -> List[Dict]:
def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Legacy entry point: window PhysioNet data through a fresh PhysioNetLoader.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        The list of sliding window dictionaries produced by
        ``PhysioNetLoader.create_sliding_windows``
    """
    # A new loader is created per call; arguments are forwarded unchanged.
    return PhysioNetLoader().create_sliding_windows(
        data,
        names,
        window_size=window_size,
        step_size=step_size,
    )

Legacy function to create sliding windows from PhysioNet data.

Args: data: List of DataFrames; names: List of names; window_size: Size of sliding window; step_size: Step size for sliding window

Returns: List of sliding window dictionaries