Coverage for src / crump / tabular_file.py: 74%
76 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-11 14:40 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-11 14:40 +0000
1"""Abstract base classes for tabular file formats (CSV, Parquet, etc.)."""
3from __future__ import annotations
5from abc import ABC, abstractmethod
6from collections.abc import Iterator
7from pathlib import Path
8from typing import Any
10from .file_types import InputFileType, OutputFileType
class TabularFileReader(ABC):
    """Interface shared by every row-oriented file reader (CSV, Parquet, ...).

    A reader is used as a context manager and iterated to obtain one
    dictionary per row, in the spirit of csv.DictReader. This class only
    stores the target path; opening, iterating and closing are left to
    concrete subclasses.

    Example usage (through a concrete reader, since this class is abstract):
        with create_reader(file_path) as reader:
            print(f"Columns: {reader.fieldnames}")
            for row in reader:
                print(row)  # row is a dict
    """

    def __init__(self, file_path: str | Path):
        """Remember where the data lives.

        Args:
            file_path: Location of the file to read; stored as a Path on
                ``self.file_path``.
        """
        self.file_path = Path(file_path)

    @abstractmethod
    def __enter__(self) -> TabularFileReader:
        """Prepare the reader for use inside a ``with`` block.

        Returns:
            The reader itself, so it can be bound by ``as``.
        """
        ...

    @abstractmethod
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release whatever the reader acquired on entry.

        Args:
            exc_type: Exception type if an error occurred
            exc_val: Exception value if an error occurred
            exc_tb: Exception traceback if an error occurred
        """
        ...

    @property
    @abstractmethod
    def fieldnames(self) -> list[str]:
        """Column names of the file.

        Returns:
            List of column names
        """
        ...

    @abstractmethod
    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Walk the file row by row.

        Yields:
            One dictionary per row, mapping column names to values
        """
        ...
class TabularFileWriter(ABC):
    """Interface shared by every row-oriented file writer (CSV, Parquet, ...).

    A writer is used as a context manager and receives rows one at a time,
    in the spirit of csv.writer. This class only records the target path
    and the append flag; the actual I/O is left to concrete subclasses.

    Example usage (through a concrete writer, since this class is abstract):
        with create_writer(file_path, append=False) as writer:
            writer.writerow(['col1', 'col2'])  # header
            writer.writerow(['val1', 'val2'])  # data row
    """

    def __init__(self, file_path: str | Path, append: bool = False):
        """Remember the destination and the write mode.

        Args:
            file_path: Location of the file to write; stored as a Path on
                ``self.file_path``.
            append: If True, append to existing file. If False, overwrite.
        """
        self.append = append
        self.file_path = Path(file_path)

    @abstractmethod
    def __enter__(self) -> TabularFileWriter:
        """Prepare the writer for use inside a ``with`` block.

        Returns:
            The writer itself, so it can be bound by ``as``.
        """
        ...

    @abstractmethod
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release whatever the writer acquired on entry.

        Args:
            exc_type: Exception type if an error occurred
            exc_val: Exception value if an error occurred
            exc_tb: Exception traceback if an error occurred
        """
        ...

    @abstractmethod
    def writerow(self, row: list[Any]) -> None:
        """Emit one row.

        Args:
            row: Values for the row, in column order (header or data row)
        """
        ...
def create_reader(
    file_path: str | Path, file_format: InputFileType | str | None = None
) -> TabularFileReader:
    """Build the reader that matches a file's format.

    Args:
        file_path: Path to the file to read
        file_format: An InputFileType member, or the string 'csv' or
            'parquet' (compared case-insensitively). When None, the format
            is taken from InputFileType.from_path, and CSV is used if that
            lookup raises ValueError.

    Returns:
        A CsvFileReader or ParquetFileReader for ``file_path``

    Raises:
        ValueError: If a format string is not recognised, or the resolved
            format is neither CSV nor Parquet
    """
    # Imported inside the function, as in the original; both modules are
    # loaded up front regardless of which reader ends up being used.
    from .csv_file import CsvFileReader
    from .parquet_file import ParquetFileReader

    source = Path(file_path)

    if isinstance(file_format, str):
        # Textual format name: look it up case-insensitively.
        by_name = {"csv": InputFileType.CSV, "parquet": InputFileType.PARQUET}
        resolved = by_name.get(file_format.lower())
        if resolved is None:
            raise ValueError(f"Unsupported file format: {file_format}")
    elif file_format is None:
        # No explicit format: detect from the path, falling back to CSV
        # when detection rejects the extension.
        try:
            resolved = InputFileType.from_path(str(source))
        except ValueError:
            resolved = InputFileType.CSV
    else:
        # Already an enum member; use it as given.
        resolved = file_format

    if resolved == InputFileType.CSV:
        return CsvFileReader(source)
    if resolved == InputFileType.PARQUET:
        return ParquetFileReader(source)
    # Any other input type has no tabular reader.
    raise ValueError(f"Cannot read {resolved.value} files as tabular data")
177def create_writer(
178 file_path: str | Path, file_format: OutputFileType | str | None = None, append: bool = False
179) -> TabularFileWriter:
180 """Factory function to create appropriate writer based on file format.
182 Args:
183 file_path: Path to the file to write
184 file_format: File format (OutputFileType enum, 'csv', or 'parquet').
185 If None, auto-detect from extension.
186 append: If True, append to existing file. If False, overwrite.
188 Returns:
189 TabularFileWriter instance for the file format
191 Raises:
192 ValueError: If file format is not supported or cannot be detected
193 """
194 from .csv_file import CsvFileWriter
195 from .parquet_file import ParquetFileWriter
197 path = Path(file_path)
199 # Convert string to enum if needed, or auto-detect from extension
200 if file_format is None:
201 format_enum = OutputFileType.from_path(str(path))
202 elif isinstance(file_format, str):
203 # Convert string to enum
204 format_str = file_format.lower()
205 if format_str == "csv":
206 format_enum = OutputFileType.CSV
207 elif format_str == "parquet":
208 format_enum = OutputFileType.PARQUET
209 else:
210 raise ValueError(f"Unsupported file format: {file_format}")
211 else:
212 format_enum = file_format
214 # Create appropriate writer
215 if format_enum == OutputFileType.CSV:
216 return CsvFileWriter(path, append=append)
217 elif format_enum == OutputFileType.PARQUET:
218 return ParquetFileWriter(path, append=append)
219 else:
220 raise ValueError(f"Unsupported file format: {format_enum.value}")