Coverage for src / crump / tabular_file.py: 74%

76 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-11 14:40 +0000

1"""Abstract base classes for tabular file formats (CSV, Parquet, etc.).""" 

2 

3from __future__ import annotations 

4 

5from abc import ABC, abstractmethod 

6from collections.abc import Iterator 

7from pathlib import Path 

8from typing import Any 

9 

10from .file_types import InputFileType, OutputFileType 

11 

12 

class TabularFileReader(ABC):
    """Common interface that every tabular-file reader implements.

    Concrete readers (for formats such as CSV or Parquet) are used as a
    context manager and iterated row by row, in the spirit of
    ``csv.DictReader``. This class is abstract and cannot be instantiated
    directly; obtain a concrete reader, for example through
    ``create_reader``.

    Example usage:
        with create_reader(file_path) as reader:
            print(f"Columns: {reader.fieldnames}")
            for row in reader:
                print(row)  # row is a dict
    """

    def __init__(self, file_path: str | Path):
        """Remember which file this reader targets.

        Args:
            file_path: Location of the file to read; stored on the instance
                as a ``Path`` in ``self.file_path``.
        """
        self.file_path = Path(file_path)

    @abstractmethod
    def __enter__(self) -> TabularFileReader:
        """Enter the ``with`` block and get ready to read.

        Returns:
            The reader itself, so it can be bound by ``with ... as reader``.
        """

    @abstractmethod
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Leave the ``with`` block and release whatever was acquired.

        Args:
            exc_type: Type of the exception raised in the block, if any.
            exc_val: The exception instance raised in the block, if any.
            exc_tb: Traceback of that exception, if any.
        """

    @property
    @abstractmethod
    def fieldnames(self) -> list[str]:
        """Column names of the file.

        Returns:
            The column names as a list of strings.
        """

    @abstractmethod
    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Walk the file one row at a time.

        Yields:
            One dictionary per row, keyed by column name.
        """

73 

74 

class TabularFileWriter(ABC):
    """Common interface that every tabular-file writer implements.

    Concrete writers (for formats such as CSV or Parquet) are used as a
    context manager and fed one row at a time, in the spirit of
    ``csv.writer``. This class is abstract and cannot be instantiated
    directly; obtain a concrete writer, for example through
    ``create_writer``.

    Example usage:
        with create_writer(file_path, append=False) as writer:
            writer.writerow(['col1', 'col2'])  # header
            writer.writerow(['val1', 'val2'])  # data row
    """

    def __init__(self, file_path: str | Path, append: bool = False):
        """Remember the target file and the requested write mode.

        Args:
            file_path: Location of the file to write; stored on the instance
                as a ``Path`` in ``self.file_path``.
            append: True to add to an existing file, False to overwrite it.
                The flag is only recorded here (``self.append``); acting on
                it is left to the concrete writer.
        """
        self.append = append
        self.file_path = Path(file_path)

    @abstractmethod
    def __enter__(self) -> TabularFileWriter:
        """Enter the ``with`` block and get ready to write.

        Returns:
            The writer itself, so it can be bound by ``with ... as writer``.
        """

    @abstractmethod
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Leave the ``with`` block and release whatever was acquired.

        Args:
            exc_type: Type of the exception raised in the block, if any.
            exc_val: The exception instance raised in the block, if any.
            exc_tb: Traceback of that exception, if any.
        """

    @abstractmethod
    def writerow(self, row: list[Any]) -> None:
        """Write one row to the file.

        Args:
            row: The values making up the row; used for the header row as
                well as for data rows.
        """

126 

127 

def create_reader(
    file_path: str | Path, file_format: InputFileType | str | None = None
) -> TabularFileReader:
    """Factory function to create the appropriate reader for a file format.

    The format is resolved and validated first, and only the backend module
    that is actually needed is imported afterwards. Requesting a CSV reader
    therefore never imports the Parquet backend, and an unsupported format is
    rejected with ``ValueError`` before any backend import runs.

    Args:
        file_path: Path to the file to read.
        file_format: An ``InputFileType`` member, or the string ``'csv'`` or
            ``'parquet'`` (case-insensitive). If None, the format is detected
            from the path with ``InputFileType.from_path``; when that raises
            ``ValueError`` the file is treated as CSV.

    Returns:
        A ``CsvFileReader`` or ``ParquetFileReader`` for ``file_path``.

    Raises:
        ValueError: If ``file_format`` is an unrecognized string, or resolves
            to anything other than CSV or Parquet.
    """
    path = Path(file_path)

    # Convert string to enum if needed, or auto-detect from extension.
    if file_format is None:
        # Auto-detect with CSV fallback for unknown extensions (like .cdf).
        try:
            format_enum = InputFileType.from_path(str(path))
        except ValueError:
            format_enum = InputFileType.CSV
    elif isinstance(file_format, str):
        format_str = file_format.lower()
        if format_str == "csv":
            format_enum = InputFileType.CSV
        elif format_str == "parquet":
            format_enum = InputFileType.PARQUET
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    else:
        format_enum = file_format

    # Backends are imported lazily, per branch, so that only the selected
    # format's module (and whatever it depends on) has to be importable.
    if format_enum == InputFileType.CSV:
        from .csv_file import CsvFileReader

        return CsvFileReader(path)
    if format_enum == InputFileType.PARQUET:
        from .parquet_file import ParquetFileReader

        return ParquetFileReader(path)

    # CDF (and anything else) is not readable as tabular data. Fall back to
    # the raw object when it has no ``.value`` so that a non-enum argument
    # still produces the documented ValueError rather than AttributeError.
    format_label = getattr(format_enum, "value", format_enum)
    raise ValueError(f"Cannot read {format_label} files as tabular data")

175 

176 

def create_writer(
    file_path: str | Path, file_format: OutputFileType | str | None = None, append: bool = False
) -> TabularFileWriter:
    """Factory function to create the appropriate writer for a file format.

    The format is resolved and validated first, and only the backend module
    that is actually needed is imported afterwards. Requesting a CSV writer
    therefore never imports the Parquet backend, and an unsupported format is
    rejected with ``ValueError`` before any backend import runs.

    Args:
        file_path: Path to the file to write.
        file_format: An ``OutputFileType`` member, or the string ``'csv'`` or
            ``'parquet'`` (case-insensitive). If None, the format is detected
            from the path with ``OutputFileType.from_path``; unlike
            ``create_reader`` there is no CSV fallback here, so whatever that
            call raises propagates to the caller.
        append: If True, append to existing file. If False, overwrite.
            Passed through unchanged to the writer that is created.

    Returns:
        A ``CsvFileWriter`` or ``ParquetFileWriter`` for ``file_path``.

    Raises:
        ValueError: If the file format is not supported or cannot be detected.
    """
    path = Path(file_path)

    # Convert string to enum if needed, or auto-detect from extension.
    if file_format is None:
        format_enum = OutputFileType.from_path(str(path))
    elif isinstance(file_format, str):
        format_str = file_format.lower()
        if format_str == "csv":
            format_enum = OutputFileType.CSV
        elif format_str == "parquet":
            format_enum = OutputFileType.PARQUET
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    else:
        format_enum = file_format

    # Backends are imported lazily, per branch, so that only the selected
    # format's module (and whatever it depends on) has to be importable.
    if format_enum == OutputFileType.CSV:
        from .csv_file import CsvFileWriter

        return CsvFileWriter(path, append=append)
    if format_enum == OutputFileType.PARQUET:
        from .parquet_file import ParquetFileWriter

        return ParquetFileWriter(path, append=append)

    # Fall back to the raw object when it has no ``.value`` so that a
    # non-enum argument still produces the documented ValueError rather than
    # AttributeError.
    format_label = getattr(format_enum, "value", format_enum)
    raise ValueError(f"Unsupported file format: {format_label}")