Coverage for src / crump / cli_inspect.py: 82%

210 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-11 14:40 +0000

1"""Inspect command for examining CSV and CDF files.""" 

2 

3from __future__ import annotations 

4 

5from pathlib import Path 

6 

7import click 

8import numpy as np 

9from rich.console import Console 

10from rich.table import Table 

11 

12from crump.tabular_file import create_reader 

13 

# Shared Rich console; every function in this module prints through it.
console = Console()

# Constants
# Maximum number of component columns rendered per row for 2D CDF variables.
MAX_COLUMNS_TO_DISPLAY = 10
# CDF attribute values longer than this are truncated and suffixed with "...".
MAX_VALUE_LENGTH = 80

19 

20 

def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached, then formatted with one decimal place.

    Args:
        size_bytes: File size in bytes

    Returns:
        The scaled size followed by its unit, e.g. ``"1.5 KB"``
    """
    scaled = float(size_bytes)
    label = "TB"  # Used when the value never drops below 1024 in a smaller unit
    for candidate in ("B", "KB", "MB", "GB"):
        if scaled < 1024.0:
            label = candidate
            break
        scaled /= 1024.0
    return f"{scaled:.1f} {label}"

36 

37 

def inspect_tabular(file_path: Path, num_records: int) -> None:
    """Inspect a tabular file (CSV or Parquet) and display summary information.

    Prints the file name, path and size, the column names, a table with the
    first ``num_records`` rows, and a one-line summary including the total
    row count. If the reader reports no columns, an error line is printed and
    the function returns without raising.

    Text that originates from the file or its path (column names, cell values,
    file name) is escaped before printing, because Rich interprets
    ``[...]`` sequences as console markup: unescaped, such text can vanish
    from the output or raise a markup error.

    Args:
        file_path: Path to the tabular file
        num_records: Number of sample records to display

    Raises:
        click.ClickException: If the file cannot be read or parsed
    """
    # Function-scope import, in line with the other local imports in this module.
    from rich.markup import escape

    # Detect file type for display
    from crump.file_types import InputFileType

    try:
        detected_type = InputFileType.from_path(str(file_path))
        file_type = "Parquet" if detected_type == InputFileType.PARQUET else "CSV"
    except ValueError:
        file_type = "CSV"  # Default to CSV for unknown extensions

    console.print(f"\n[bold cyan]{file_type} File: {escape(file_path.name)}[/bold cyan]")
    console.print(f"[dim]Path: {escape(str(file_path))}[/dim]")

    # Get file size; only the stat() call is guarded so that unrelated errors
    # are not reported as "Cannot access file".
    try:
        file_size = file_path.stat().st_size
    except OSError as e:
        raise click.ClickException(f"Cannot access file: {e}") from e
    console.print(f"[dim]Size: {format_file_size(file_size)}[/dim]\n")

    try:
        # Get total row count efficiently for Parquet files
        total_rows = None
        if file_type == "Parquet":
            try:
                import pyarrow.parquet as pq  # type: ignore[import-untyped]

                total_rows = pq.ParquetFile(file_path).metadata.num_rows
            except Exception:
                # Deliberate best effort: if the metadata is unavailable the
                # rows are counted while reading instead.
                pass

        with create_reader(file_path) as reader:
            if not reader.fieldnames:
                console.print(f"[red]Error: No columns found in {file_type} file[/red]")
                return

            columns = list(reader.fieldnames)

            # Display header (escape the joined text so brackets spanning
            # several column names cannot form a markup tag either)
            console.print(f"[bold]Columns ({len(columns)}):[/bold]")
            console.print(f" {escape(', '.join(columns))}\n")

            # Create table for sample records
            table = Table(title=f"Sample Records (first {num_records})")
            for col in columns:
                table.add_column(escape(col), style="cyan", overflow="fold")

            # Read the sample rows; keep iterating afterwards only when the
            # total row count still has to be determined by counting.
            rows_read = 0
            for i, row in enumerate(reader):
                if i < num_records:
                    # Convert all values to strings for display
                    table.add_row(*(escape(str(row.get(col, ""))) for col in columns))
                elif total_rows is not None:
                    # Total already known from Parquet metadata, stop reading
                    break
                rows_read += 1

            # If we didn't get total from metadata, use what we counted
            if total_rows is None:
                total_rows = rows_read

            console.print(table)

            # Display summary
            console.print(
                f"\n[green]Summary: {total_rows:,} rows total, "
                f"{len(columns)} columns, {format_file_size(file_size)}[/green]"
            )

    except Exception as e:
        raise click.ClickException(f"Unexpected error reading {file_type} file: {e}") from e

123 

124 

def _format_attribute_value(attr_values: object) -> str:
    """Turn a CDF attribute value into a short display string.

    A one-element list is shown as that element, a longer list as its first
    element followed by ``(+ N more)``, and anything else (including an empty
    list) via ``str()``. Results longer than ``MAX_VALUE_LENGTH`` are cut and
    terminated with ``"..."``.

    Args:
        attr_values: Attribute value(s) to format

    Returns:
        Formatted string representation
    """
    if not isinstance(attr_values, list) or not attr_values:
        text = str(attr_values)
    else:
        extra = len(attr_values) - 1
        if extra == 0:
            text = str(attr_values[0])
        else:
            text = f"{attr_values[0]} (+ {extra} more)"

    if len(text) <= MAX_VALUE_LENGTH:
        return text

    # Reserve three characters for the ellipsis that marks the truncation.
    return text[: MAX_VALUE_LENGTH - 3] + "..."

146 

147 

148def _format_data_value(value: object, is_numeric: bool = False) -> str: 

149 """Format a data value for display. 

150 

151 Args: 

152 value: The value to format 

153 is_numeric: Whether the value is numeric 

154 

155 Returns: 

156 Formatted string representation 

157 """ 

158 if is_numeric: 

159 try: 

160 return f"{float(str(value)):.4g}" 

161 except (ValueError, TypeError): 

162 return str(value) 

163 return str(value) 

164 

165 

def inspect_cdf(file_path: Path, num_records: int) -> None:
    """Inspect a CDF file and display summary information.

    Prints the file name, path and size, the CDF version, encoding and
    majority, the global attributes, a summary table of all variables sorted
    by record count (descending), and per-variable details with up to
    ``num_records`` sample rows for 1D and 2D arrays.

    Text that originates from the file (attribute names and values, variable
    names, dtype names, sample values) is escaped before printing, because
    Rich interprets ``[...]`` sequences as console markup. Unescaped, strings
    such as a ``datetime64[ns]`` dtype name, an array rendered as
    ``[nan nan ...]`` or an attribute containing ``[nT]`` would lose their
    bracketed part or raise a markup error.

    Args:
        file_path: Path to the CDF file
        num_records: Number of sample records to display per variable

    Raises:
        click.ClickException: If cdflib is missing or the file cannot be read
            or parsed
    """
    # Function-scope import, in line with the other local imports in this module.
    from rich.markup import escape

    try:
        import cdflib  # type: ignore[import-untyped]
    except ImportError:
        console.print(
            "[red]Error: cdflib is not installed. Install it with: pip install cdflib[/red]"
        )
        raise click.ClickException("cdflib is required for CDF file inspection") from None

    console.print(f"\n[bold cyan]CDF File: {escape(file_path.name)}[/bold cyan]")
    console.print(f"[dim]Path: {escape(str(file_path))}[/dim]")

    # Get file size; only the stat() call is guarded so that unrelated errors
    # are not reported as "Cannot access file".
    try:
        size_text = format_file_size(file_path.stat().st_size)
    except OSError as e:
        raise click.ClickException(f"Cannot access file: {e}") from e
    console.print(f"[dim]Size: {size_text}[/dim]\n")

    try:
        # Read variables using our reader to get EPOCH conversion
        from crump.cdf_reader import read_cdf_variables

        cdf_variables = read_cdf_variables(file_path)

        with cdflib.CDF(str(file_path)) as cdf:
            # Get CDF info
            info = cdf.cdf_info()
            console.print(f"[bold]CDF Version:[/bold] {escape(str(info.Version))}")
            console.print(f"[bold]Encoding:[/bold] {escape(str(info.Encoding))}")
            console.print(f"[bold]Majority:[/bold] {escape(str(info.Majority))}\n")

            # Display global attributes
            console.print("[bold]Global Attributes:[/bold]")
            global_attrs = cdf.globalattsget()
            attr_table = Table(show_header=True, box=None, padding=(0, 1))
            attr_table.add_column("Attribute", style="yellow")
            attr_table.add_column("Value", style="dim")

            for attr_name, attr_values in sorted(global_attrs.items()):
                # Truncate first, then escape, so the visible length is unchanged
                attr_table.add_row(
                    escape(str(attr_name)), escape(_format_attribute_value(attr_values))
                )

            console.print(attr_table)

            # Collect (name, data, record count) using the converted data,
            # sorted by number of records (descending)
            var_info_list = sorted(
                ((var.name, var.data, var.num_records) for var in cdf_variables),
                key=lambda entry: entry[2],
                reverse=True,
            )

            # Display variable summary
            console.print(f"\n[bold]Variables ({len(var_info_list)}):[/bold]")
            var_summary_table = Table(show_header=True)
            var_summary_table.add_column("Variable", style="cyan")
            var_summary_table.add_column("Type", style="yellow")
            var_summary_table.add_column("Shape", style="green")
            var_summary_table.add_column("Records", style="magenta", justify="right")

            for var_name, data, num_recs in var_info_list:
                if isinstance(data, np.ndarray):
                    dtype_str = str(data.dtype)
                    shape_str = str(data.shape)
                else:
                    dtype_str = type(data).__name__
                    shape_str = "scalar"

                var_summary_table.add_row(
                    escape(str(var_name)), escape(dtype_str), shape_str, f"{num_recs:,}"
                )

            console.print(var_summary_table)

            # Display detailed information for each variable with sample data
            console.print("\n[bold]Variable Details (sorted by record count):[/bold]\n")

            for var_name, data, num_recs in var_info_list:
                console.print(f"[bold cyan]{escape(str(var_name))}[/bold cyan]")

                # Get variable attributes; missing attributes are not fatal,
                # the variable is simply shown without them.
                try:
                    var_attrs = cdf.varattsget(var_name)
                except Exception:
                    var_attrs = {}

                # Show key attributes
                if var_attrs:
                    attr_lines = [
                        f"{key}: {var_attrs[key]}"
                        for key in ["FIELDNAM", "CATDESC", "UNITS", "VAR_TYPE"]
                        if key in var_attrs
                    ]
                    if attr_lines:
                        console.print(f" [dim]{escape(' | '.join(attr_lines))}[/dim]")

                if not isinstance(data, np.ndarray):
                    console.print(
                        f" Type: {type(data).__name__} | Value: {escape(str(data))}"
                    )
                    console.print()  # Blank line between variables
                    continue

                # Show data structure
                console.print(
                    f" Shape: {data.shape} | Type: {escape(str(data.dtype))} "
                    f"| Records: {num_recs:,}"
                )

                sample_count = min(num_records, num_recs)

                if data.ndim == 1:
                    # 1D array - show as single column
                    sample_table = Table(show_header=True, box=None, padding=(0, 1))
                    sample_table.add_column("Index", style="dim", justify="right")
                    sample_table.add_column("Value")

                    for i in range(sample_count):
                        sample_table.add_row(str(i), escape(str(data[i])))

                    console.print(sample_table)

                elif data.ndim == 2:
                    # 2D array - show as table with columns
                    sample_table = Table(show_header=True, box=None, padding=(0, 1))
                    sample_table.add_column("Index", style="dim", justify="right")

                    # Add columns for each component, capped at the display limit
                    num_cols_to_show = min(data.shape[1], MAX_COLUMNS_TO_DISPLAY)
                    columns_truncated = data.shape[1] > MAX_COLUMNS_TO_DISPLAY
                    for col_idx in range(num_cols_to_show):
                        sample_table.add_column(f"[{col_idx}]")

                    if columns_truncated:
                        sample_table.add_column("...")

                    for i in range(sample_count):
                        row_values = [str(i)]
                        row_values.extend(
                            escape(_format_data_value(data[i, col_idx], is_numeric=True))
                            for col_idx in range(num_cols_to_show)
                        )
                        if columns_truncated:
                            row_values.append("...")
                        sample_table.add_row(*row_values)

                    console.print(sample_table)

                elif data.ndim > 2:
                    # Multi-dimensional - just show shape info
                    console.print(f" [dim]Multi-dimensional data: {data.shape}[/dim]")
                    if num_recs > 0:
                        console.print(f" [dim]First record shape: {data[0].shape}[/dim]")
                        sample_text = escape(str(data[0].flatten()[:5]))
                        console.print(f" [dim]Sample value: {sample_text}...[/dim]")

                # Only the tabular (1D/2D) views report how many records were left out
                if data.ndim in (1, 2) and num_recs > num_records:
                    console.print(
                        f" [dim]... {num_recs - num_records:,} more records[/dim]"
                    )

                # Show value range for numeric data
                if np.issubdtype(data.dtype, np.number) and num_recs > 0:
                    try:
                        flat_data = data.flatten()
                        valid_data = flat_data[~np.isnan(flat_data)]
                        if valid_data.size > 0:
                            range_text = escape(
                                f"[{np.min(valid_data):.4g}, {np.max(valid_data):.4g}]"
                            )
                            console.print(f" [dim]Value range: {range_text}[/dim]")
                    except Exception:
                        # Deliberate best effort: the range line is optional,
                        # so it is skipped if it cannot be computed.
                        pass

                console.print()  # Blank line between variables

    except Exception as e:
        raise click.ClickException(f"Error reading CDF file: {e}") from e

352 

353 

@click.command()
@click.argument("files", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option(
    "--max-records",
    "-n",
    type=int,
    default=10,
    help="Number of sample records to display (default: 10)",
)
def inspect(files: tuple[Path, ...], max_records: int) -> None:
    """Inspect CSV, Parquet, or CDF files and display summary information.

    Displays file metadata, structure, and sample data for each file.
    Supports CSV, Parquet, and CDF file formats.

    Arguments:
        FILES: One or more file paths to inspect

    Examples:
        # Inspect a single CSV file
        crump inspect data.csv

        # Inspect a Parquet file
        crump inspect data.parquet

        # Inspect multiple files with custom record count
        crump inspect file1.csv file2.parquet file3.cdf --max-records 20

        # Inspect all CSV files in a directory
        crump inspect data/*.csv
    """
    # Function-scope import, in line with the other local imports in this module.
    # Paths and error messages are escaped before printing because Rich would
    # otherwise interpret "[...]" sequences in them as console markup.
    from rich.markup import escape

    try:
        from crump.file_types import InputFileType

        for file_path in files:
            # Determine the file type. Only the detection itself is guarded, so
            # a ValueError raised while inspecting a file is not misreported
            # as an unsupported file type.
            try:
                file_type = InputFileType.from_path(str(file_path))
            except ValueError:
                file_type = None

            if file_type in (InputFileType.CSV, InputFileType.PARQUET):
                inspect_tabular(file_path, max_records)
            elif file_type == InputFileType.CDF:
                inspect_cdf(file_path, max_records)
            else:
                # Unknown extension, or a recognised type with no inspector
                console.print(
                    f"\n[yellow]Warning: Unsupported file type '{escape(file_path.suffix)}' "
                    f"for {escape(file_path.name)}[/yellow]"
                )

            # Separator line, printed after every file (including the last)
            # whenever more than one file was given
            if len(files) > 1:
                console.print("\n" + "=" * 80 + "\n")

    except Exception as e:
        # Top-level boundary: report the failure and abort the command
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        raise click.Abort() from e