Coverage for src / crump / cli_inspect.py: 82%
210 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-11 14:40 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-11 14:40 +0000
1"""Inspect command for examining CSV and CDF files."""
3from __future__ import annotations
5from pathlib import Path
7import click
8import numpy as np
9from rich.console import Console
10from rich.table import Table
12from crump.tabular_file import create_reader
# Shared Rich console used by every function in this module for output.
console = Console()

# Constants
# Upper bound on the number of component columns shown for a 2D CDF variable;
# wider arrays get a trailing "..." column instead.
MAX_COLUMNS_TO_DISPLAY = 10
# Formatted CDF attribute values longer than this are cut and suffixed "...".
MAX_VALUE_LENGTH = 80
def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format.

    The value is divided by 1024 until it fits the current unit, stepping
    through B, KB, MB and GB; anything larger is reported in TB. The unit
    check is made on the value rounded to one decimal place (the precision
    that is printed), so a size just below a unit boundary is shown as
    e.g. "1.0 MB" rather than "1024.0 KB".

    Args:
        size_bytes: File size in bytes

    Returns:
        Formatted file size string, e.g. "1.5 KB"
    """
    size: float = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        # Compare what will actually be displayed, not the raw value.
        if round(size, 1) < 1024.0:
            return f"{size:.1f} {unit}"
        size /= 1024.0
    return f"{size:.1f} TB"
def inspect_tabular(file_path: Path, num_records: int) -> None:
    """Inspect a tabular file (CSV or Parquet) and display summary information.

    Prints the file name, path and size, the column names, a table with the
    first ``num_records`` rows and a one-line summary including the total row
    count. Text that originates from the file (its name and path, column
    names and cell values) is escaped so Rich prints it literally instead of
    interpreting bracketed text as console markup.

    Args:
        file_path: Path to the tabular file
        num_records: Number of sample records to display

    Raises:
        click.ClickException: If the file cannot be read or parsed
    """
    from rich.markup import escape

    from crump.file_types import InputFileType

    # Detect file type for display
    try:
        detected_type = InputFileType.from_path(str(file_path))
        file_type = "Parquet" if detected_type == InputFileType.PARQUET else "CSV"
    except ValueError:
        file_type = "CSV"  # Default to CSV for unknown extensions

    console.print(f"\n[bold cyan]{file_type} File: {escape(file_path.name)}[/bold cyan]")
    console.print(f"[dim]Path: {escape(str(file_path))}[/dim]")

    # Get file size
    try:
        file_size = file_path.stat().st_size
        console.print(f"[dim]Size: {format_file_size(file_size)}[/dim]\n")
    except OSError as e:
        raise click.ClickException(f"Cannot access file: {e}") from e

    try:
        # Get total row count efficiently for Parquet files
        total_rows = None
        if file_type == "Parquet":
            try:
                import pyarrow.parquet as pq  # type: ignore[import-untyped]

                parquet_file = pq.ParquetFile(file_path)
                total_rows = parquet_file.metadata.num_rows
            except Exception:
                # Best effort only: if the metadata is unavailable the rows
                # are counted while reading instead.
                pass

        with create_reader(file_path) as reader:
            if not reader.fieldnames:
                console.print(f"[red]Error: No columns found in {file_type} file[/red]")
                return

            fieldnames = list(reader.fieldnames)

            # Display header
            console.print(f"[bold]Columns ({len(fieldnames)}):[/bold]")
            console.print(f" {escape(', '.join(fieldnames))}\n")

            # Create table for sample records
            table = Table(title=f"Sample Records (first {num_records})")
            for col in fieldnames:
                table.add_column(escape(col), style="cyan", overflow="fold")

            # Read and display only the sample records we need
            rows_read = 0
            for i, row in enumerate(reader):
                if i < num_records:
                    # Convert all values to (escaped) strings for display
                    table.add_row(*(escape(str(row.get(col, ""))) for col in fieldnames))
                elif total_rows is not None:
                    # Total already known from Parquet metadata: stop reading
                    break
                # Sampled rows and rows that are merely counted both add up
                rows_read += 1

            # If we didn't get total from metadata, use what we counted
            if total_rows is None:
                total_rows = rows_read

            console.print(table)

            # Display summary
            console.print(
                f"\n[green]Summary: {total_rows:,} rows total, "
                f"{len(fieldnames)} columns, {format_file_size(file_size)}[/green]"
            )

    except Exception as e:
        raise click.ClickException(f"Unexpected error reading {file_type} file: {e}") from e
def _format_attribute_value(attr_values: object) -> str:
    """Render a CDF attribute value as a single short display string.

    A non-empty list is summarised by its first entry, followed by a
    "(+ N more)" note when it holds further entries. Anything else,
    including an empty list, is passed through ``str``. Results longer than
    ``MAX_VALUE_LENGTH`` are shortened and end in "...".

    Args:
        attr_values: Attribute value(s) to format

    Returns:
        Formatted string representation
    """
    if isinstance(attr_values, list) and attr_values:
        extra = len(attr_values) - 1
        if extra:
            text = f"{attr_values[0]} (+ {extra} more)"
        else:
            text = str(attr_values[0])
    else:
        text = str(attr_values)

    # Short enough already: nothing to cut
    if len(text) <= MAX_VALUE_LENGTH:
        return text

    # Leave room for the three-character ellipsis
    return text[: MAX_VALUE_LENGTH - 3] + "..."
148def _format_data_value(value: object, is_numeric: bool = False) -> str:
149 """Format a data value for display.
151 Args:
152 value: The value to format
153 is_numeric: Whether the value is numeric
155 Returns:
156 Formatted string representation
157 """
158 if is_numeric:
159 try:
160 return f"{float(str(value)):.4g}"
161 except (ValueError, TypeError):
162 return str(value)
163 return str(value)
def _print_cdf_sample_rows(data: np.ndarray, num_recs: int, num_records: int) -> None:
    """Print sample values of one CDF variable held in a NumPy array.

    1D arrays are shown as an index/value table, 2D arrays as a table with
    one column per component (at most ``MAX_COLUMNS_TO_DISPLAY``), and
    arrays with more dimensions only by shape plus a short flattened sample.

    Args:
        data: The variable's data
        num_recs: Record count reported for the variable
        num_records: Number of sample records requested
    """
    from rich.markup import escape

    if data.ndim in (1, 2):
        # Never request a negative number of rows and never index past the
        # end of the array, whatever the reported record count says.
        shown = max(0, min(num_records, num_recs, data.shape[0]))

        sample_table = Table(show_header=True, box=None, padding=(0, 1))
        sample_table.add_column("Index", style="dim", justify="right")

        if data.ndim == 1:
            # 1D array - show as single column
            sample_table.add_column("Value")
            for i in range(shown):
                sample_table.add_row(str(i), escape(str(data[i])))
        else:
            # 2D array - show as table with one column per component
            num_cols_to_show = min(data.shape[1], MAX_COLUMNS_TO_DISPLAY)
            truncated = data.shape[1] > MAX_COLUMNS_TO_DISPLAY
            for col_idx in range(num_cols_to_show):
                sample_table.add_column(f"[{col_idx}]")
            if truncated:
                sample_table.add_column("...")

            for i in range(shown):
                row_values = [str(i)]
                row_values.extend(
                    escape(_format_data_value(data[i, col_idx], is_numeric=True))
                    for col_idx in range(num_cols_to_show)
                )
                if truncated:
                    row_values.append("...")
                sample_table.add_row(*row_values)

        console.print(sample_table)

        # Report what was left out, based on the rows actually shown
        remaining = num_recs - shown
        if remaining > 0:
            console.print(f" [dim]... {remaining:,} more records[/dim]")

    elif data.ndim > 2:
        # Multi-dimensional - just show shape info
        console.print(f" [dim]Multi-dimensional data: {data.shape}[/dim]")
        if num_recs > 0 and data.shape[0] > 0:
            first_record = data[0]
            sample_text = escape(str(first_record.flatten()[:5]))
            console.print(f" [dim]First record shape: {first_record.shape}[/dim]")
            console.print(f" [dim]Sample value: {sample_text}...[/dim]")


def _print_cdf_value_range(data: np.ndarray, num_recs: int) -> None:
    """Print the min/max of a numeric CDF variable, ignoring NaN values.

    Nothing is printed for non-numeric data, for variables without records,
    or when no non-NaN value exists.

    Args:
        data: The variable's data
        num_recs: Record count reported for the variable
    """
    if not (np.issubdtype(data.dtype, np.number) and num_recs > 0):
        return

    try:
        flat_data = data.flatten()
        valid_data = flat_data[~np.isnan(flat_data)]
        if len(valid_data) > 0:
            console.print(
                f" [dim]Value range: [{np.min(valid_data):.4g}, "
                f"{np.max(valid_data):.4g}][/dim]"
            )
    except Exception:
        # The range is a nicety; silently skip it if it cannot be computed
        pass


def inspect_cdf(file_path: Path, num_records: int) -> None:
    """Inspect a CDF file and display summary information.

    Prints the file name, path and size, the CDF version/encoding/majority,
    the global attributes, a summary table of all variables and, per
    variable (sorted by record count, descending), selected attributes,
    sample values and the numeric value range. Text that originates from
    the file is escaped so Rich prints it literally instead of interpreting
    bracketed text (for example a unit such as "[nT]") as console markup.

    Args:
        file_path: Path to the CDF file
        num_records: Number of sample records to display per variable

    Raises:
        click.ClickException: If the file cannot be read or parsed
    """
    try:
        import cdflib  # type: ignore[import-untyped]
    except ImportError:
        console.print(
            "[red]Error: cdflib is not installed. Install it with: pip install cdflib[/red]"
        )
        raise click.ClickException("cdflib is required for CDF file inspection") from None

    from rich.markup import escape

    console.print(f"\n[bold cyan]CDF File: {escape(file_path.name)}[/bold cyan]")
    console.print(f"[dim]Path: {escape(str(file_path))}[/dim]")

    # Get file size
    try:
        file_size = file_path.stat().st_size
        console.print(f"[dim]Size: {format_file_size(file_size)}[/dim]\n")
    except OSError as e:
        raise click.ClickException(f"Cannot access file: {e}") from e

    try:
        # Read variables using our reader to get EPOCH conversion
        from crump.cdf_reader import read_cdf_variables

        cdf_variables = read_cdf_variables(file_path)

        with cdflib.CDF(str(file_path)) as cdf:
            # Get CDF info
            info = cdf.cdf_info()
            console.print(f"[bold]CDF Version:[/bold] {info.Version}")
            console.print(f"[bold]Encoding:[/bold] {info.Encoding}")
            console.print(f"[bold]Majority:[/bold] {info.Majority}\n")

            # Display global attributes
            console.print("[bold]Global Attributes:[/bold]")
            global_attrs = cdf.globalattsget()
            attr_table = Table(show_header=True, box=None, padding=(0, 1))
            attr_table.add_column("Attribute", style="yellow")
            attr_table.add_column("Value", style="dim")

            for attr_name, attr_values in sorted(global_attrs.items()):
                attr_table.add_row(
                    escape(str(attr_name)),
                    escape(_format_attribute_value(attr_values)),
                )

            console.print(attr_table)

            # Collect all variables from the converted data, largest first
            var_info_list = sorted(
                ((var.name, var.data, var.num_records) for var in cdf_variables),
                key=lambda item: item[2],
                reverse=True,
            )

            # Display variable summary
            console.print(f"\n[bold]Variables ({len(var_info_list)}):[/bold]")
            var_summary_table = Table(show_header=True)
            var_summary_table.add_column("Variable", style="cyan")
            var_summary_table.add_column("Type", style="yellow")
            var_summary_table.add_column("Shape", style="green")
            var_summary_table.add_column("Records", style="magenta", justify="right")

            for var_name, data, num_recs in var_info_list:
                if isinstance(data, np.ndarray):
                    dtype_str = str(data.dtype)
                    shape_str = str(data.shape)
                else:
                    dtype_str = type(data).__name__
                    shape_str = "scalar"

                var_summary_table.add_row(
                    escape(var_name), dtype_str, shape_str, f"{num_recs:,}"
                )

            console.print(var_summary_table)

            # Display detailed information for each variable with sample data
            console.print("\n[bold]Variable Details (sorted by record count):[/bold]\n")

            for var_name, data, num_recs in var_info_list:
                console.print(f"[bold cyan]{escape(var_name)}[/bold cyan]")

                # Get variable attributes; a variable without readable
                # attributes is still worth showing
                try:
                    var_attrs = cdf.varattsget(var_name)
                except Exception:
                    var_attrs = {}

                # Show key attributes
                if var_attrs:
                    attr_lines = [
                        f"{key}: {var_attrs[key]}"
                        for key in ("FIELDNAM", "CATDESC", "UNITS", "VAR_TYPE")
                        if key in var_attrs
                    ]
                    if attr_lines:
                        console.print(f" [dim]{escape(' | '.join(attr_lines))}[/dim]")

                # Show data structure
                if isinstance(data, np.ndarray):
                    console.print(
                        f" Shape: {data.shape} | Type: {data.dtype} | Records: {num_recs:,}"
                    )
                    _print_cdf_sample_rows(data, num_recs, num_records)
                    _print_cdf_value_range(data, num_recs)
                else:
                    console.print(
                        f" Type: {type(data).__name__} | Value: {escape(str(data))}"
                    )

                console.print()  # Blank line between variables

    except Exception as e:
        raise click.ClickException(f"Error reading CDF file: {e}") from e
@click.command()
@click.argument("files", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option(
    "--max-records",
    "-n",
    type=int,
    default=10,
    help="Number of sample records to display (default: 10)",
)
def inspect(files: tuple[Path, ...], max_records: int) -> None:
    """Inspect CSV, Parquet, or CDF files and display summary information.

    Displays file metadata, structure, and sample data for each file.
    Supports CSV, Parquet, and CDF file formats.

    Arguments:
        FILES: One or more file paths to inspect

    Examples:
        # Inspect a single CSV file
        crump inspect data.csv

        # Inspect a Parquet file
        crump inspect data.parquet

        # Inspect multiple files with custom record count
        crump inspect file1.csv file2.parquet file3.cdf --max-records 20

        # Inspect all CSV files in a directory
        crump inspect data/*.csv
    """
    # NOTE: the docstring above is the command's --help text; keep it
    # user-facing. Imported before the ``try`` so the error handler below can
    # always escape the message it prints.
    from rich.markup import escape

    try:
        from crump.file_types import InputFileType

        for file_path in files:
            # Determine file type and inspect
            supported = True
            try:
                file_type = InputFileType.from_path(str(file_path))

                if file_type in (InputFileType.CSV, InputFileType.PARQUET):
                    inspect_tabular(file_path, max_records)
                elif file_type == InputFileType.CDF:
                    inspect_cdf(file_path, max_records)
                else:
                    supported = False
            except ValueError:
                # Raised for extensions InputFileType does not recognise
                supported = False

            if not supported:
                # File names come from the user: escape them so bracketed
                # text is printed literally rather than parsed as markup.
                console.print(
                    f"\n[yellow]Warning: Unsupported file type '{escape(file_path.suffix)}' "
                    f"for {escape(file_path.name)}[/yellow]"
                )

            # Add separator between files if multiple
            if len(files) > 1:
                console.print("\n" + "=" * 80 + "\n")

    except Exception as e:
        # Covers click.ClickException raised by the inspect_* helpers too.
        # Escaping keeps a message containing e.g. "[/x]" from raising a
        # MarkupError here and hiding the real error.
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        raise click.Abort() from e