Coverage for src / crump / cli_extract.py: 86%
168 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-11 14:40 +0000
1"""Extract command for converting CDF files to CSV."""
3from __future__ import annotations
5from pathlib import Path
7import click
8from rich.console import Console
9from rich.table import Table
11from crump.cdf_extractor import extract_cdf_to_tabular_file, extract_cdf_with_config
12from crump.config import CrumpConfig
13from crump.console_utils import CHECKMARK
15console = Console()
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    Args:
        size_bytes: File size in bytes

    Returns:
        The size with one decimal place followed by a unit (B, KB, MB, GB or TB)
    """
    units = ("B", "KB", "MB", "GB")
    remaining = float(size_bytes)
    position = 0
    while position < len(units):
        # Stop at the first unit in which the value drops below 1024.
        if remaining < 1024.0:
            return f"{remaining:.1f} {units[position]}"
        remaining = remaining / 1024.0
        position += 1
    # Anything that is still >= 1024 GB is reported in TB, however large.
    return f"{remaining:.1f} TB"
@click.command()
@click.argument("files", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option(
    "--output-path",
    "-o",
    type=click.Path(path_type=Path),
    default=None,
    help="Output directory for output files (default: current directory)",
)
@click.option(
    "--filename",
    type=str,
    default="[SOURCE_FILE]-[VARIABLE_NAME].csv",
    help="Filename template for output files. Use [SOURCE_FILE] and [VARIABLE_NAME] as placeholders.",
)
@click.option(
    "--automerge/--no-automerge",
    default=True,
    help="Merge variables with the same record count into a single file (default: enabled)",
)
@click.option(
    "--append",
    is_flag=True,
    default=False,
    help="Append to existing files instead of overwriting (default: disabled)",
)
@click.option(
    "--variables",
    "-v",
    multiple=True,
    help="Specific variable names to extract (can be specified multiple times). Default: extract all variables.",
)
@click.option(
    "--max-records",
    type=int,
    default=None,
    help="Maximum number of records to extract per variable (default: extract all records)",
)
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="YAML configuration file for column mapping. Applies same transformations as sync command.",
)
@click.option(
    "--job",
    "-j",
    type=str,
    default=None,
    help="Job name from config file (optional - auto-detected if config contains only one job).",
)
@click.option(
    "--parquet",
    is_flag=True,
    default=False,
    help="Output to Parquet format instead of CSV (default: disabled)",
)
def extract(
    files: tuple[Path, ...],
    output_path: Path | None,
    filename: str,
    automerge: bool,
    append: bool,
    variables: tuple[str, ...],
    max_records: int | None,
    config: Path | None,
    job: str | None,
    parquet: bool,
) -> None:
    """Extract data from CDF files to CSV or Parquet format.

    Reads CDF science data files and extracts variable data into CSV or Parquet files.
    Variables with array data are expanded into multiple columns with sensible names.

    When using --config and --job, the extract command applies the same column mappings
    and transformations that would be used by the sync command, but outputs to CSV/Parquet
    instead of a database.

    Arguments:
        FILES: One or more CDF files to extract data from

    Examples:
        # Extract all variables from a CDF file (raw dump)
        crump extract data.cdf

        # Extract to a specific directory with custom filename
        crump extract data.cdf -o output/ --filename "[SOURCE_FILE]_data.csv"

        # Extract specific variables without auto-merging
        crump extract data.cdf -v epoch -v vectors --no-automerge

        # Extract and append to existing CSV files
        crump extract data1.cdf data2.cdf --append

        # Extract first 100 records from each variable
        crump extract data.cdf --max-records 100

        # Extract multiple files with auto-merge enabled
        crump extract *.cdf -o csv_output/

        # Extract using config file (applies column mappings and transformations)
        crump extract data.cdf -o output/ --config crump_config.yml --job my_job

        # Extract with auto-detected job (when config has only one job)
        crump extract data.cdf -o output/ --config crump_config.yml

        # Extract with config and limited records
        crump extract data.cdf --config crump_config.yml --job my_job --max-records 100

        # Extract with config and append to existing transformed CSV
        crump extract data1.cdf --config crump_config.yml --append
        crump extract data2.cdf --config crump_config.yml --append

        # Extract with config and custom filename template
        crump extract data.cdf --config crump_config.yml --filename "processed_[SOURCE_FILE].csv"

        # Extract with config, specific variables, and custom filename
        crump extract data.cdf --config crump_config.yml -v epoch -v vectors --filename "vectors_[SOURCE_FILE].csv"

        # Extract to Parquet format instead of CSV
        crump extract data.cdf --parquet

        # Extract to Parquet with config
        crump extract data.cdf --config crump_config.yml --parquet
    """
    try:
        # --job only has meaning relative to a config file, so reject it on its own.
        if job is not None and config is None:
            console.print("[red]Error:[/red] --job requires --config to be specified.")
            raise click.Abort()

        # Function-scope import kept from the original; the reason is not visible
        # here (possibly to avoid an import cycle) - confirm before hoisting it.
        from crump.file_types import OutputFileType

        if parquet:
            # Swap a ".csv" suffix (any case) on the template for the Parquet one.
            # Only the final path component is kept, exactly as before.
            # NOTE(review): a template containing a directory part would lose it here.
            filename_path = Path(filename)
            if filename_path.suffix.lower() == ".csv":
                filename = filename_path.stem + "." + OutputFileType.PARQUET.value

        # Mode 1: Config-based extraction (applies column mappings)
        if config is not None:
            return _extract_with_config(
                files,
                output_path,
                config,
                job,
                max_records,
                automerge,
                variables,
                append,
                filename,
                parquet,
            )

        # Mode 2: Raw extraction (current behavior)
        return _extract_raw(
            files, output_path, filename, automerge, append, variables, max_records, parquet
        )

    except click.Abort:
        # click.Abort derives from RuntimeError, so the generic handler below would
        # otherwise catch aborts whose cause has already been printed and emit a
        # second, empty "Error:" line. Let them propagate untouched.
        raise
    except Exception as e:
        # Top-level CLI boundary: report anything unexpected and abort the command.
        console.print(f"[red]Error:[/red] {e}")
        raise click.Abort() from e
def _extract_with_config(
    files: tuple[Path, ...],
    output_path: Path | None,
    config_path: Path,
    job_name: str | None,
    max_records: int | None,
    automerge: bool,
    variables: tuple[str, ...],
    append: bool,
    filename: str,
    parquet: bool,
) -> None:
    """Run config-driven extraction over each CDF file and print a report.

    Args:
        files: CDF files to extract
        output_path: Output directory for output files (current directory if None)
        config_path: Path to YAML config file
        job_name: Job name from config (None to let the config auto-detect one)
        max_records: Maximum records to extract
        automerge: Whether to merge variables with same record count
        variables: Specific variable names to extract
        append: Whether to append to existing files
        filename: Filename template for output files
        parquet: Whether to output Parquet format instead of CSV

    Raises:
        click.Abort: If no job can be resolved for a file (unknown job name,
            config without jobs, or a ValueError from the job lookup).
    """

    def print_dim(text: str) -> None:
        # Indented, dimmed detail line used by the run header and the summary.
        console.print(f"[dim] {text}[/dim]")

    crump_config = CrumpConfig.from_yaml(config_path)

    # Fall back to the working directory and make sure the target exists.
    target_dir = output_path or Path.cwd()
    target_dir.mkdir(parents=True, exist_ok=True)

    # The extractor receives None rather than an empty list when nothing was selected.
    wanted_vars = list(variables) or None

    if parquet:
        format_label = "Parquet"
        stock_template = "[SOURCE_FILE]-[VARIABLE_NAME].parquet"
    else:
        format_label = "CSV"
        stock_template = "[SOURCE_FILE]-[VARIABLE_NAME].csv"

    # Run header
    console.print(
        f"[cyan]Extracting {len(files)} CDF file(s) to {format_label} with config-based mapping...[/cyan]"
    )
    print_dim(f"Config: {config_path.name}")
    print_dim(f"Job: {job_name}")
    print_dim(f"Output directory: {target_dir}")
    print_dim(f"Format: {format_label}")
    if wanted_vars:
        print_dim(f"Variables: {', '.join(wanted_vars)}")
    print_dim(f"Auto-merge: {automerge}")
    print_dim(f"Append mode: {append}")
    # The template is only worth showing when it differs from the stock one.
    if filename != stock_template:
        print_dim(f"Filename template: {filename}")
    if max_records is not None:
        print_dim(f"Max records: {max_records:,}")
    console.print()

    # (header, add_column keyword arguments) for the per-file results table.
    column_specs = (
        ("Output File", {"style": "cyan"}),
        ("Variables", {"style": "yellow"}),
        ("Columns", {"justify": "right", "style": "green"}),
        ("Rows", {"justify": "right", "style": "magenta"}),
        ("Size", {"justify": "right", "style": "dim"}),
    )

    files_written = 0
    rows_written = 0

    for cdf_file in files:
        console.print(f"[bold]Processing:[/bold] {cdf_file.name}")

        # Resolve the job for this file; the lookup is given the file's path.
        try:
            job_result = crump_config.get_job_or_auto_detect(job_name, filename=cdf_file.as_posix())
            if not job_result:
                if not job_name:
                    console.print("[red]Error:[/red] Config file contains no jobs")
                else:
                    known_jobs = ", ".join(crump_config.jobs.keys())
                    console.print(f"[red]Error:[/red] Job '{job_name}' not found in config")
                    console.print(f"[dim]Available jobs: {known_jobs}[/dim]")
                raise click.Abort()

            crump_job, detected_job_name = job_result

            # Tell the user which job was picked when none was named.
            if job_name is None:
                console.print(f"[dim]Auto-detected job: {detected_job_name}[/dim]")

        except ValueError as lookup_error:
            # Multiple jobs found, need explicit job name
            known_jobs = ", ".join(crump_config.jobs.keys())
            console.print(f"[red]Error:[/red] {lookup_error}")
            console.print(f"[dim]Available jobs: {known_jobs}[/dim]")
            raise click.Abort() from lookup_error

        # A failure on one file is reported and the loop moves on to the next.
        try:
            results = extract_cdf_with_config(
                cdf_file_path=cdf_file,
                output_dir=target_dir,
                job=crump_job,
                max_records=max_records,
                automerge=automerge,
                variable_names=wanted_vars,
                append=append,
                filename_template=filename,
                use_parquet=parquet,
            )

            if not results:
                console.print(
                    "[yellow] No matching data found - column mappings don't match any extracted data[/yellow]\n"
                )
            else:
                table = Table(show_header=True, box=None, padding=(0, 1))
                for header, options in column_specs:
                    table.add_column(header, **options)

                for item in results:
                    names = ", ".join(item.variable_names)
                    # Keep the variables column to at most 40 characters.
                    if len(names) > 40:
                        names = f"{names[:37]}..."

                    table.add_row(
                        item.output_file.name,
                        names,
                        str(item.num_columns),
                        f"{item.num_rows:,}",
                        format_file_size(item.file_size),
                    )

                    files_written += 1
                    rows_written += item.num_rows

                console.print(table)
                console.print()

        except Exception as failure:
            # ValueError is reported as a plain error, anything else as unexpected.
            prefix = "Error" if isinstance(failure, ValueError) else "Unexpected error"
            console.print(f"[red]{prefix} processing {cdf_file.name}:[/red] {failure}\n")

    # Final summary
    console.print(f"[bold green]{CHECKMARK} Extraction complete[/bold green]")
    print_dim(f"Created {files_written} {format_label} file(s)")
    print_dim(f"Total rows extracted: {rows_written:,}")
    print_dim(f"Output directory: {target_dir.absolute()}")
def _extract_raw(
    files: tuple[Path, ...],
    output_path: Path | None,
    filename: str,
    automerge: bool,
    append: bool,
    variables: tuple[str, ...],
    max_records: int | None,
    parquet: bool,
) -> None:
    """Extract CDF files with raw dump (original behavior).

    Each file is passed to ``extract_cdf_to_tabular_file``; a table of the files
    written is printed per input, followed by a summary of totals.

    Args:
        files: CDF files to extract
        output_path: Output directory for output files (current directory if None)
        filename: Filename template
        automerge: Whether to merge variables
        append: Whether to append to existing files
        variables: Specific variables to extract
        max_records: Maximum records to extract
        parquet: Whether to output Parquet format instead of CSV

    Raises:
        click.Abort: If anything outside the per-file extraction fails (for
            example the output directory cannot be created). Per-file
            extraction errors are printed and the remaining files still run.
    """
    try:
        # Determine output directory
        output_dir = output_path if output_path else Path.cwd()
        # Create it up front, as _extract_with_config does, so raw mode does not
        # depend on the extractor creating it. Harmless if it already exists.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Convert variables tuple to list (None if empty)
        variable_list = list(variables) if variables else None

        file_format = "Parquet" if parquet else "CSV"
        console.print(
            f"[cyan]Extracting data from {len(files)} CDF file(s) to {file_format}...[/cyan]"
        )
        if variable_list:
            console.print(f"[dim] Extracting variables: {', '.join(variable_list)}[/dim]")
        console.print(f"[dim] Output directory: {output_dir}[/dim]")
        console.print(f"[dim] Format: {file_format}[/dim]")
        console.print(f"[dim] Auto-merge: {automerge}[/dim]")
        console.print(f"[dim] Append mode: {append}[/dim]")
        if max_records is not None:
            console.print(f"[dim] Max records per variable: {max_records:,}[/dim]")
        console.print()

        total_files_created = 0
        total_rows = 0

        for cdf_file in files:
            console.print(f"[bold]Processing:[/bold] {cdf_file.name}")

            # A failure on one file is reported and the loop moves on to the next.
            try:
                results = extract_cdf_to_tabular_file(
                    cdf_file_path=cdf_file,
                    output_dir=output_dir,
                    filename_template=filename,
                    automerge=automerge,
                    append=append,
                    variable_names=variable_list,
                    max_records=max_records,
                    use_parquet=parquet,
                )

                if not results:
                    console.print(
                        "[yellow] No data extracted (no suitable variables found)[/yellow]\n"
                    )
                    continue

                # Display results table
                table = Table(show_header=True, box=None, padding=(0, 1))
                table.add_column("Output File", style="cyan")
                table.add_column("Variables", style="yellow")
                table.add_column("Columns", justify="right", style="green")
                table.add_column("Rows", justify="right", style="magenta")
                table.add_column("Size", justify="right", style="dim")

                for result in results:
                    var_display = ", ".join(result.variable_names)
                    # Keep the variables column to at most 40 characters.
                    if len(var_display) > 40:
                        var_display = var_display[:37] + "..."

                    table.add_row(
                        result.output_file.name,
                        var_display,
                        str(result.num_columns),
                        f"{result.num_rows:,}",
                        format_file_size(result.file_size),
                    )

                    total_files_created += 1
                    total_rows += result.num_rows

                console.print(table)
                console.print()

            except ValueError as e:
                console.print(f"[red]Error processing {cdf_file.name}:[/red] {e}\n")
                continue
            except Exception as e:
                console.print(f"[red]Unexpected error processing {cdf_file.name}:[/red] {e}\n")
                continue

        # Final summary
        console.print(f"[bold green]{CHECKMARK} Extraction complete[/bold green]")
        console.print(f"[dim] Created/updated {total_files_created} {file_format} file(s)[/dim]")
        console.print(f"[dim] Total rows extracted: {total_rows:,}[/dim]")
        console.print(f"[dim] Output directory: {output_dir.absolute()}[/dim]")

    except Exception as e:
        # Anything raised outside the per-file handling ends the command.
        console.print(f"[red]Error:[/red] {e}")
        raise click.Abort() from e