Coverage for src / crump / cli_extract.py: 86%

168 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-11 14:40 +0000

1"""Extract command for converting CDF files to CSV.""" 

2 

3from __future__ import annotations 

4 

5from pathlib import Path 

6 

7import click 

8from rich.console import Console 

9from rich.table import Table 

10 

11from crump.cdf_extractor import extract_cdf_to_tabular_file, extract_cdf_with_config 

12from crump.config import CrumpConfig 

13from crump.console_utils import CHECKMARK 

14 

15console = Console() 

16 

17 

def format_file_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable size string.

    The value is divided by 1024 until it fits the current unit and is then
    rendered with one decimal place (for example ``"1.5 KB"``).

    Args:
        size_bytes: File size in bytes.

    Returns:
        The size with one decimal place followed by a unit suffix (B, KB, MB,
        GB or TB). Anything of 1024 GB or more is reported in TB.
    """
    size: float = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        # Compare the value as it will be displayed: 1023.999 KB renders as
        # "1024.0 KB" at one decimal place, so it has to roll over to the next
        # unit ("1.0 MB") instead of being returned here.
        if round(size, 1) < 1024.0:
            return f"{size:.1f} {unit}"
        size /= 1024.0
    return f"{size:.1f} TB"

33 

34 

@click.command()
@click.argument("files", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option(
    "--output-path",
    "-o",
    type=click.Path(path_type=Path),
    default=None,
    help="Output directory for output files (default: current directory)",
)
@click.option(
    "--filename",
    type=str,
    default="[SOURCE_FILE]-[VARIABLE_NAME].csv",
    help="Filename template for output files. Use [SOURCE_FILE] and [VARIABLE_NAME] as placeholders.",
)
@click.option(
    "--automerge/--no-automerge",
    default=True,
    help="Merge variables with the same record count into a single file (default: enabled)",
)
@click.option(
    "--append",
    is_flag=True,
    default=False,
    help="Append to existing files instead of overwriting (default: disabled)",
)
@click.option(
    "--variables",
    "-v",
    multiple=True,
    help="Specific variable names to extract (can be specified multiple times). Default: extract all variables.",
)
@click.option(
    "--max-records",
    type=int,
    default=None,
    help="Maximum number of records to extract per variable (default: extract all records)",
)
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="YAML configuration file for column mapping. Applies same transformations as sync command.",
)
@click.option(
    "--job",
    "-j",
    type=str,
    default=None,
    help="Job name from config file (optional - auto-detected if config contains only one job).",
)
@click.option(
    "--parquet",
    is_flag=True,
    default=False,
    help="Output to Parquet format instead of CSV (default: disabled)",
)
def extract(
    files: tuple[Path, ...],
    output_path: Path | None,
    filename: str,
    automerge: bool,
    append: bool,
    variables: tuple[str, ...],
    max_records: int | None,
    config: Path | None,
    job: str | None,
    parquet: bool,
) -> None:
    """Extract data from CDF files to CSV or Parquet format.

    Reads CDF science data files and extracts variable data into CSV or Parquet files.
    Variables with array data are expanded into multiple columns with sensible names.

    When using --config and --job, the extract command applies the same column mappings
    and transformations that would be used by the sync command, but outputs to CSV/Parquet
    instead of a database.

    Arguments:
        FILES: One or more CDF files to extract data from

    Examples:
        # Extract all variables from a CDF file (raw dump)
        crump extract data.cdf

        # Extract to a specific directory with custom filename
        crump extract data.cdf -o output/ --filename "[SOURCE_FILE]_data.csv"

        # Extract specific variables without auto-merging
        crump extract data.cdf -v epoch -v vectors --no-automerge

        # Extract and append to existing CSV files
        crump extract data1.cdf data2.cdf --append

        # Extract first 100 records from each variable
        crump extract data.cdf --max-records 100

        # Extract multiple files with auto-merge enabled
        crump extract *.cdf -o csv_output/

        # Extract using config file (applies column mappings and transformations)
        crump extract data.cdf -o output/ --config crump_config.yml --job my_job

        # Extract with auto-detected job (when config has only one job)
        crump extract data.cdf -o output/ --config crump_config.yml

        # Extract with config and limited records
        crump extract data.cdf --config crump_config.yml --job my_job --max-records 100

        # Extract with config and append to existing transformed CSV
        crump extract data1.cdf --config crump_config.yml --append
        crump extract data2.cdf --config crump_config.yml --append

        # Extract with config and custom filename template
        crump extract data.cdf --config crump_config.yml --filename "processed_[SOURCE_FILE].csv"

        # Extract with config, specific variables, and custom filename
        crump extract data.cdf --config crump_config.yml -v epoch -v vectors --filename "vectors_[SOURCE_FILE].csv"

        # Extract to Parquet format instead of CSV
        crump extract data.cdf --parquet

        # Extract to Parquet with config
        crump extract data.cdf --config crump_config.yml --parquet
    """
    # NOTE: the docstring above is rendered by Click as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: validate the option combination, adjust the filename template for
    # Parquet output, then dispatch to the config-based or raw extraction
    # helper. Any failure ends in click.Abort after an error line is printed.
    try:
        # --job only makes sense together with --config.
        if job is not None and config is None:
            console.print("[red]Error:[/red] --job requires --config to be specified.")
            raise click.Abort()

        if parquet:
            # Imported lazily: only the Parquet path needs it.
            from crump.file_types import OutputFileType

            # Swap a ".csv" extension on the template for the Parquet one;
            # templates with any other extension are left as given.
            filename_path = Path(filename)
            if filename_path.suffix.lower() == ".csv":
                filename = filename_path.stem + "." + OutputFileType.PARQUET.value

        # Mode 1: config-based extraction (applies column mappings).
        if config is not None:
            return _extract_with_config(
                files,
                output_path,
                config,
                job,
                max_records,
                automerge,
                variables,
                append,
                filename,
                parquet,
            )

        # Mode 2: raw extraction.
        return _extract_raw(
            files, output_path, filename, automerge, append, variables, max_records, parquet
        )

    except click.Abort:
        # click.Abort derives from RuntimeError, so the generic handler below
        # would otherwise catch deliberate aborts (whose message has already
        # been printed) and emit a second, empty "Error:" line.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise click.Abort() from e

201 

202 

def _extract_with_config(
    files: tuple[Path, ...],
    output_path: Path | None,
    config_path: Path,
    job_name: str | None,
    max_records: int | None,
    automerge: bool,
    variables: tuple[str, ...],
    append: bool,
    filename: str,
    parquet: bool,
) -> None:
    """Extract CDF files using config-based column mapping.

    Loads the YAML config, creates the output directory, prints the effective
    settings, then processes each file in turn: the job is resolved per file
    (the file path is passed to the lookup), the file is extracted with that
    job, and a result table is printed. A summary is printed at the end.

    Errors raised while extracting an individual file are reported and the
    loop moves on to the next file; a job that cannot be resolved aborts the
    whole command.

    Args:
        files: CDF files to extract
        output_path: Output directory for output files (current working
            directory when None)
        config_path: Path to YAML config file
        job_name: Job name from config (None to auto-detect if single job)
        max_records: Maximum records to extract
        automerge: Whether to merge variables with same record count
        variables: Specific variable names to extract
        append: Whether to append to existing files
        filename: Filename template for output files
        parquet: Whether to output Parquet format instead of CSV

    Raises:
        click.Abort: If the job lookup returns nothing or raises ValueError.
    """
    # Load configuration (errors here propagate to the caller).
    crump_config = CrumpConfig.from_yaml(config_path)

    # Determine output directory
    output_dir = output_path if output_path else Path.cwd()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Convert variables tuple to list (None if empty)
    variable_list = list(variables) if variables else None

    file_format = "Parquet" if parquet else "CSV"
    # Without --job the job is resolved per file below; show that instead of
    # printing the literal "None".
    job_label = job_name if job_name is not None else "(auto-detect)"

    console.print(
        f"[cyan]Extracting {len(files)} CDF file(s) to {file_format} with config-based mapping...[/cyan]"
    )
    console.print(f"[dim] Config: {config_path.name}[/dim]")
    console.print(f"[dim] Job: {job_label}[/dim]")
    console.print(f"[dim] Output directory: {output_dir}[/dim]")
    console.print(f"[dim] Format: {file_format}[/dim]")
    if variable_list:
        console.print(f"[dim] Variables: {', '.join(variable_list)}[/dim]")
    console.print(f"[dim] Auto-merge: {automerge}[/dim]")
    console.print(f"[dim] Append mode: {append}[/dim]")
    # Only mention the filename template when it differs from the default for
    # the selected output format.
    default_filename = (
        "[SOURCE_FILE]-[VARIABLE_NAME].parquet" if parquet else "[SOURCE_FILE]-[VARIABLE_NAME].csv"
    )
    if filename != default_filename:
        console.print(f"[dim] Filename template: {filename}[/dim]")
    if max_records is not None:
        console.print(f"[dim] Max records: {max_records:,}[/dim]")
    console.print()

    total_files_created = 0
    total_rows = 0

    for cdf_file in files:
        console.print(f"[bold]Processing:[/bold] {cdf_file.name}")

        # Get the specified job or auto-detect if there's only one
        try:
            job_result = crump_config.get_job_or_auto_detect(job_name, filename=cdf_file.as_posix())
            if not job_result:
                if job_name:
                    available_jobs = ", ".join(crump_config.jobs.keys())
                    console.print(f"[red]Error:[/red] Job '{job_name}' not found in config")
                    console.print(f"[dim]Available jobs: {available_jobs}[/dim]")
                else:
                    console.print("[red]Error:[/red] Config file contains no jobs")
                raise click.Abort()

            crump_job, detected_job_name = job_result

            # Inform user if we auto-detected the job
            if job_name is None:
                console.print(f"[dim]Auto-detected job: {detected_job_name}[/dim]")

        except ValueError as e:
            # Multiple jobs found, need explicit job name
            available_jobs = ", ".join(crump_config.jobs.keys())
            console.print(f"[red]Error:[/red] {e}")
            console.print(f"[dim]Available jobs: {available_jobs}[/dim]")
            raise click.Abort() from e

        try:
            results = extract_cdf_with_config(
                cdf_file_path=cdf_file,
                output_dir=output_dir,
                job=crump_job,
                max_records=max_records,
                automerge=automerge,
                variable_names=variable_list,
                append=append,
                filename_template=filename,
                use_parquet=parquet,
            )

            if not results:
                console.print(
                    "[yellow] No matching data found - column mappings don't match any extracted data[/yellow]\n"
                )
                continue

            # Display results for each transformed file
            table = Table(show_header=True, box=None, padding=(0, 1))
            table.add_column("Output File", style="cyan")
            table.add_column("Variables", style="yellow")
            table.add_column("Columns", justify="right", style="green")
            table.add_column("Rows", justify="right", style="magenta")
            table.add_column("Size", justify="right", style="dim")

            for result in results:
                # Keep the variables column narrow: cap at 40 characters.
                var_display = ", ".join(result.variable_names)
                if len(var_display) > 40:
                    var_display = var_display[:37] + "..."

                table.add_row(
                    result.output_file.name,
                    var_display,
                    str(result.num_columns),
                    f"{result.num_rows:,}",
                    format_file_size(result.file_size),
                )

                total_files_created += 1
                total_rows += result.num_rows

            console.print(table)
            console.print()

        except ValueError as e:
            console.print(f"[red]Error processing {cdf_file.name}:[/red] {e}\n")
            continue
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            console.print(f"[red]Unexpected error processing {cdf_file.name}:[/red] {e}\n")
            continue

    # Final summary
    console.print(f"[bold green]{CHECKMARK} Extraction complete[/bold green]")
    console.print(f"[dim] Created {total_files_created} {file_format} file(s)[/dim]")
    console.print(f"[dim] Total rows extracted: {total_rows:,}[/dim]")
    console.print(f"[dim] Output directory: {output_dir.absolute()}[/dim]")

349 

350 

def _extract_raw(
    files: tuple[Path, ...],
    output_path: Path | None,
    filename: str,
    automerge: bool,
    append: bool,
    variables: tuple[str, ...],
    max_records: int | None,
    parquet: bool,
) -> None:
    """Extract CDF files with raw dump (no config-based column mapping).

    Prints the effective settings, extracts each file in turn, prints a
    result table per file and a final summary. Errors raised while
    extracting an individual file are reported and the loop moves on to the
    next file.

    Args:
        files: CDF files to extract
        output_path: Output directory for output files (current working
            directory when None); created if it does not exist
        filename: Filename template
        automerge: Whether to merge variables
        append: Whether to append to existing files
        variables: Specific variables to extract
        max_records: Maximum records to extract
        parquet: Whether to output Parquet format instead of CSV

    Raises:
        click.Abort: If anything outside the per-file extraction fails (for
            example, the output directory cannot be created).
    """
    try:
        # Determine output directory and make sure it exists, matching what
        # the config-based extraction path does before writing files.
        output_dir = output_path if output_path else Path.cwd()
        output_dir.mkdir(parents=True, exist_ok=True)

        # Convert variables tuple to list (None if empty)
        variable_list = list(variables) if variables else None

        file_format = "Parquet" if parquet else "CSV"
        console.print(
            f"[cyan]Extracting data from {len(files)} CDF file(s) to {file_format}...[/cyan]"
        )
        if variable_list:
            console.print(f"[dim] Extracting variables: {', '.join(variable_list)}[/dim]")
        console.print(f"[dim] Output directory: {output_dir}[/dim]")
        console.print(f"[dim] Format: {file_format}[/dim]")
        console.print(f"[dim] Auto-merge: {automerge}[/dim]")
        console.print(f"[dim] Append mode: {append}[/dim]")
        if max_records is not None:
            console.print(f"[dim] Max records per variable: {max_records:,}[/dim]")
        console.print()

        total_files_created = 0
        total_rows = 0

        for cdf_file in files:
            console.print(f"[bold]Processing:[/bold] {cdf_file.name}")

            try:
                results = extract_cdf_to_tabular_file(
                    cdf_file_path=cdf_file,
                    output_dir=output_dir,
                    filename_template=filename,
                    automerge=automerge,
                    append=append,
                    variable_names=variable_list,
                    max_records=max_records,
                    use_parquet=parquet,
                )

                if not results:
                    console.print(
                        "[yellow] No data extracted (no suitable variables found)[/yellow]\n"
                    )
                    continue

                # Display results table
                table = Table(show_header=True, box=None, padding=(0, 1))
                table.add_column("Output File", style="cyan")
                table.add_column("Variables", style="yellow")
                table.add_column("Columns", justify="right", style="green")
                table.add_column("Rows", justify="right", style="magenta")
                table.add_column("Size", justify="right", style="dim")

                for result in results:
                    # Keep the variables column narrow: cap at 40 characters.
                    var_display = ", ".join(result.variable_names)
                    if len(var_display) > 40:
                        var_display = var_display[:37] + "..."

                    table.add_row(
                        result.output_file.name,
                        var_display,
                        str(result.num_columns),
                        f"{result.num_rows:,}",
                        format_file_size(result.file_size),
                    )

                    total_files_created += 1
                    total_rows += result.num_rows

                console.print(table)
                console.print()

            except ValueError as e:
                console.print(f"[red]Error processing {cdf_file.name}:[/red] {e}\n")
                continue
            except Exception as e:
                # Best effort: report the failure and carry on with the next file.
                console.print(f"[red]Unexpected error processing {cdf_file.name}:[/red] {e}\n")
                continue

        # Final summary
        console.print(f"[bold green]{CHECKMARK} Extraction complete[/bold green]")
        console.print(f"[dim] Created/updated {total_files_created} {file_format} file(s)[/dim]")
        console.print(f"[dim] Total rows extracted: {total_rows:,}[/dim]")
        console.print(f"[dim] Output directory: {output_dir.absolute()}[/dim]")

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise click.Abort() from e