Source code for scitex_io._load

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-08-11 05:54:51 (ywatanabe)"
# File: /home/ywatanabe/proj/SciTeX-Code/src/scitex/io/_load.py
# ----------------------------------------
from __future__ import annotations
import os

# Path of this module file, taken from the interpreter-provided `__file__`.
__FILE__ = __file__
# Directory that contains this module file.
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

import glob
from pathlib import Path
from typing import Any, Union

from ._load_cache import (
    cache_data,
    get_cached_data,
    load_npy_cached,
)
from ._registry import get_loader  # noqa: F401


def load(
    lpath: Union[str, Path],
    ext: Union[str, None] = None,
    show: bool = False,
    verbose: bool = False,
    cache: bool = True,
    **kwargs,
) -> Any:
    """
    Load data from a file, dispatching on its extension via the loader registry.

    Parameters
    ----------
    lpath : Union[str, Path]
        Path of the file to load. Any ``os.PathLike`` object is accepted and
        converted to ``str``. If the path contains glob characters (``*``,
        ``?``, ``[``) and no file with that exact name exists, it is expanded
        as a glob pattern and every match is loaded.
    ext : str, optional
        Extension to use for loader lookup instead of the one in the file
        name; a leading dot is optional. Useful for files without extensions
        (e.g., UUID-named files). Examples: 'pdf', 'json', 'csv'.
    show : bool, optional
        Accepted for API compatibility; this function only forwards it to
        the recursive calls made for glob patterns. Default is False.
    verbose : bool, optional
        If True, print diagnostic messages (path conversion, cache hits and
        cache stores). Default is False.
    cache : bool, optional
        If True, consult the cache before loading and store the result
        afterwards. Default is True.
    **kwargs : dict
        Additional keyword arguments passed to the selected loader.

    Returns
    -------
    object
        The object returned by the selected loader. For a glob pattern, a
        list with one loaded object per matched file, in sorted path order.

    Raises
    ------
    FileNotFoundError
        If the file does not exist, a symlink target is missing, or a glob
        pattern matches nothing.
    ValueError
        If no loader is registered for the extension, or if the loader
        itself raises ``ValueError`` / ``FileNotFoundError``.

    Supported Extensions
    --------------------
    Resolution goes through the loader registry; the list below is carried
    over from the original documentation of this function.

    - Data formats: .csv, .tsv, .xls, .xlsx, .xlsm, .xlsb, .json, .yaml, .yml
    - Scientific: .npy, .npz, .mat, .hdf5, .con
    - ML/DL: .pth, .pt, .cbm, .joblib, .pkl
    - Documents: .txt, .log, .event, .md, .docx, .pdf, .xml
    - Images: .jpg, .png, .tiff, .tif
    - EEG data: .vhdr, .vmrk, .edf, .bdf, .gdf, .cnt, .egi, .eeg, .set
    - Database: .db

    Examples
    --------
    >>> data = load('data.csv')
    >>> image = load('image.png')
    >>> model = load('model.pth')
    >>> # Load file without extension (e.g., UUID PDF)
    >>> pdf = load('f2694ccb-1b6f-4994-add8-5111fd4d52f1', ext='pdf')
    """
    # clean_path is deliberately not used: it breaks relative paths such as
    # ./file.txt.

    # Work with plain strings from here on.
    if isinstance(lpath, os.PathLike):
        lpath = os.fspath(lpath)
        if verbose:
            print(f"[DEBUG] After Path conversion: {lpath}")

    # Treat the path as a glob pattern only when it is not the literal name
    # of an existing entry; otherwise a file such as "data[1].csv" could
    # never be loaded (including when it is itself a glob match below).
    looks_like_glob = "*" in lpath or "?" in lpath or "[" in lpath
    if looks_like_glob and not os.path.lexists(lpath):
        matched_files = sorted(glob.glob(lpath))
        if not matched_files:
            raise FileNotFoundError(f"No files found matching pattern: {lpath}")
        # Forward every option, including ext and cache, so that a pattern
        # behaves exactly like loading each matched file individually.
        return [
            load(
                file_path,
                ext=ext,
                show=show,
                verbose=verbose,
                cache=cache,
                **kwargs,
            )
            for file_path in matched_files
        ]

    lpath = _resolve_load_path(lpath)

    # Try to get from cache first.
    if cache:
        cached_data = get_cached_data(lpath)
        if cached_data is not None:
            if verbose:
                print(f"[Cache HIT] Loaded from cache: {lpath}")
            return cached_data

    detected_ext = _detect_extension(lpath, ext)

    # Numpy files go through their dedicated cached loader.
    if cache and detected_ext in ("npy", "npz"):
        return load_npy_cached(lpath, **kwargs)

    # Registry lookup (normalized with a leading dot).
    loader = get_loader(f".{detected_ext}" if detected_ext else "")
    if loader is None:
        if not detected_ext:
            raise ValueError(
                f"No load handler registered for '{lpath}' "
                f"(no file extension detected). "
                f"Pass ext='...' to choose a loader explicitly."
            )
        raise ValueError(
            f"No load handler registered for '.{detected_ext}'. "
            f"Use register_loader('.{detected_ext}', your_fn) to add one."
        )

    try:
        result = loader(lpath, **kwargs)
    except (ValueError, FileNotFoundError) as e:
        # Callers rely on ValueError here; keep the type, preserve the cause.
        raise ValueError(f"Error loading file {lpath}: {e}") from e

    if cache:
        cache_data(lpath, result)
        if verbose:
            print(f"[Cache STORED] Cached data for: {lpath}")
    return result


def _resolve_load_path(lpath: str) -> str:
    """Return an existing path for ``lpath`` or raise ``FileNotFoundError``."""
    if os.path.exists(lpath):
        return lpath

    # os.path.exists() returns False for broken symlinks, so retry with the
    # link target resolved relative to the symlink's own directory.
    if os.path.islink(lpath):
        symlink_dir = os.path.dirname(os.path.abspath(lpath))
        resolved_target = os.path.abspath(
            os.path.join(symlink_dir, os.readlink(lpath))
        )
        if os.path.exists(resolved_target):
            return resolved_target
        raise FileNotFoundError(f"Symlink target not found: {resolved_target}")

    # Last resort: general path resolution.
    try:
        resolved_path = os.path.realpath(lpath)
    except (OSError, ValueError):
        resolved_path = None
    if resolved_path is not None and os.path.exists(resolved_path):
        return resolved_path
    raise FileNotFoundError(f"File not found: {lpath}")


def _detect_extension(lpath: str, ext: Union[str, None]) -> str:
    """Return the extension (without dot) from ``ext`` or the file name."""
    if ext is not None:
        return ext.lstrip(".")
    # Only the final path component counts: dots in directory names (or the
    # leading "./" of a relative path) are not extensions.
    filename = os.path.basename(lpath)
    return filename.rsplit(".", 1)[-1] if "." in filename else ""
# EOF