Metadata-Version: 2.4
Name: lakelogic
Version: 0.5.0
Summary: A Python-based data contract runtime for consistent quality across engines.
Author-email: LakeLogic Team <lakelogic@gmail.com>
License: Apache-2.0
License-File: LICENSE
Keywords: data-contracts,data-engineering,data-governance,data-pipeline,data-quality,delta-lake,duckdb,etl,lakehouse,lineage,medallion-architecture,polars,quarantine,schema-validation,spark
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Database
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Software Development :: Quality Assurance
Classifier: Typing :: Typed
Requires-Python: >=3.9
Requires-Dist: httpx>=0.27.0
Requires-Dist: loguru>=0.7.0
Requires-Dist: polars>=0.20.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: sqlglot>=20.0.0
Requires-Dist: typer>=0.9.0
Provides-Extra: all
Requires-Dist: apprise>=1.7.0; extra == 'all'
Requires-Dist: azure-eventgrid>=4.17.0; extra == 'all'
Requires-Dist: azure-identity>=1.15.0; extra == 'all'
Requires-Dist: azure-keyvault-secrets>=4.7.0; extra == 'all'
Requires-Dist: azure-servicebus>=7.11.0; extra == 'all'
Requires-Dist: azure-storage-blob>=12.19.0; extra == 'all'
Requires-Dist: boto3>=1.28.0; extra == 'all'
Requires-Dist: bytewax>=0.19.0; extra == 'all'
Requires-Dist: cryptography>=41.0.0; extra == 'all'
Requires-Dist: databricks-sdk>=0.18.0; extra == 'all'
Requires-Dist: dataprofiler>=0.9.0; extra == 'all'
Requires-Dist: deltalake>=0.15.0; extra == 'all'
Requires-Dist: duckdb>=0.9.0; extra == 'all'
Requires-Dist: google-cloud-bigquery>=3.11.0; extra == 'all'
Requires-Dist: google-cloud-pubsub>=2.18.0; extra == 'all'
Requires-Dist: google-cloud-secret-manager>=2.16.0; extra == 'all'
Requires-Dist: google-cloud-storage>=2.10.0; extra == 'all'
Requires-Dist: hvac>=2.0.0; extra == 'all'
Requires-Dist: jinja2>=3.1.0; extra == 'all'
Requires-Dist: kafka-python>=2.0.2; extra == 'all'
Requires-Dist: lxml>=4.9.0; extra == 'all'
Requires-Dist: nbclient>=0.9.0; extra == 'all'
Requires-Dist: nbformat>=5.9.0; extra == 'all'
Requires-Dist: openpyxl>=3.1.0; extra == 'all'
Requires-Dist: pandas>=2.0.0; extra == 'all'
Requires-Dist: paramiko>=3.4.0; extra == 'all'
Requires-Dist: polars>=0.20.0; extra == 'all'
Requires-Dist: presidio-analyzer>=2.2.0; extra == 'all'
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
Requires-Dist: pyarrow>=14.0.0; extra == 'all'
Requires-Dist: pymongo>=4.6.0; extra == 'all'
Requires-Dist: pymysql>=1.1.0; extra == 'all'
Requires-Dist: pyodbc>=5.0.0; extra == 'all'
Requires-Dist: pyspark>=3.3.0; extra == 'all'
Requires-Dist: requests>=2.31.0; extra == 'all'
Requires-Dist: snowflake-connector-python>=3.5.0; extra == 'all'
Requires-Dist: sseclient-py>=1.8.0; extra == 'all'
Requires-Dist: websocket-client>=1.6.0; extra == 'all'
Provides-Extra: api
Requires-Dist: requests>=2.31.0; extra == 'api'
Provides-Extra: aws-messaging
Requires-Dist: boto3>=1.28.0; extra == 'aws-messaging'
Provides-Extra: azure-messaging
Requires-Dist: azure-eventgrid>=4.17.0; extra == 'azure-messaging'
Requires-Dist: azure-identity>=1.15.0; extra == 'azure-messaging'
Requires-Dist: azure-servicebus>=7.11.0; extra == 'azure-messaging'
Provides-Extra: azuresql
Requires-Dist: azure-identity>=1.15.0; extra == 'azuresql'
Requires-Dist: pyodbc>=5.0.0; extra == 'azuresql'
Provides-Extra: bigquery
Requires-Dist: google-cloud-bigquery>=3.11.0; extra == 'bigquery'
Provides-Extra: bytewax
Requires-Dist: bytewax>=0.19.0; extra == 'bytewax'
Provides-Extra: databases
Requires-Dist: azure-identity>=1.15.0; extra == 'databases'
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
Requires-Dist: pymongo>=4.6.0; extra == 'databases'
Requires-Dist: pymysql>=1.1.0; extra == 'databases'
Requires-Dist: pyodbc>=5.0.0; extra == 'databases'
Provides-Extra: delta
Requires-Dist: azure-identity>=1.15.0; extra == 'delta'
Requires-Dist: azure-storage-blob>=12.19.0; extra == 'delta'
Requires-Dist: boto3>=1.28.0; extra == 'delta'
Requires-Dist: databricks-sdk>=0.18.0; extra == 'delta'
Requires-Dist: deltalake>=0.15.0; extra == 'delta'
Requires-Dist: google-cloud-storage>=2.10.0; extra == 'delta'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: hypothesis>=6.100.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: docs
Requires-Dist: mkdocs-jupyter>=0.24.0; extra == 'docs'
Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
Requires-Dist: mkdocstrings[python]>=0.20.0; extra == 'docs'
Provides-Extra: duckdb
Requires-Dist: deltalake>=0.15.0; extra == 'duckdb'
Requires-Dist: duckdb>=0.9.0; extra == 'duckdb'
Requires-Dist: lxml>=4.9.0; extra == 'duckdb'
Requires-Dist: openpyxl>=3.1.0; extra == 'duckdb'
Requires-Dist: pandas>=2.0.0; extra == 'duckdb'
Requires-Dist: pyarrow>=14.0.0; extra == 'duckdb'
Provides-Extra: gcp-messaging
Requires-Dist: google-cloud-pubsub>=2.18.0; extra == 'gcp-messaging'
Provides-Extra: integrations
Requires-Dist: azure-eventgrid>=4.17.0; extra == 'integrations'
Requires-Dist: azure-identity>=1.15.0; extra == 'integrations'
Requires-Dist: azure-servicebus>=7.11.0; extra == 'integrations'
Requires-Dist: boto3>=1.28.0; extra == 'integrations'
Requires-Dist: google-cloud-pubsub>=2.18.0; extra == 'integrations'
Requires-Dist: paramiko>=3.4.0; extra == 'integrations'
Requires-Dist: requests>=2.31.0; extra == 'integrations'
Provides-Extra: kafka
Requires-Dist: kafka-python>=2.0.2; extra == 'kafka'
Provides-Extra: mongodb
Requires-Dist: pymongo>=4.6.0; extra == 'mongodb'
Provides-Extra: mysql
Requires-Dist: pymysql>=1.1.0; extra == 'mysql'
Provides-Extra: notebook
Requires-Dist: nbclient>=0.9.0; extra == 'notebook'
Requires-Dist: nbformat>=5.9.0; extra == 'notebook'
Provides-Extra: notifications
Requires-Dist: apprise>=1.7.0; extra == 'notifications'
Requires-Dist: azure-identity>=1.15.0; extra == 'notifications'
Requires-Dist: azure-keyvault-secrets>=4.7.0; extra == 'notifications'
Requires-Dist: boto3>=1.28.0; extra == 'notifications'
Requires-Dist: cryptography>=41.0.0; extra == 'notifications'
Requires-Dist: google-cloud-secret-manager>=2.16.0; extra == 'notifications'
Requires-Dist: hvac>=2.0.0; extra == 'notifications'
Requires-Dist: jinja2>=3.1.0; extra == 'notifications'
Provides-Extra: pandas
Requires-Dist: deltalake>=0.15.0; extra == 'pandas'
Requires-Dist: duckdb>=0.9.0; extra == 'pandas'
Requires-Dist: lxml>=4.9.0; extra == 'pandas'
Requires-Dist: openpyxl>=3.1.0; extra == 'pandas'
Requires-Dist: pandas>=2.0.0; extra == 'pandas'
Provides-Extra: pathway
Requires-Dist: pathway>=0.7.0; (python_version >= '3.10') and extra == 'pathway'
Provides-Extra: polars
Requires-Dist: deltalake>=0.15.0; extra == 'polars'
Requires-Dist: lxml>=4.9.0; extra == 'polars'
Requires-Dist: openpyxl>=3.1.0; extra == 'polars'
Requires-Dist: polars>=0.20.0; extra == 'polars'
Provides-Extra: postgresql
Requires-Dist: azure-identity>=1.15.0; extra == 'postgresql'
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgresql'
Provides-Extra: profiling
Requires-Dist: dataprofiler>=0.9.0; extra == 'profiling'
Requires-Dist: presidio-analyzer>=2.2.0; extra == 'profiling'
Provides-Extra: sftp
Requires-Dist: paramiko>=3.4.0; extra == 'sftp'
Provides-Extra: snowflake
Requires-Dist: snowflake-connector-python>=3.5.0; extra == 'snowflake'
Provides-Extra: spark
Requires-Dist: pyspark>=3.3.0; extra == 'spark'
Provides-Extra: sse
Requires-Dist: sseclient-py>=1.8.0; extra == 'sse'
Provides-Extra: streaming
Requires-Dist: bytewax>=0.19.0; extra == 'streaming'
Requires-Dist: kafka-python>=2.0.2; extra == 'streaming'
Requires-Dist: pathway>=0.7.0; (python_version >= '3.10') and extra == 'streaming'
Requires-Dist: sseclient-py>=1.8.0; extra == 'streaming'
Requires-Dist: websocket-client>=1.6.0; extra == 'streaming'
Provides-Extra: websocket
Requires-Dist: websocket-client>=1.6.0; extra == 'websocket'
Description-Content-Type: text/markdown

# LakeLogic

**Your data pipeline breaks silently. LakeLogic catches it.**

One YAML contract. Any engine. Every row validated, quarantined, or promoted — automatically.

[![CI](https://github.com/lakelogic/LakeLogic/actions/workflows/ci-gate.yml/badge.svg)](https://github.com/lakelogic/LakeLogic/actions/workflows/ci-gate.yml)
[![Documentation](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://lakelogic.github.io/LakeLogic/)
[![PyPI](https://img.shields.io/pypi/v/lakelogic?logo=pypi&logoColor=white)](https://pypi.org/project/lakelogic/)
[![Python](https://img.shields.io/badge/python-3.9+-blue?logo=python&logoColor=white)](https://www.python.org)
[![License](https://img.shields.io/badge/license-Apache%202.0-green)](https://github.com/lakelogic/LakeLogic/blob/main/LICENSE)
[![Downloads](https://img.shields.io/pypi/dm/lakelogic?logo=pypi&logoColor=white)](https://pypi.org/project/lakelogic/)

---

## The Problem

You write quality checks in Spark. Then you need to run locally with Polars. Now you're maintaining two codebases. Your bronze layer has no validation. Your silver layer silently drops rows. Nobody knows which records failed or why.

## The Solution

```yaml
# contract.yaml — this is your entire quality gate
version: "1.0"
info:
  title: Silver Customers
  owner: data-team
model:
  fields:
    - name: customer_id
      type: integer
      required: true
    - name: email
      type: string
    - name: revenue
      type: float
    - name: status
      type: string
source:
  type: landing
  path: "data/customers/*.csv"
  load_mode: incremental
quality:
  row_rules:
    - sql: "customer_id IS NOT NULL AND email IS NOT NULL"
    - sql: "status IN ('active', 'churned', 'pending')"
    - sql: "revenue >= 0"
    - sql: "email LIKE '%@%.%'"
materialization:
  strategy: merge
  target_path: "silver/customers"
  format: parquet
  merge_keys: [customer_id]
quarantine:
  enabled: true
  target: "quarantine/customers"
```

```python
from lakelogic import DataProcessor

result = DataProcessor("contract.yaml").run_source()

print(f"✅ Valid: {len(result.good)}  |  ❌ Quarantined: {len(result.bad)}")
```

**Same contract runs on Polars, Spark, DuckDB, or Pandas.**
Zero code changes.

---

## Install

```bash
pip install lakelogic                    # Core + Polars
pip install "lakelogic[spark]"           # + PySpark
pip install "lakelogic[delta]"           # + Delta Lake (Spark-free)
pip install "lakelogic[notifications]"   # + Apprise + Jinja2 alerts
pip install "lakelogic[all]"             # Everything
```

## What You Get

### 🔒 Schema & Quality Gate
Define fields, types, required constraints, and SQL-based rules in YAML. Bad rows are quarantined with tagged error reasons — never silently dropped.

### 🔄 Engine Portability
One contract, four engines. Develop locally on Polars in milliseconds. Deploy to Spark at scale. Same validation semantics everywhere.

### 📊 Declarative Transformations
Rename, derive, deduplicate, pivot, unpivot, bucket, join, filter, JSON extract, date range explode — all in YAML, all engine-agnostic.

### 🔗 Automatic Lineage
Every row is stamped with `_lakelogic_source`, `_lakelogic_processed_at`, and `_lakelogic_run_id`. Upstream lineage columns are preserved with `_upstream_*` prefix across layers.

### 📦 Incremental Processing
Watermark-based incremental loads, file-mtime tracking, run logs, and CDC support. Process only what's new.

### 🔔 Notifications
Slack, Teams, Email, Discord, and [90+ channels](https://github.com/caronc/apprise/wiki) via Apprise. Built-in Jinja2 templates per event. Just add a `target` URL.

### 🏗️ Materialization
Write validated data to CSV, Parquet, Delta Lake, or Unity Catalog tables. Supports append, overwrite, merge, and SCD2 strategies.

### 🧪 Synthetic Data
Generate realistic test data from any contract: `lakelogic generate --contract contract.yaml --rows 1000`

### 🔌 dbt Import
Already using dbt? Convert your `schema.yml` in one command: `lakelogic import-dbt --schema models/schema.yml --output contracts/`

---

## Quick Start (5 Minutes)

### 1. Bootstrap a contract from your data

```bash
lakelogic bootstrap --landing data/ --output contracts/
```

This scans your files, infers schemas, detects PII, and generates ready-to-use contracts.

### 2. Run the quality gate

```bash
lakelogic run --contract contracts/customers.yaml --source data/customers.csv
```

### 3. See the results

```
✅ Good records: 847 → output/customers_good.parquet
❌ Quarantined:  23  → output/customers_quarantine.parquet
📊 Quality score: 97.4%
```

### 4. Check your environment

```bash
lakelogic doctor
```

```
LakeLogic Doctor
═══════════════════════════════════════
  Version     : 0.5.0
  Python      : 3.11.7
  OS          : Windows 11

  Engines
  ───────
  ✅ polars    1.18.0
  ✅ duckdb    1.1.3
  ✅ pandas    2.2.1
  ⬚  pyspark  not installed

  Extras
  ──────
  ✅ deltalake  0.22.3
  ✅ jinja2     3.1.4
  ✅ apprise    1.9.0
  ⬚  dataprofiler  not installed
═══════════════════════════════════════
```

---

## Architecture

```text
┌──────────────────────────────────────────────────────────────────┐
│                         Contract YAML                           │
│  schema · SQL quality rules · transforms · lineage · target     │
└────────────────────────────┬─────────────────────────────────────┘
                             │
                      ┌──────▼──────┐
                      │DataProcessor│
                      └──────┬──────┘
                             │
        ┌────────────┬───────┼───────┬────────────┐
        ▼            ▼       ▼       ▼            │
   ┌────────┐  ┌────────┐ ┌───────┐ ┌────────┐   │
   │ Polars │  │ Spark  │ │DuckDB │ │ Pandas │   │
   └───┬────┘  └───┬────┘ └──┬────┘ └───┬────┘   │
       │           │         │          │         │
       └───────────┴────┬────┴──────────┘         │
                        │                         │
               ┌────────▼────────┐                │
               │ Validated Data  │                │
               │  ┌────┐ ┌─────┐ │                │
               │  │Good│ │ Bad │ │                │
               │  └──┬─┘ └──┬──┘ │                │
               └─────┼──────┼────┘                │
                     │      │                     │
               ┌─────▼┐  ┌──▼────────┐            │
               │Target│  │Quarantine │            │
               └──────┘  └───────────┘            │
```

## Explore the Examples

The [`examples/`](https://github.com/lakelogic/LakeLogic/tree/main/examples) directory contains 24 runnable notebooks:

| Category | What You'll Learn |
|:---|:---|
| **Quickstart** | Your first contract in 5 minutes, database governance, dbt+PII |
| **Core Patterns** | Medallion architecture, bronze quality gates, SCD2, deduplication, soft deletes |
| **Advanced** | Insurance ELT, GDPR compliance, late-arriving data, external logic, streaming, synthetic data |
| **Compliance** | HIPAA PII masking |

## Documentation

- **[Full Docs](https://LakeLogic.github.io/LakeLogic)** — Complete guides and API reference
- **[Quickstart](https://LakeLogic.github.io/LakeLogic/quickstart/)** — Get running in 5 minutes
- **[Contract Reference](https://github.com/lakelogic/LakeLogic/blob/main/docs/contract_template.md)** — Full YAML field reference
- **[CLI Reference](https://LakeLogic.github.io/LakeLogic/cli/)** — Command-line usage
- **[Delta Lake Support](https://github.com/lakelogic/LakeLogic/blob/main/docs/delta_lake_support.md)** — Spark-free Delta operations
- **[Streaming](https://github.com/lakelogic/LakeLogic/blob/main/docs/streaming_implementation_complete.md)** — Real-time ingestion

## Contributing

See `CONTRIBUTING.md` to get started, or `docs/installation.md#developer-installation` for environment setup.

---

## License

Apache-2.0
