Skip to content

Quick Start

Get up and running with PyCharter in 5 minutes.

Your First Validation

The simplest way to use PyCharter is to validate data against a JSON schema:

from pycharter import from_dict, validate

# Define your schema
schema = {
    "type": "object",
    "version": "1.0.0",
    "properties": {
        "name": {"type": "string", "minLength": 1},
        "email": {"type": "string", "format": "email"},
        "age": {"type": "integer", "minimum": 0}
    },
    "required": ["name", "email"]
}

# Generate a Pydantic model
User = from_dict(schema, "User")

# Validate data
result = validate(User, {
    "name": "Alice",
    "email": "alice@example.com",
    "age": 30
})

print(f"Valid: {result.is_valid}")  # True
print(f"Data: {result.data}")       # User(name='Alice', email='alice@example.com', age=30)

Using the Validator Class

For production use, where the same contract is validated many times, use the Validator class. It generates the Pydantic model once and caches it, so large batches stay fast.

from pycharter import Validator

# From a directory (looks for schema.yaml, coercion_rules.yaml, validation_rules.yaml)
validator = Validator.from_dir("contracts/user/")

# Or from a single contract file, explicit files, or a dict
validator = Validator.from_file("user_contract.yaml")
validator = Validator.from_files(schema="schemas/user.yaml", coercion_rules="rules/coerce.yaml")
validator = Validator.from_dict(schema={...}, coercion_rules={...}, validation_rules={...})

# Validate one record
result = validator.validate({"name": "Alice", "email": "alice@example.com"})
print(result.is_valid)   # True
print(result.data)       # User(name='Alice', email='alice@example.com')
print(result.errors)     # [] when valid

# Invalid data
result = validator.validate({"name": "X"})  # missing required "email"
print(result.is_valid)   # False
print(result.errors)     # [{'loc': ('email',), 'msg': 'Field required', ...}]

# Batch validation (efficient — model cached across all calls)
results = validator.validate_batch([
    {"name": "Alice", "email": "alice@example.com"},
    {"name": "Bob", "email": "bob@example.com"},
    {"name": "Charlie", "email": "charlie@example.com"},
])

valid_count = sum(1 for r in results if r.is_valid)
print(f"Valid: {valid_count}/{len(results)}")

Using ValidatorBuilder (Fluent API)

ValidatorBuilder is a chainable interface for configuring validators with quality checks, state-specific rules, and metrics tracking:

from pycharter import ValidatorBuilder

validator = (
    ValidatorBuilder()
    .from_dir("contracts/order")
    .with_state_rules({
        # Per-lifecycle-state overrides: make fields optional in DRAFT, required in FILLED
        "DRAFT":  {"optional": ["filled_qty", "fill_price"]},
        "FILLED": {"required": ["filled_qty", "fill_price"]},
    })
    .with_quality_checks(thresholds={"completeness": 0.95, "accuracy": 0.99})
    .build()
)

# Standard validation
result = validator.validate({"symbol": "AAPL", "order_qty": 100})

# State-aware validation: rules change based on where the entity is in its lifecycle
draft_result  = validator.validate_for_state({"symbol": "AAPL", "order_qty": 100}, "DRAFT")
filled_result = validator.validate_for_state(
    {"symbol": "AAPL", "order_qty": 100, "filled_qty": 100, "fill_price": 150.0}, "FILLED"
)
print(draft_result.is_valid)   # True — filled_qty/fill_price are optional in DRAFT
print(filled_result.is_valid)  # True — filled_qty/fill_price are present and required in FILLED

validate_for_state(data, state) is especially useful in combination with PyStator: pass the state machine's current state and the matching validation rules are applied automatically.

Your First ETL Pipeline

Build a simple data pipeline:

import asyncio
from pycharter import Pipeline, HTTPExtractor, FileLoader, Rename, Filter

# Define the pipeline
pipeline = (
    Pipeline(HTTPExtractor(url="https://jsonplaceholder.typicode.com/users"))
    | Rename({"username": "user_name"})  # Rename fields
    | Filter(lambda r: r.get("id", 0) <= 5)  # Keep first 5 users
    | FileLoader(path="output/users.json", file_format="json")
)

# Run the pipeline
result = asyncio.run(pipeline.run())

print(f"Extracted: {result.rows_extracted}")
print(f"Loaded: {result.rows_loaded}")
print(f"Duration: {result.duration_seconds:.2f}s")

Config-Driven Pipeline

Define pipelines in YAML for easier management:

# Extract config (e.g. extract.yaml)
type: http
url: https://jsonplaceholder.typicode.com/users
method: GET

# Transform config (e.g. transform.yaml)
rename:
  username: user_name
  email: contact_email

add:
  processed_at: "now()"

drop:
  - website
  - phone

# Load config (e.g. load.yaml)
type: file
file_path: output/users.json
format: json

Run the config-driven pipeline:

import asyncio
from pycharter import Pipeline

# Load from directory
pipeline = Pipeline.from_config_dir("pipelines/users/")

# Or from explicit files
pipeline = Pipeline.from_config_files(
    extract="configs/extract.yaml",
    transform="configs/transform.yaml",
    load="configs/load.yaml"
)

result = asyncio.run(pipeline.run())

Data Quality Check

The fastest way to check data quality is check_quality() — one function, one result:

from pycharter import check_quality

report = check_quality(
    contract={"schema": {
        "version": "1.0.0",
        "properties": {
            "name": {"type": "string", "minLength": 1},
            "email": {"type": "string", "format": "email"},
            "age": {"type": "integer", "minimum": 0}
        },
        "required": ["name", "email"]
    }},
    data=[
        {"name": "Alice", "email": "alice@example.com", "age": 30},
        {"name": "Bob", "email": "invalid-email", "age": -5},
        {"name": "Charlie", "email": "charlie@example.com", "age": 25},
    ],
)

print(f"Quality Score: {report.quality_score.overall_score:.1f}/100")
print(f"Valid: {report.valid_count}/{report.record_count}")
print(f"Violations: {report.violation_count}")

Presets for Common Scenarios

Use presets instead of configuring every option:

from pycharter import check_quality, QualityCheckOptions

# Quick one-off check (default)
report = check_quality(contract, data)

# Strict gated check — must pass thresholds before proceeding
report = check_quality(contract, data, options=QualityCheckOptions.strict())
print(f"Passed thresholds: {report.passed}")

# Monitoring — dedup violations, skip unchanged data
report = check_quality(contract, data, options=QualityCheckOptions.monitoring())

Using the QualityCheck Class

For store-backed schemas or advanced control, use the QualityCheck class directly:

from pycharter import QualityCheck, QualityThresholds, SQLiteMetadataStore

store = SQLiteMetadataStore("metadata.db")
store.connect()

check = QualityCheck(store=store)
report = check.run(
    schema_id="user_schema_v1",
    data=records,
    thresholds=QualityThresholds(min_overall_score=90.0, max_violation_rate=0.10)
)

print(f"Quality Score: {report.quality_score.overall_score:.1f}/100")
print(f"Passed Thresholds: {report.passed}")

Quick Data Profiling

Profile a dataset without a contract:

from pycharter import profile_data

profile = profile_data([
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": None},
])

print(f"Records: {profile['record_count']}")
print(f"Completeness: {profile['overall_stats']['completeness']:.1f}%")
print(f"Age nulls: {profile['field_profiles']['age']['null_count']}")

Domain Lifecycle & FSM Integration

PyCharter can check that the state values declared in a data contract match the states of a finite-state machine (FSM) definition, for example one built with PyStator. This is useful for catching drift between your schema's enum values and your state machine's states.

from pycharter import check_state_alignment, validate_lifecycle_binding, get_lifecycle_binding

# Your FSM states (from PyStator or any other source)
fsm_states = {"PENDING", "OPEN", "FILLED", "CANCELLED"}

# Your contract (loaded from YAML or built programmatically)
contract = {
    "schema": {
        "properties": {
            "status": {"type": "string", "enum": ["PENDING", "OPEN", "FILLED"]}
        }
    },
    "metadata": {
        "governance_rules": {
            "lifecycle": {
                "state_machine_name": "order_management",
                "machine_version": "1.0.0",
                "state_field": "status",
                "entity_id_field": "order_id",
            }
        }
    }
}

# Check alignment: are contract enum values ↔ FSM states in sync?
result = check_state_alignment(contract, fsm_states, state_field="status")
print(result.aligned)                  # False — CANCELLED is in FSM but not in contract
print(result.missing_from_contract)    # {"CANCELLED"}
print(result.missing_from_fsm)         # set()

# Validate the lifecycle binding structure in metadata
errors = validate_lifecycle_binding(contract["metadata"])
print(errors)  # [] if the binding is structurally valid

# Read the binding
binding = get_lifecycle_binding(contract["metadata"])
print(binding.state_machine_name)  # "order_management"
print(binding.state_field)         # "status"

Start the API Server

Run PyCharter as a REST API:

# Start the API server
pycharter api

# Or with custom host/port
pycharter api --host 0.0.0.0 --port 8080

API endpoints:

Example API call (adjust the host and port to match the address your server is listening on):

curl -X POST http://localhost:8002/api/v1/validation/validate \
  -H "Content-Type: application/json" \
  -d '{
    "schema": {
      "type": "object",
      "properties": {"name": {"type": "string"}},
      "required": ["name"]
    },
    "data": {"name": "Alice"}
  }'

Start the Web UI

Launch the interactive web interface:

# Production mode (pre-built)
pycharter ui serve

# Development mode (with hot reload)
pycharter ui dev

Open http://localhost:3000 to access:

  • Contract management
  • Schema browser
  • Validation testing
  • Quality dashboards
  • ETL monitoring

Next Steps

Now that you've seen the basics, dive deeper: