Quick Start¶
Get up and running with PyCharter in 5 minutes.
Your First Validation¶
The simplest way to use PyCharter is to validate data against a JSON schema:
from pycharter import from_dict, validate
# Define your schema
schema = {
"type": "object",
"version": "1.0.0",
"properties": {
"name": {"type": "string", "minLength": 1},
"email": {"type": "string", "format": "email"},
"age": {"type": "integer", "minimum": 0}
},
"required": ["name", "email"]
}
# Generate a Pydantic model
User = from_dict(schema, "User")
# Validate data
result = validate(User, {
"name": "Alice",
"email": "alice@example.com",
"age": 30
})
print(f"Valid: {result.is_valid}") # True
print(f"Data: {result.data}") # User(name='Alice', email='alice@example.com', age=30)
Using the Validator Class¶
For production use with many validations, use the Validator class. It generates the Pydantic model once and caches it, so repeated and batch validations stay fast.
from pycharter import Validator
# From a directory (looks for schema.yaml, coercion_rules.yaml, validation_rules.yaml)
validator = Validator.from_dir("contracts/user/")
# Or from a single contract file, explicit files, or a dict
validator = Validator.from_file("user_contract.yaml")
validator = Validator.from_files(schema="schemas/user.yaml", coercion_rules="rules/coerce.yaml")
validator = Validator.from_dict(schema={...}, coercion_rules={...}, validation_rules={...})
# Validate one record
result = validator.validate({"name": "Alice", "email": "alice@example.com"})
print(result.is_valid) # True
print(result.data) # User(name='Alice', email='alice@example.com')
print(result.errors) # [] when valid
# Invalid data
result = validator.validate({"name": "X"}) # missing required "email"
print(result.is_valid) # False
print(result.errors) # [{'loc': ('email',), 'msg': 'Field required', ...}]
# Batch validation (efficient — model cached across all calls)
results = validator.validate_batch([
{"name": "Alice", "email": "alice@example.com"},
{"name": "Bob", "email": "bob@example.com"},
{"name": "Charlie", "email": "charlie@example.com"},
])
valid_count = sum(1 for r in results if r.is_valid)
print(f"Valid: {valid_count}/{len(results)}")
Using ValidatorBuilder (Fluent API)¶
ValidatorBuilder is a chainable interface for configuring validators with quality checks, state-specific rules, and metrics tracking:
from pycharter import ValidatorBuilder
validator = (
ValidatorBuilder()
.from_dir("contracts/order")
.with_state_rules({
# Per-lifecycle-state overrides: make fields optional in DRAFT, required in FILLED
"DRAFT": {"optional": ["filled_qty", "fill_price"]},
"FILLED": {"required": ["filled_qty", "fill_price"]},
})
.with_quality_checks(thresholds={"completeness": 0.95, "accuracy": 0.99})
.build()
)
# Standard validation
result = validator.validate({"symbol": "AAPL", "order_qty": 100})
# State-aware validation: rules change based on where the entity is in its lifecycle
draft_result = validator.validate_for_state({"symbol": "AAPL", "order_qty": 100}, "DRAFT")
filled_result = validator.validate_for_state(
{"symbol": "AAPL", "order_qty": 100, "filled_qty": 100, "fill_price": 150.0}, "FILLED"
)
print(draft_result.is_valid) # True — filled_qty/fill_price are optional in DRAFT
print(filled_result.is_valid) # True — filled_qty/fill_price are present and required in FILLED
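validate_for_state(data, state) is especially powerful when combined with a finite-state-machine (FSM) library such as PyStator: pass the entity's current FSM state as the state argument, and the validation rules configured for that state are applied automatically.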
Your First ETL Pipeline¶
Build a simple data pipeline:
import asyncio
from pycharter import Pipeline, HTTPExtractor, FileLoader, Rename, Filter
# Define the pipeline
pipeline = (
Pipeline(HTTPExtractor(url="https://jsonplaceholder.typicode.com/users"))
| Rename({"username": "user_name"}) # Rename fields
| Filter(lambda r: r.get("id", 0) <= 5) # Keep first 5 users
| FileLoader(path="output/users.json", file_format="json")
)
# Run the pipeline
result = asyncio.run(pipeline.run())
print(f"Extracted: {result.rows_extracted}")
print(f"Loaded: {result.rows_loaded}")
print(f"Duration: {result.duration_seconds:.2f}s")
Config-Driven Pipeline¶
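Define pipelines in YAML for easier management. A pipeline is described by extract, transform, and load configs, kept either together in one directory or as separate files.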
Run the config-driven pipeline:
import asyncio
from pycharter import Pipeline
# Load from directory
pipeline = Pipeline.from_config_dir("pipelines/users/")
# Or from explicit files
pipeline = Pipeline.from_config_files(
extract="configs/extract.yaml",
transform="configs/transform.yaml",
load="configs/load.yaml"
)
result = asyncio.run(pipeline.run())
Data Quality Check¶
The fastest way to check data quality is check_quality() — one function, one result:
from pycharter import check_quality
report = check_quality(
contract={"schema": {
"version": "1.0.0",
"properties": {
"name": {"type": "string", "minLength": 1},
"email": {"type": "string", "format": "email"},
"age": {"type": "integer", "minimum": 0}
},
"required": ["name", "email"]
}},
data=[
{"name": "Alice", "email": "alice@example.com", "age": 30},
{"name": "Bob", "email": "invalid-email", "age": -5},
{"name": "Charlie", "email": "charlie@example.com", "age": 25},
],
)
print(f"Quality Score: {report.quality_score.overall_score:.1f}/100")
print(f"Valid: {report.valid_count}/{report.record_count}")
print(f"Violations: {report.violation_count}")
Presets for Common Scenarios¶
Use presets instead of configuring every option:
from pycharter import check_quality, QualityCheckOptions
# Quick one-off check (default)
report = check_quality(contract, data)
# Strict gated check — must pass thresholds before proceeding
report = check_quality(contract, data, options=QualityCheckOptions.strict())
print(f"Passed thresholds: {report.passed}")
# Monitoring — dedup violations, skip unchanged data
report = check_quality(contract, data, options=QualityCheckOptions.monitoring())
Using the QualityCheck Class¶
For store-backed schemas or advanced control, use the QualityCheck class directly:
from pycharter import QualityCheck, QualityThresholds, SQLiteMetadataStore
store = SQLiteMetadataStore("metadata.db")
store.connect()
check = QualityCheck(store=store)
report = check.run(
schema_id="user_schema_v1",
data=records,
thresholds=QualityThresholds(min_overall_score=90.0, max_violation_rate=0.10)
)
print(f"Quality Score: {report.quality_score.overall_score:.1f}/100")
print(f"Passed Thresholds: {report.passed}")
Quick Data Profiling¶
Profile a dataset without a contract:
from pycharter import profile_data
profile = profile_data([
{"name": "Alice", "age": 30},
{"name": "Bob", "age": None},
])
print(f"Records: {profile['record_count']}")
print(f"Completeness: {profile['overall_stats']['completeness']:.1f}%")
print(f"Age nulls: {profile['field_profiles']['age']['null_count']}")
Domain Lifecycle & FSM Integration¶
PyCharter can validate that a data contract's states are aligned with an FSM definition (e.g. PyStator). This is useful to catch drift between your schema's enum values and your state machine's states.
from pycharter import check_state_alignment, validate_lifecycle_binding, get_lifecycle_binding
# Your FSM states (from PyStator or any other source)
fsm_states = {"PENDING", "OPEN", "FILLED", "CANCELLED"}
# Your contract (loaded from YAML or built programmatically)
contract = {
"schema": {
"properties": {
"status": {"type": "string", "enum": ["PENDING", "OPEN", "FILLED"]}
}
},
"metadata": {
"governance_rules": {
"lifecycle": {
"state_machine_name": "order_management",
"machine_version": "1.0.0",
"state_field": "status",
"entity_id_field": "order_id",
}
}
}
}
# Check alignment: are contract enum values ↔ FSM states in sync?
result = check_state_alignment(contract, fsm_states, state_field="status")
print(result.aligned) # False — CANCELLED is in FSM but not in contract
print(result.missing_from_contract) # {"CANCELLED"}
print(result.missing_from_fsm) # set()
# Validate the lifecycle binding structure in metadata
errors = validate_lifecycle_binding(contract["metadata"])
print(errors) # [] if the binding is structurally valid
# Read the binding
binding = get_lifecycle_binding(contract["metadata"])
print(binding.state_machine_name) # "order_management"
print(binding.state_field) # "status"
Start the API Server¶
Run PyCharter as a REST API:
# Start the API server
pycharter api
# Or with custom host/port
pycharter api --host 0.0.0.0 --port 8080
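API endpoints (the URLs below assume the server is listening on port 8002; substitute your own port if you started it with --port):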
- Swagger UI: http://localhost:8002/docs
- ReDoc: http://localhost:8002/redoc
- OpenAPI JSON: http://localhost:8002/openapi.json
Example API call:
curl -X POST http://localhost:8002/api/v1/validation/validate \
-H "Content-Type: application/json" \
-d '{
"schema": {
"type": "object",
"properties": {"name": {"type": "string"}},
"required": ["name"]
},
"data": {"name": "Alice"}
}'
Start the Web UI¶
Launch the interactive web interface:
# Production mode (pre-built)
pycharter ui serve
# Development mode (with hot reload)
pycharter ui dev
Open http://localhost:3000 to access:
- Contract management
- Schema browser
- Validation testing
- Quality dashboards
- ETL monitoring
Next Steps¶
Now that you've seen the basics, dive deeper:
- Core Concepts - Understand data contracts, schemas, and the validation pipeline
- ETL Pipelines Tutorial - Build production-ready data pipelines
- Data Contracts Tutorial - Master validation and coercion
- API Reference - Complete API documentation