From contracts to trusted data
Without clear agreements, data pipelines rely on tribal knowledge and manual checks.
The result is error-prone pipelines, slow feedback, and little governance.
dc43 is a thin wrapper around Spark that enforces contracts and records lineage.
from open_data_contract_standard.model import OpenDataContractStandard
# Contract for the "orders" dataset, version 1.0.0: three typed fields
# plus two expectations.
contract = OpenDataContractStandard(
    name="orders",
    version="1.0.0",
    fields=[
        {"name": "id", "type": "string"},
        {"name": "amount", "type": "double"},
        {"name": "customer_id", "type": "string"},
    ],
    expectations=[
        "amount > 0",
        "customer_id not null",
    ],
)
{
"name": "orders",
"version": "1.0.0",
"fields": [
{"name": "id", "type": "string"},
{"name": "amount", "type": "double"},
{"name": "customer_id", "type": "string"}
],
"expectations": ["amount > 0", "customer_id not null"]
}
# Read orders.json under the contract; the call returns a pair that is
# unpacked into the DataFrame and a status object.
orders_df, status = read_with_contract(
    spark, path="orders.json", contract=contract, dq_client=dq
)
{
"status": "fail",
"violations": [
{"row": 42, "field": "amount", "message": "amount must be > 0"}
]
}
# Manual approach: read the raw JSON, run the schema validation by hand,
# and fail fast when validate_schema reports anything.
df = spark.read.json("orders.json")
errors = validate_schema(df)
if errors:
    # The raise must be indented under the `if`; without the indent the
    # snippet is an IndentationError.
    raise ValueError(errors)
# Join each order to its customer on customer_id, then derive `total`
# as amount * 1.2.
enriched = (
    orders_df.join(customers_df, "customer_id")
    .withColumn("total", orders_df.amount * 1.2)
)
[
{"id": "1", "total": 12.0},
{"id": "2", "total": -6.0}
]
The negative total for id 2 comes from a negative amount; it will trigger the contract checks when the data is written.
# Write the enriched data to out/orders under the contract. The call
# returns three values: a result, a status, and a draft (a draft is
# requested on mismatch via draft_on_mismatch=True).
result, status, draft = write_with_contract(
    enriched,
    contract=contract,
    path="out/orders",
    dq_client=dq,
    draft_on_mismatch=True,
)
{
"metrics": {"row_count": 2, "negative_total": 1},
"draft": {
"version": "1.1.0",
"changes": ["allow negative total"]
}
}
# Manual alternative: count all rows, then the rows whose total is negative.
row_count = enriched.count()
negative_rows = enriched.filter("total < 0")
negatives = negative_rows.count()
# Rebind status to the result of attach_failed_expectations, asking it to
# collect examples (collect_examples=True).
status = attach_failed_expectations(
    enriched, contract, status, collect_examples=True
)
[
{
"expectation": "amount > 0",
"examples": [{"id": "2", "amount": -5.0}]
}
]
# Describe this run of orders_enriched (version 1) with its status and
# metrics, add it to the record list, then hand the list to save_records.
orders_record = DatasetRecord(
    name="orders_enriched",
    version=1,
    status=status.status,
    metrics=result.metrics,
)
records.append(orders_record)
save_records(records)
[
{"version": 1, "row_count": 2, "status": "fail"}
]
# v1.0.0
{"fields": [{"name": "amount", "type": "double"}]}
# v1.1.0
{"fields": [{"name": "amount", "type": "double", "nullable": true}]}
pip install dc43 → build your first contract today.