Skip to content

Instantly share code, notes, and snippets.

@joaodaher
Last active August 14, 2025 13:19
Show Gist options
  • Save joaodaher/97847c9987ccf7d6ee13ffe76066c5ca to your computer and use it in GitHub Desktop.
Save joaodaher/97847c9987ccf7d6ee13ffe76066c5ca to your computer and use it in GitHub Desktop.
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://warmly.ai/schemas/person-identification-record-4.3.json",
"title": "person-identification-record",
"description": "A complete record for person identity resolution. Each lineage item flows vendor_ingest → normalization → enrichment. Identification selects the best enrichment. The root publish_target declares where to emit the final result.",
"type": "object",
"required": ["id", "lineage", "identification", "publish_target"],
"properties": {
  "id": {
    "description": "Unique ID for this record (e.g., batch/window identifier)",
    "type": "string"
  },
  "lineage": {
    "description": "List of lineage items, each containing vendor_ingest, normalization, and enrichment stages.",
    "type": "array",
    "minItems": 1,
    "items": {
      "$ref": "#/$defs/lineage-item"
    }
  },
  "identification": {
    "$ref": "#/$defs/identification-stage"
  },
  "publish_target": {
    "description": "Destination where the final identity should be published (e.g., GCP Pub/Sub topic).",
    "$ref": "#/$defs/publish-target"
  }
},
"additionalProperties": false,
"$defs": {
"publish-target": {
  "title": "publish-target",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "system": {
      "type": "string",
      "enum": ["gcp_pubsub"]
    },
    "gcp_pubsub": {
      "$ref": "#/$defs/gcp-pubsub-target"
    }
  },
  "required": ["system"],
  "allOf": [
    {
      "if": {
        "required": ["system"],
        "properties": {
          "system": {
            "const": "gcp_pubsub"
          }
        }
      },
      "then": {
        "required": ["gcp_pubsub"]
      }
    }
  ]
},
"gcp-pubsub-target": {
  "title": "gcp-pubsub-target",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "topic": {
      "description": "Pub/Sub topic name (e.g., warmly-best-contact).",
      "type": "string"
    },
    "project_id": {
      "description": "Optional override; if omitted, use service default project.",
      "type": "string"
    }
  },
  "required": ["topic"]
},
"lineage-item": {
  "title": "lineage-item",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "lineage_id": {
      "description": "Unique ID for this lineage item (referenced by identification.metadata.lineage_id)",
      "type": "string"
    },
    "vendor_ingest": {
      "$ref": "#/$defs/vendor-ingest-stage"
    },
    "normalization": {
      "$ref": "#/$defs/normalization-stage"
    },
    "enrichment": {
      "$ref": "#/$defs/enrichment-stage"
    }
  },
  "required": [
    "lineage_id",
    "vendor_ingest",
    "normalization",
    "enrichment"
  ]
},
"vendor-ingest-stage": {
  "title": "vendor-ingest-stage",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "metadata": {
      "$ref": "#/$defs/vendor-ingest-metadata"
    },
    "raw": {
      "description": "Opaque vendor payload as received.",
      "type": "object",
      "additionalProperties": true
    }
  },
  "required": ["raw", "metadata"]
},
"normalization-stage": {
  "title": "normalization-stage",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "content": {
      "$ref": "#/$defs/entity-resolution-input"
    },
    "metadata": {
      "$ref": "#/$defs/normalization-decision"
    }
  },
  "required": ["content", "metadata"]
},
"enrichment-stage": {
  "title": "enrichment-stage",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "content": {
      "$ref": "#/$defs/enriched-person-content"
    },
    "metadata": {
      "$ref": "#/$defs/enrichment-metadata"
    }
  },
  "required": ["content", "metadata"]
},
"vendor-ingest-metadata": {
  "title": "vendor-ingest-metadata",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "vendor": {
      "type": "string"
    },
    "received_at": {
      "type": "string",
      "format": "date-time"
    },
    "correlation_id": {
      "type": "string"
    },
    "hash": {
      "description": "Stable hash of vendor_ingest.raw",
      "type": "string"
    },
    "session_id": {
      "description": "Optional; only for session-scoped pipelines",
      "type": "string"
    },
    "producer": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "service": {
          "type": "string"
        },
        "version": {
          "type": "string"
        },
        "environment": {
          "type": "string"
        }
      }
    }
  },
  "required": ["received_at", "vendor"]
},
"entity-resolution-input": {
  "title": "entity-resolution-input",
  "description": "Compact, strict person input (snake_case).",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "id": {
      "type": "string"
    },
    "organization_id": {
      "type": "string"
    },
    "email": {
      "type": "string",
      "format": "email"
    },
    "linkedin_handle": {
      "description": "Handle, not URL",
      "type": "string"
    },
    "first_name": {
      "type": "string"
    },
    "last_name": {
      "type": "string"
    },
    "full_name": {
      "type": "string"
    },
    "job_title": {
      "type": "string"
    },
    "company_name": {
      "type": "string"
    },
    "company_domain": {
      "description": "e.g., acme.com (no scheme)",
      "type": "string"
    }
  }
},
"normalization-decision": {
  "title": "normalization-decision",
  "description": "Decision recorded in the normalization stage metadata. When state is \"duplicate\", duplicate_of_id must be present and must be a non-null string.",
  "type": "object",
  "required": ["state", "decided_at"],
  "properties": {
    "state": {
      "type": "string",
      "enum": ["need_enrichment", "no_enrichment", "duplicate"]
    },
    "duplicate_of_id": {
      "type": ["string", "null"],
      "description": "May be null or omitted unless state is \"duplicate\", in which case a string value is required."
    },
    "reason": {
      "type": ["string", "null"],
      "maxLength": 300
    },
    "decided_at": {
      "type": "string",
      "format": "date-time"
    }
  },
  "allOf": [
    {
      "if": {
        "properties": { "state": { "const": "duplicate" } },
        "required": ["state"]
      },
      "then": {
        "required": ["duplicate_of_id"],
        "properties": {
          "duplicate_of_id": { "type": "string" }
        }
      }
    }
  ],
  "additionalProperties": false
},
"enriched-person-content": {
  "title": "enriched-person-content",
  "description": "Comprehensive person DTO produced by enrichment (person-focused).",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "id": {
      "type": "string"
    },
    "name": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "full_name": {
          "type": "string"
        },
        "first_name": {
          "type": "string"
        },
        "last_name": {
          "type": "string"
        }
      }
    },
    "email": {
      "type": "string",
      "format": "email"
    },
    "phone": {
      "description": "E.164 recommended",
      "type": "string"
    },
    "location": {
      "type": "string"
    },
    "time_zone": {
      "type": "string"
    },
    "utc_offset": {
      "type": "number"
    },
    "bio": {
      "type": "string"
    },
    "site": {
      "type": "string"
    },
    "avatar": {
      "type": "string"
    },
    "employment": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "domain": {
          "type": "string"
        },
        "name": {
          "type": "string"
        },
        "title": {
          "type": "string"
        },
        "role": {
          "type": "string"
        },
        "sub_role": {
          "type": "string"
        },
        "seniority": {
          "type": "string"
        },
        "refreshed": {
          "type": "boolean"
        }
      }
    },
    "facebook": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "handle": {
          "type": "string"
        },
        "likes": {
          "type": "number"
        }
      }
    },
    "github": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "handle": {
          "type": "string"
        },
        "id": {
          "type": "number"
        },
        "avatar": {
          "type": "string"
        },
        "company": {
          "type": "string"
        },
        "blog": {
          "type": "string"
        },
        "followers": {
          "type": "number"
        },
        "following": {
          "type": "number"
        }
      }
    },
    "twitter": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "handle": {
          "type": "string"
        },
        "id": {
          "type": "number"
        },
        "bio": {
          "type": "string"
        },
        "followers": {
          "type": "number"
        },
        "following": {
          "type": "number"
        },
        "statuses": {
          "type": "number"
        },
        "favorites": {
          "type": "number"
        },
        "location": {
          "type": "string"
        },
        "site": {
          "type": "string"
        },
        "avatar": {
          "type": "string"
        }
      }
    },
    "linkedin": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "handle": {
          "type": "string"
        }
      }
    },
    "email_provider": {
      "type": "boolean"
    },
    "indexed_at": {
      "type": "string",
      "format": "date-time"
    },
    "phone_last_seen_at": {
      "type": "string",
      "format": "date-time"
    },
    "active_at": {
      "type": "string",
      "format": "date-time"
    },
    "inactive_at": {
      "type": "string",
      "format": "date-time"
    }
  }
},
"enrichment-metadata": {
  "title": "enrichment-metadata",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "provider": {
      "type": "string"
    },
    "status": {
      "type": "string",
      "enum": [
        "ok",
        "not_found",
        "rate_limited",
        "error"
      ]
    },
    "confidence": {
      "type": "number",
      "minimum": 0,
      "maximum": 1
    },
    "freshness_days": {
      "type": "integer",
      "minimum": 0
    },
    "score_overall": {
      "type": "number",
      "minimum": 0,
      "maximum": 1
    },
    "provider_request_id": {
      "type": "string"
    },
    "requested_at": {
      "type": "string",
      "format": "date-time"
    },
    "responded_at": {
      "type": "string",
      "format": "date-time"
    },
    "provider_payload_raw": {
      "description": "Opaque provider response as received.",
      "type": "object",
      "additionalProperties": true
    },
    "notes": {
      "type": "string"
    }
  }
},
"identification-stage": {
  "title": "identification-stage",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "metadata": {
      "$ref": "#/$defs/identification-metadata"
    },
    "best": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "content": {
          "$ref": "#/$defs/enriched-person-content"
        }
      },
      "required": ["content"]
    }
  },
  "required": ["best", "metadata"]
},
"identification-metadata": {
  "title": "identification-metadata",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "engine": {
      "type": "string",
      "enum": [
        "rule",
        "llm",
        "hybrid"
      ]
    },
    "lineage_id": {
      "description": "The lineage_id of the winning item.",
      "type": "string"
    },
    "rationale": {
      "type": "string"
    },
    "reasons": {
      "type": "array",
      "items": {
        "type": "object",
        "additionalProperties": false,
        "properties": {
          "code": {
            "type": "string"
          },
          "detail": {
            "type": "string"
          }
        }
      }
    },
    "cost": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "currency": {
          "type": "string"
        },
        "amount": {
          "type": "number"
        },
        "tokens": {
          "type": "integer",
          "minimum": 0
        }
      }
    },
    "timestamp": {
      "type": "string",
      "format": "date-time"
    }
  },
  "required": ["engine", "lineage_id"]
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment