# jecio/release_projector.py

import argparse
import hashlib
import json
import os
import urllib.error
import urllib.request
from datetime import date, datetime, timezone
from typing import Any, Dict, List, Optional
try:
from dotenv import load_dotenv
except Exception:
load_dotenv = None
# Default Spark --packages coordinates pulled at session start: the Iceberg
# Spark runtime + AWS bundle, and the Nessie SQL extensions (all built for
# Spark 3.5 / Scala 2.12). Overridable via the SPARK_PACKAGES env var.
DEFAULT_SPARK_PACKAGES = (
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,"
    "org.apache.iceberg:iceberg-aws-bundle:1.10.1,"
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5"
)
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
def parse_json_maybe(value: Any, expected_type: type, fallback: Any) -> Any:
    """Coerce *value* to *expected_type*, JSON-decoding strings on the way.

    Returns *value* unchanged when it already has the expected type, the
    decoded object when *value* is a JSON string of that type, and
    *fallback* in every other case (None, wrong type, bad JSON).
    """
    if value is None:
        return fallback
    if isinstance(value, expected_type):
        return value
    if not isinstance(value, str):
        return fallback
    try:
        decoded = json.loads(value)
    except Exception:
        return fallback
    return decoded if isinstance(decoded, expected_type) else fallback
def first_str(row: Dict[str, Any], keys: List[str]) -> Optional[str]:
    """Return the first non-blank string found in *row* under *keys*, stripped."""
    for key in keys:
        candidate = row.get(key)
        if not isinstance(candidate, str):
            continue
        stripped = candidate.strip()
        if stripped:
            return stripped
    return None
def to_iso(value: Any) -> Optional[str]:
    """Normalize datetimes, dates, and strings to ISO-8601; None otherwise.

    Bare dates become UTC midnight; strings are merely stripped (assumed to
    already be timestamps). The datetime check must precede the date check
    because datetime subclasses date.
    """
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, date):
        midnight = datetime.combine(value, datetime.min.time(), timezone.utc)
        return midnight.isoformat()
    if isinstance(value, str):
        stripped = value.strip()
        if stripped:
            return stripped
    return None
def make_fingerprint(name: str, kind: Optional[str], external_ids: Dict[str, str]) -> str:
    """Stable SHA-256 over normalized name, kind, and sorted external ids.

    Sorting the external-id items makes the digest independent of dict
    insertion order, so equal concepts always fingerprint identically.
    """
    parts = [
        (name or "").strip().lower(),
        (kind or "").strip().lower(),
        "|".join(f"{k}:{v}".lower() for k, v in sorted(external_ids.items())),
    ]
    return hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
def load_manifest(path: str) -> Dict[str, Any]:
    """Load a release manifest from a JSON file.

    The file may contain the manifest object itself, or a releases_v2 row
    (a dict, or a one-row list of dicts) whose ``manifest_json`` field holds
    the manifest as a JSON string.

    Raises:
        ValueError: if no manifest object can be extracted. (Previously a
            malformed ``manifest_json`` in the list form escaped as a raw
            JSONDecodeError; both forms now fail the same way. Callers
            catching ValueError are unaffected since JSONDecodeError is a
            ValueError subclass.)
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    if isinstance(raw, dict):
        nested = _manifest_from_row(raw)
        # No decodable manifest_json: treat the dict itself as the manifest.
        return nested if nested is not None else raw
    if isinstance(raw, list) and raw and isinstance(raw[0], dict):
        nested = _manifest_from_row(raw[0])
        if nested is not None:
            return nested
    raise ValueError("Manifest file must contain a manifest object or releases_v2 row with manifest_json.")


def _manifest_from_row(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Decode row['manifest_json'] into a dict, or None if absent/invalid."""
    manifest_json = row.get("manifest_json")
    if not isinstance(manifest_json, str):
        return None
    try:
        parsed = json.loads(manifest_json)
    except Exception:
        return None
    return parsed if isinstance(parsed, dict) else None
def infer_manifest_ref(manifest: Dict[str, Any]) -> Optional[str]:
    """Best-effort extraction of the Nessie ref/tag a manifest points at.

    Precedence: nessie.ref.name, nessie.tag, release.name, then the flat
    keys nessie_tag / tag / release_name. Returns None when nothing usable
    is present.
    """
    def _clean(val: Any) -> Optional[str]:
        if isinstance(val, str) and val.strip():
            return val.strip()
        return None

    nessie = manifest.get("nessie")
    if isinstance(nessie, dict):
        ref_obj = nessie.get("ref")
        if isinstance(ref_obj, dict):
            name = _clean(ref_obj.get("name"))
            if name:
                return name
        tag = _clean(nessie.get("tag"))
        if tag:
            return tag
    release = manifest.get("release")
    if isinstance(release, dict):
        name = _clean(release.get("name"))
        if name:
            return name
    for key in ("nessie_tag", "tag", "release_name"):
        flat = _clean(manifest.get(key))
        if flat:
            return flat
    return None
def extract_table_identifiers(manifest: Dict[str, Any]) -> List[str]:
    """Collect table identifiers from manifest['tables'], else manifest['rows'].

    'tables' entries may name the identifier as table_identifier, identifier,
    or table; the 'rows' fallback (used only when 'tables' yields nothing)
    accepts table_identifier only.
    """
    found: List[str] = []
    tables = manifest.get("tables")
    if isinstance(tables, list):
        for entry in tables:
            if not isinstance(entry, dict):
                continue
            ident = entry.get("table_identifier") or entry.get("identifier") or entry.get("table")
            if isinstance(ident, str) and ident.strip():
                found.append(ident.strip())
    if found:
        return found
    rows = manifest.get("rows")
    if isinstance(rows, list):
        for entry in rows:
            if not isinstance(entry, dict):
                continue
            ident = entry.get("table_identifier")
            if isinstance(ident, str) and ident.strip():
                found.append(ident.strip())
    return found
def infer_concept_table(tables: List[str]) -> Optional[str]:
    """Pick the first table whose name mentions 'concept', else the first table."""
    match = next((t for t in tables if "concept" in t.lower()), None)
    if match is not None:
        return match
    return tables[0] if tables else None
def load_manifest_from_registry(
    spark: Any,
    catalog: str,
    release_name: str,
    releases_table: Optional[str] = None,
) -> Dict[str, Any]:
    """Fetch the newest manifest_json for *release_name* from the registry table.

    The registry table defaults to the RELEASES_TABLE env var (db1.releases_v2)
    and is qualified with *catalog* when given as a bare "db.table" name.
    Rows are ordered newest-first by ingested_at_utc (nulls last).

    Raises:
        ValueError: release not found, empty manifest_json, or manifest_json
            not a JSON object.
    """
    from pyspark.sql import functions as F

    table = releases_table or os.getenv("RELEASES_TABLE", "db1.releases_v2")
    if table.count(".") == 1:
        table = f"{catalog}.{table}"
    matches = (
        spark.table(table)
        .where(F.col("release_name") == release_name)
        .orderBy(F.col("ingested_at_utc").desc_nulls_last())
        .select("manifest_json")
        .limit(1)
        .collect()
    )
    if not matches:
        raise ValueError(f"Release '{release_name}' not found in registry table {table}.")
    manifest_json = matches[0]["manifest_json"]
    if not isinstance(manifest_json, str) or not manifest_json.strip():
        raise ValueError(f"Release '{release_name}' has empty manifest_json in {table}.")
    decoded = json.loads(manifest_json)
    if not isinstance(decoded, dict):
        raise ValueError(f"Release '{release_name}' manifest_json is not a JSON object.")
    return decoded
def build_spark(ref: str):
    """Create a SparkSession wired to the Nessie-backed Iceberg catalog at *ref*.

    All endpoints and credentials come from environment variables with
    local-stack defaults (NESSIE_URI, NESSIE_WAREHOUSE, S3_ENDPOINT, etc.).

    Returns:
        A (SparkSession, catalog_name) tuple; the catalog name is taken from
        SPARK_CATALOG (default "lake").

    Raises:
        RuntimeError: if pyspark cannot be imported.
    """
    try:
        from pyspark.sql import SparkSession
    except Exception as e:
        raise RuntimeError(
            "pyspark is not installed. Install it or run this with spark-submit."
        ) from e
    catalog = os.getenv("SPARK_CATALOG", "lake")
    builder = (
        SparkSession.builder.appName("release-projector")
        # Iceberg SQL support plus Nessie branch/tag extensions.
        .config(
            "spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
            "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
        )
        .config("spark.jars.packages", os.getenv("SPARK_PACKAGES", DEFAULT_SPARK_PACKAGES))
        .config(f"spark.sql.catalog.{catalog}", "org.apache.iceberg.spark.SparkCatalog")
        .config(f"spark.sql.catalog.{catalog}.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
        .config(f"spark.sql.catalog.{catalog}.uri", os.getenv("NESSIE_URI", "http://lakehouse-core:19120/api/v2"))
        # Pin every read in this session to the requested Nessie ref/tag.
        .config(f"spark.sql.catalog.{catalog}.ref", ref)
        .config(
            f"spark.sql.catalog.{catalog}.warehouse",
            os.getenv("NESSIE_WAREHOUSE", "s3a://lakehouse/warehouse"),
        )
        .config(f"spark.sql.catalog.{catalog}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
        # S3A settings for a MinIO-style object store (path-style access).
        .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3_ENDPOINT", "http://lakehouse-core:9000"))
        .config("spark.hadoop.fs.s3a.path.style.access", os.getenv("S3_PATH_STYLE", "true"))
        .config(
            "spark.hadoop.fs.s3a.access.key",
            os.getenv("AWS_ACCESS_KEY_ID", os.getenv("MINIO_ROOT_USER", "minioadmin")),
        )
        .config(
            "spark.hadoop.fs.s3a.secret.key",
            os.getenv("AWS_SECRET_ACCESS_KEY", os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")),
        )
    )
    # Optional explicit master (e.g. local[*] or a standalone cluster URL).
    spark_master = os.getenv("SPARK_MASTER")
    if spark_master:
        builder = builder.master(spark_master)
    return builder.getOrCreate(), catalog
def ensure_es_index(es_url: str, es_index: str) -> None:
    """Create the Elasticsearch index with an explicit mapping if it is missing.

    A GET on the index URL probes for existence; only a 404 falls through to
    index creation — any other HTTPError propagates, as do non-HTTP errors
    (e.g. connection refused).
    """
    # Explicit mapping: keyword fields for exact filtering/aggregation,
    # text fields for full-text search.
    mapping = {
        "mappings": {
            "properties": {
                "concept_id": {"type": "keyword"},
                "concept_type": {"type": "keyword"},
                "display_name": {"type": "text"},
                "description": {"type": "text"},
                "text": {"type": "text"},
                "source_table": {"type": "keyword"},
                "source_pk": {"type": "keyword"},
                "release_name": {"type": "keyword"},
                "ref_hash": {"type": "keyword"},
                "attributes_json": {"type": "text"},
                "canonical_name": {"type": "text"},
                "kind": {"type": "keyword"},
                "aliases": {"type": "text"},
                "tags": {"type": "keyword"},
                "summary": {"type": "text"},
                "latest_cid": {"type": "keyword"},
                "fingerprint": {"type": "keyword"},
                "created_at": {"type": "date"},
                "updated_at": {"type": "date"},
            }
        }
    }
    url = f"{es_url.rstrip('/')}/{es_index}"
    req_get = urllib.request.Request(url, method="GET")
    try:
        with urllib.request.urlopen(req_get, timeout=30) as resp:
            if 200 <= resp.status < 300:
                return  # index already exists; nothing to do
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise
    body = json.dumps(mapping).encode("utf-8")
    req_put = urllib.request.Request(url, data=body, method="PUT")
    req_put.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req_put, timeout=30) as resp:
        # NOTE(review): urlopen raises HTTPError for status >= 400, so this
        # check is defensive — it only fires for a non-raising response object.
        if resp.status >= 400:
            raise RuntimeError(f"Failed to create ES index {es_index}: HTTP {resp.status}")
def es_upsert(es_url: str, es_index: str, doc: Dict[str, Any]) -> None:
    """Index *doc* into Elasticsearch keyed by its concept_id (idempotent PUT)."""
    endpoint = f"{es_url.rstrip('/')}/{es_index}/_doc/{doc['concept_id']}"
    # default=str stringifies any non-JSON-native values (e.g. datetimes).
    payload = json.dumps(doc, default=str).encode("utf-8")
    request = urllib.request.Request(endpoint, data=payload, method="PUT")
    request.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(request, timeout=30) as resp:
        if resp.status >= 400:
            raise RuntimeError(f"Failed ES upsert for {doc['concept_id']}: HTTP {resp.status}")
def gremlin_upsert(gremlin_url: str, concept: Dict[str, Any]) -> None:
    """Idempotently upsert one Concept vertex via a Gremlin bound query.

    Uses the fold()/coalesce(unfold(), addV(...)) upsert pattern: match on
    concept_id, create the vertex (with created_at) only when missing, then
    overwrite the mutable properties either way. A fresh client is opened per
    call and always closed.
    """
    # Imported lazily so the module stays importable without gremlinpython;
    # the caller treats ModuleNotFoundError as "skip this backend".
    from gremlin_python.driver import client as gremlin_client
    from gremlin_python.driver.serializer import GraphSONSerializersV3d0
    created_at = concept.get("created_at") or utc_now_iso()
    updated_at = concept.get("updated_at") or utc_now_iso()
    query = """
    g.V().hasLabel('Concept').has('concept_id', concept_id).fold()
      .coalesce(
        unfold(),
        addV('Concept').property('concept_id', concept_id).property('created_at', created_at)
      )
      .property('canonical_name', canonical_name)
      .property('kind', kind)
      .property('concept_type', concept_type)
      .property('display_name', display_name)
      .property('description', description)
      .property('text', text)
      .property('source_table', source_table)
      .property('source_pk', source_pk)
      .property('release_name', release_name)
      .property('ref_hash', ref_hash)
      .property('attributes_json', attributes_json)
      .property('aliases', aliases_json)
      .property('external_ids', external_ids_json)
      .property('tags', tags_json)
      .property('fingerprint', fingerprint)
      .property('latest_cid', latest_cid)
      .property('summary', summary)
      .property('updated_at', updated_at)
      .values('concept_id')
    """
    c = gremlin_client.Client(
        gremlin_url,
        "g",
        message_serializer=GraphSONSerializersV3d0(),
    )
    try:
        # Bindings are passed server-side; collection values are serialized
        # to JSON strings because vertex properties hold scalars.
        c.submit(
            query,
            {
                "concept_id": concept["concept_id"],
                "canonical_name": concept.get("canonical_name") or "",
                "kind": concept.get("kind") or "",
                "concept_type": concept.get("concept_type") or "",
                "display_name": concept.get("display_name") or "",
                "description": concept.get("description") or "",
                "text": concept.get("text") or "",
                "source_table": concept.get("source_table") or "",
                "source_pk": concept.get("source_pk") or "",
                "release_name": concept.get("release_name") or "",
                "ref_hash": concept.get("ref_hash") or "",
                "attributes_json": concept.get("attributes_json") or "{}",
                "aliases_json": json.dumps(concept.get("aliases", []), ensure_ascii=False),
                "external_ids_json": json.dumps(concept.get("external_ids", {}), ensure_ascii=False),
                "tags_json": json.dumps(concept.get("tags", []), ensure_ascii=False),
                "fingerprint": concept["fingerprint"],
                "latest_cid": concept.get("latest_cid") or "",
                "summary": concept.get("summary") or "",
                "created_at": created_at,
                "updated_at": updated_at,
            },
        ).all().result()
    finally:
        c.close()
def _infer_concept_type(row: Dict[str, Any], source_table: Optional[str]) -> str:
    """Classify a row as message/document/entity.

    Precedence: an explicit concept_type/kind/type field (lowercased), then
    hints in the source table name, then the presence of id columns, and
    finally the generic "entity".
    """
    # Explicit type field wins (inlined first-non-blank-string lookup).
    for key in ("concept_type", "kind", "type"):
        candidate = row.get(key)
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip().lower()
    table_name = (source_table or "").lower()
    if "messages" in table_name:
        return "message"
    if "docs" in table_name or "documents" in table_name:
        return "document"
    if "message_id" in row:
        return "message"
    if "doc_id" in row or "document_id" in row:
        return "document"
    return "entity"
def _source_pk(row: Dict[str, Any]) -> Optional[str]:
    """Return the first usable primary-key-like string field on the row."""
    # Inlined first-non-blank-string lookup over the known pk column names.
    for key in ("source_pk", "message_id", "doc_id", "document_id", "id", "uuid"):
        value = row.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return None
def row_to_concept(
    row: Dict[str, Any],
    source_table: Optional[str],
    release_name: Optional[str],
    ref_hash: Optional[str],
) -> Optional[Dict[str, Any]]:
    """Map one table row onto the canonical concept document shape.

    The resulting dict is what both projection targets (ES and Gremlin)
    consume. Returns None when no display name can be derived at all —
    such rows are skipped by the caller.
    """
    concept_type = _infer_concept_type(row, source_table)
    source_pk = _source_pk(row)
    # Display name: explicit name-like columns first...
    display_name = first_str(
        row,
        [
            "display_name",
            "canonical_name",
            "title",
            "name",
            "subject",
            "doc_name",
            "document_name",
        ],
    )
    # ...then a synthesized "<type>:<pk>", then the leading body text.
    if not display_name and source_pk:
        display_name = f"{concept_type}:{source_pk}"
    if not display_name:
        display_name = first_str(row, ["body", "text", "content"])
    if display_name:
        display_name = display_name[:120]
    if not display_name:
        return None
    external_ids = parse_json_maybe(row.get("external_ids"), dict, {})
    aliases = parse_json_maybe(row.get("aliases"), list, [])
    tags = parse_json_maybe(row.get("tags"), list, [])
    kind = first_str(row, ["kind", "type", "doc_type", "document_type"]) or concept_type
    # Concept id: explicit id columns, then "<type>:<pk>", then a
    # deterministic hash so re-projection stays idempotent.
    concept_id = first_str(row, ["concept_id", "doc_id", "document_id", "id", "uuid"])
    if not concept_id and source_pk:
        concept_id = f"{concept_type}:{source_pk}"
    if not isinstance(concept_id, str) or not concept_id.strip():
        concept_id = hashlib.sha256(
            f"{concept_type}|{display_name}|{json.dumps(external_ids, sort_keys=True)}".encode("utf-8")
        ).hexdigest()
    description = first_str(row, ["description", "summary", "abstract"])
    if not description:
        body = first_str(row, ["content", "text", "body"])
        if body:
            description = body[:512]
    text = first_str(row, ["text", "content", "body"])
    if not text:
        text = description
    # Keep typed attributes stable and searchable without exploding ES mapping.
    attributes_obj = row
    return {
        "concept_id": concept_id,
        "concept_type": concept_type,
        "display_name": display_name,
        "description": description,
        "text": text,
        "source_table": source_table,
        "source_pk": source_pk,
        "release_name": release_name,
        "ref_hash": ref_hash,
        "attributes_json": json.dumps(attributes_obj, ensure_ascii=False, default=str, sort_keys=True),
        "canonical_name": display_name,
        "kind": kind,
        "aliases": aliases,
        "external_ids": external_ids,
        "tags": tags,
        "latest_cid": first_str(row, ["latest_cid", "cid", "ipfs_cid"]),
        "summary": description,
        # Timestamps fall back to "now" when the row carries none.
        "created_at": to_iso(row.get("created_at")) or utc_now_iso(),
        "updated_at": to_iso(row.get("updated_at")) or utc_now_iso(),
        "fingerprint": make_fingerprint(display_name, concept_type, external_ids),
    }
def project_release(
    manifest_file: Optional[str],
    release_name: Optional[str],
    concept_table: Optional[str],
    nessie_ref: Optional[str],
    releases_ref: Optional[str],
    dry_run: bool,
    targets: str,
) -> None:
    """Project one release's concept table into Elasticsearch and/or JanusGraph.

    Either *manifest_file* (local JSON) or *release_name* (registry lookup)
    must be supplied. The Nessie ref to read from resolves in priority order:
    explicit *nessie_ref*, the ref/tag named inside the manifest, then the
    release name itself. *targets* is one of "es", "gremlin", "both".

    Raises:
        ValueError: when neither source is given, or when the ref or concept
            table cannot be inferred.
    """
    if not manifest_file and not release_name:
        raise ValueError("Provide either --manifest-file or --release-name.")
    manifest: Optional[Dict[str, Any]] = load_manifest(manifest_file) if manifest_file else None
    # Release-name mode: lookup manifest on registry ref (usually main), then project on release tag.
    if manifest is None and release_name:
        registry_ref = releases_ref or os.getenv("RELEASES_REF", "main")
        spark, catalog = build_spark(registry_ref)
        manifest = load_manifest_from_registry(spark, catalog, release_name)
        ref = nessie_ref or infer_manifest_ref(manifest) or release_name
        if ref != registry_ref:
            # Reopen the session pinned to the release's own ref/tag.
            spark.stop()
            spark, catalog = build_spark(ref)
    else:
        ref = nessie_ref or (infer_manifest_ref(manifest) if manifest else None) or release_name
        if not ref:
            raise ValueError("Unable to infer Nessie ref/tag; pass --nessie-ref explicitly.")
        spark, catalog = build_spark(ref)
    table_identifiers: List[str] = extract_table_identifiers(manifest) if manifest else []
    table = concept_table or (infer_concept_table(table_identifiers) if manifest else None)
    if not table:
        raise ValueError("Unable to infer concept table; pass --concept-table explicitly.")
    # Qualify bare "db.table" identifiers with the session catalog.
    if table.count(".") == 1:
        table = f"{catalog}.{table}"
    print(f"[INFO] Using Nessie ref/tag: {ref}")
    print(f"[INFO] Reading table: {table}")
    # Resolve the release name and Nessie commit hash stamped on each concept.
    release_name_effective = None
    ref_hash = None
    if manifest:
        rel = manifest.get("release")
        if isinstance(rel, dict):
            rel_name = rel.get("name")
            if isinstance(rel_name, str) and rel_name.strip():
                release_name_effective = rel_name.strip()
        nes = manifest.get("nessie")
        if isinstance(nes, dict):
            ref_obj = nes.get("ref")
            if isinstance(ref_obj, dict):
                h = ref_obj.get("hash")
                if isinstance(h, str) and h.strip():
                    ref_hash = h.strip()
    if not release_name_effective and release_name and isinstance(release_name, str) and release_name.strip():
        release_name_effective = release_name.strip()
    # NOTE(review): collect() materializes the full table on the driver —
    # assumes concept tables are driver-sized; confirm for large releases.
    df = spark.table(table)
    rows = [r.asDict(recursive=True) for r in df.collect()]
    concepts = [c for c in (row_to_concept(r, table, release_name_effective, ref_hash) for r in rows) if c]
    print(f"[INFO] Read {len(rows)} rows, {len(concepts)} valid concepts")
    print("[STEP] spark_read_done")
    if dry_run:
        print("[INFO] Dry-run enabled. No writes performed.")
        return
    use_es = targets in ("both", "es")
    use_gremlin = targets in ("both", "gremlin")
    print(f"[INFO] Projection targets: {targets}")
    gremlin_url = os.getenv("GREMLIN_URL", "ws://localhost:8182/gremlin")
    es_url = os.getenv("ES_URL", "http://localhost:9200")
    es_index = os.getenv("ES_INDEX", "concepts")
    if use_es:
        ensure_es_index(es_url, es_index)
    success = 0
    failures = 0
    # Once a backend's client library is found missing, stop trying that
    # backend for the remaining concepts instead of failing every row.
    gremlin_missing = False
    es_missing = False
    for concept in concepts:
        try:
            wrote_any = False
            if use_gremlin and not gremlin_missing:
                try:
                    gremlin_upsert(gremlin_url, concept)
                    wrote_any = True
                except ModuleNotFoundError as e:
                    gremlin_missing = True
                    print(f"[WARN] Gremlin dependency missing ({e}). Continuing with ES only.")
                except Exception as e:
                    print(f"[WARN] Gremlin upsert failed for {concept.get('concept_id')}: {e}")
            if use_es and not es_missing:
                try:
                    es_upsert(es_url, es_index, concept)
                    wrote_any = True
                except ModuleNotFoundError as e:
                    es_missing = True
                    print(f"[WARN] ES dependency missing ({e}). Continuing with Gremlin only.")
                except Exception as e:
                    print(f"[WARN] ES upsert failed for {concept.get('concept_id')}: {e}")
            # A concept counts as projected when at least one target took it.
            if wrote_any:
                success += 1
            else:
                failures += 1
                print(f"[WARN] No projection target succeeded for {concept.get('concept_id')}")
        except Exception as e:
            failures += 1
            print(f"[WARN] Failed concept {concept.get('concept_id')}: {e}")
    print("[STEP] projection_done")
    print(f"[DONE] Projected {success} concepts ({failures} failed)")
def parse_args() -> argparse.Namespace:
    """Define and parse the projector's command-line interface."""
    parser = argparse.ArgumentParser(
        description="Project a lakehouse release into JanusGraph + Elasticsearch."
    )
    parser.add_argument("--manifest-file", help="Path to release manifest JSON")
    parser.add_argument("--release-name", help="Release name to load from releases_v2 registry")
    parser.add_argument("--concept-table", help="Full Iceberg table identifier holding concepts")
    parser.add_argument("--nessie-ref", help="Nessie branch/tag to read from (defaults to manifest tag)")
    parser.add_argument("--releases-ref", help="Nessie ref used to read releases_v2 (default: main)")
    parser.add_argument(
        "--targets",
        choices=["es", "gremlin", "both"],
        default="both",
        help="Projection targets to write (default: both)",
    )
    parser.add_argument("--dry-run", action="store_true", help="Read and validate only")
    return parser.parse_args()
def main() -> None:
    """CLI entry point: load .env when python-dotenv is available, then project."""
    if load_dotenv:
        load_dotenv()
    args = parse_args()
    # Argparse dest names match project_release's keyword parameters exactly.
    project_release(**vars(args))
if __name__ == "__main__":
main()