commit 912f8ebc56f6df3b3aaac4066bf20b4862cbe1cf Author: Carl Niklas Rydberg Date: Sat Feb 14 21:10:26 2026 +0100 chore: bootstrap assistant platform baseline diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d2853b6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Python +__pycache__/ +*.py[cod] +*.so +.venv/ +venv/ + +# Env/secrets +.env +.env.* +*.key +*.pem + +# Local/runtime +logs/ +*.log +runs.db + +# OS/editor +.DS_Store +.vscode/ +.idea/ + +# Build/temp +build/ +dist/ +.tmp/ +tmp/ diff --git a/MESSAGES_RELEASE_FLOW.md b/MESSAGES_RELEASE_FLOW.md new file mode 100644 index 0000000..4b24f5a --- /dev/null +++ b/MESSAGES_RELEASE_FLOW.md @@ -0,0 +1,27 @@ +# Messages Release Flow + +This flow creates a Nessie tag for `lake.db1.messages`, generates a manifest JSON, and appends a row to `lake.db1.releases_v2`. + +## Run on lakehouse-core + +```bash +ssh niklas@lakehouse-core.rakeroots.lan 'cd /tmp/jecio && ./create-messages-release-via-spark-container.sh' +``` + +## Custom release name + +```bash +ssh niklas@lakehouse-core.rakeroots.lan 'cd /tmp/jecio && ./create-messages-release-via-spark-container.sh rel_2026-02-14_messages-v1' +``` + +## Outputs + +- Manifest file written to `./manifests/.json` +- Nessie tag `` created at current `main` hash (or reused if already present) +- Registry row appended to `lake.db1.releases_v2` + +## Verify + +```bash +ssh niklas@lakehouse-core.rakeroots.lan "docker exec spark /opt/spark/bin/spark-sql --properties-file /opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf --packages 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5' -e \"SELECT release_name, table_identifier, snapshot_id, created_at_utc FROM lake.db1.releases_v2 WHERE table_identifier='lake.db1.messages' ORDER BY created_at_utc DESC LIMIT 10\"" +``` diff --git a/MESSAGES_SCHEMA.md b/MESSAGES_SCHEMA.md new file mode 100644 index 0000000..62668ef --- /dev/null +++ b/MESSAGES_SCHEMA.md @@ -0,0 +1,23 @@ +# Messages Schema + +Creates Iceberg table `lake.db1.messages` with ingest fields: + +- `thread_id` STRING +- `message_id` STRING +- `sender` STRING +- `channel` STRING +- `sent_at` TIMESTAMP +- `body` STRING +- `metadata_json` STRING + +## Run on lakehouse-core + +```bash +ssh niklas@lakehouse-core.rakeroots.lan 'cd /tmp/jecio && ./create-messages-table-via-spark-container.sh' +``` + +## Verify + +```bash +ssh niklas@lakehouse-core.rakeroots.lan "docker exec spark /opt/spark/bin/spark-sql --properties-file /opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf --packages 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5' -e 'DESCRIBE TABLE lake.db1.messages'" +``` diff --git a/PROJECTOR_USAGE.md b/PROJECTOR_USAGE.md new file mode 100644 index 0000000..1629c7d --- /dev/null +++ b/PROJECTOR_USAGE.md @@ -0,0 +1,142 @@ +# Release Projector + +`release_projector.py` rebuilds serving projections (JanusGraph + Elasticsearch) from a lakehouse release manifest. + +## What it does + +1. Loads a release manifest JSON (or a `releases_v2` row containing `manifest_json`). +2. Resolves Nessie tag/ref from the manifest (or `--nessie-ref`). +3. Reads the concept Iceberg table from that ref through Spark + Iceberg + Nessie. +4. Upserts each concept into JanusGraph and Elasticsearch. 
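+
+For orientation, a minimal manifest sketch is shown below. It is illustrative only: the release scripts define the real layout, and the only fields referenced elsewhere in this repo are the `releases_v2` columns (`release_name`, `table_identifier`, `snapshot_id`, `created_at_utc`) and a Nessie tag carried in a field like `nessie.tag`; every other key and value here is an assumption.
+
+```json
+{
+  "release_name": "rel_2026-02-14_docs-v1",
+  "table_identifier": "lake.db1.docs",
+  "snapshot_id": 1234567890123456789,
+  "created_at_utc": "2026-02-14T20:00:00+00:00",
+  "nessie": {
+    "tag": "rel_2026-02-14_docs-v1",
+    "hash": "abc123..."
+  }
+}
+```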
+ +`release_projector.py` now accepts both concept-shaped rows and document-shaped rows. +For docs tables, it auto-detects typical columns: +- name: `canonical_name|title|name|subject` +- id: `concept_id|doc_id|document_id|id|uuid` +- summary text: `summary|description|abstract|content|text|body` + +## Prerequisites + +- Python deps: `python-dotenv`, `httpx`, `gremlinpython`, `pyspark` +- Spark/Iceberg/Nessie jars (default package coordinates are baked into script) +- Network access to: + - Nessie API (example: `http://lakehouse-core:19120/api/v2`) + - MinIO S3 endpoint (example: `http://lakehouse-core:9000`) + - JanusGraph Gremlin endpoint + - Elasticsearch endpoint + +## Recommended isolated env + +Do not install projector dependencies into system Python. + +## Preferred: existing spark container on lakehouse-core + +This reuses your existing `spark` container and Spark properties file. + +Standard command (frozen): + +```bash +./run-projector-standard.sh +``` + +Run by release name (no manifest path): + +```bash +./run-projector-standard.sh --release-name rel_2026-02-14_docs-v1 +``` + +Standard dry-run: + +```bash +./run-projector-standard.sh --dry-run +``` + +Copy files to host: + +```bash +rsync -av --delete /home/niklas/projects/jecio/ lakehouse-core.rakeroots.lan:/tmp/jecio/ +``` + +Run dry-run projection inside `spark` container: + +```bash +ssh lakehouse-core.rakeroots.lan 'cd /tmp/jecio && ./run-projector-via-spark-container.sh ./manifests/rel_2026-02-14_docs-v1.json lake.db1.docs --dry-run es' +``` + +Run publish projection (writes Janus/ES): + +```bash +ssh lakehouse-core.rakeroots.lan 'cd /tmp/jecio && ./run-projector-standard.sh' +``` + +`run-projector-via-spark-container.sh` uses: +- container: `spark` (override with `SPARK_CONTAINER_NAME`) +- properties file: `/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf` (override with `SPARK_PROPS`) +- Spark packages: Iceberg + Nessie extensions (override with `SPARK_PACKAGES`) +- arg4 `targets`: `es|gremlin|both` (default `both`) +- arg5 `release_name`: optional; if set, loads manifest from `releases_v2` + +Direct projector usage: + +```bash +python3 release_projector.py --release-name rel_2026-02-14_docs-v1 --concept-table lake.db1.docs --targets es --dry-run +python3 release_projector.py --release-name rel_2026-02-14_docs-v1 --concept-table lake.db1.docs --targets both +python3 release_projector.py --manifest-file manifests/rel_2026-02-14_docs-v1.json --concept-table lake.db1.docs --targets es --dry-run +python3 release_projector.py --manifest-file manifests/rel_2026-02-14_docs-v1.json --concept-table lake.db1.docs --targets both +``` + +Local setup (fallback): + +```bash +./setup_local_env.sh .venv-projector +source .venv-projector/bin/activate +``` + +Remote setup (fallback, venv on `lakehouse-core`): + +```bash +scp release_projector.py requirements-projector.txt manifests/rel_2026-02-14_docs-v1.json lakehouse-core.rakeroots.lan:/tmp/ +ssh lakehouse-core.rakeroots.lan 'python3 -m venv /tmp/jecio-projector-venv && /tmp/jecio-projector-venv/bin/pip install --upgrade pip && /tmp/jecio-projector-venv/bin/pip install -r /tmp/requirements-projector.txt' +``` + +## Required env vars (example) + +```bash +export NESSIE_URI=http://lakehouse-core:19120/api/v2 +export NESSIE_WAREHOUSE=s3a://lakehouse/warehouse +export S3_ENDPOINT=http://lakehouse-core:9000 +export AWS_ACCESS_KEY_ID=minioadmin +export AWS_SECRET_ACCESS_KEY=minioadmin + +export GREMLIN_URL=ws://janus.rakeroots.lan:8182/gremlin +export 
ES_URL=http://janus.rakeroots.lan:9200 +export ES_INDEX=concepts +``` + +## Run + +```bash +/tmp/jecio-projector-venv/bin/python /tmp/release_projector.py \ + --manifest-file /tmp/rel_2026-02-14_docs-v1.json \ + --concept-table lake.db1.docs \ + --dry-run +``` + +Or local: + +```bash +python3 release_projector.py \ + --manifest-file /path/to/release.json \ + --concept-table lake.db1.concepts +``` + +If the manifest has a Nessie tag in fields like `nessie.tag`, you can omit `--nessie-ref`. + +Dry run: + +```bash +python3 release_projector.py \ + --manifest-file /path/to/release.json \ + --concept-table lake.db1.concepts \ + --dry-run +``` diff --git a/app.py b/app.py new file mode 100644 index 0000000..2a0dc79 --- /dev/null +++ b/app.py @@ -0,0 +1,3227 @@ +import os +import json +import hashlib +import asyncio +import shlex +import uuid +import base64 +import imaplib +import email +import tempfile +import re +import time +from pathlib import Path +from email import policy +from email.utils import parseaddr, parsedate_to_datetime +from datetime import datetime, timezone +from typing import Optional, List, Dict, Any, Literal + +import httpx +from fastapi import FastAPI, HTTPException, Header +from fastapi.responses import FileResponse, HTMLResponse +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel, Field +from gremlin_python.driver import client as gremlin_client +from gremlin_python.driver.serializer import GraphSONSerializersV3d0 +from dotenv import load_dotenv + +APP_NAME = "concept-api" + +# Keep env loading behavior aligned with connectivity_check.py. +load_dotenv() + +# ---- config (set these env vars) ---- +GREMLIN_URL = os.getenv("GREMLIN_URL", "ws://localhost:8182/gremlin") +ES_URL = os.getenv("ES_URL", "http://localhost:9200") +ES_INDEX = os.getenv("ES_INDEX", "concepts") +IPFS_API = os.getenv("IPFS_API", "http://localhost:5001") # Kubo HTTP API +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1:8b") +OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text") +PROJECTOR_SSH_HOST = os.getenv("PROJECTOR_SSH_HOST", "lakehouse-core.rakeroots.lan") +PROJECTOR_REMOTE_DIR = os.getenv("PROJECTOR_REMOTE_DIR", "/tmp/jecio") +PROJECTOR_REMOTE_SCRIPT = os.getenv("PROJECTOR_REMOTE_SCRIPT", "./run-projector-standard.sh") +PROJECTOR_SSH_BIN = os.getenv("PROJECTOR_SSH_BIN", "ssh") +PROJECTOR_SSH_OPTS = os.getenv("PROJECTOR_SSH_OPTS", "-o BatchMode=yes -o ConnectTimeout=10") +PROJECTOR_SCP_BIN = os.getenv("PROJECTOR_SCP_BIN", "scp") +PROJECTOR_SCP_OPTS = os.getenv("PROJECTOR_SCP_OPTS", "-o BatchMode=yes -o ConnectTimeout=10") +PROJECTOR_TIMEOUT_SEC = int(os.getenv("PROJECTOR_TIMEOUT_SEC", "900")) +ADMIN_API_KEY = os.getenv("ADMIN_API_KEY", "") +RUNS_REMOTE_SCRIPT = os.getenv("RUNS_REMOTE_SCRIPT", "./record-run-via-spark-container.sh") +RUN_EVENTS_REMOTE_SCRIPT = os.getenv("RUN_EVENTS_REMOTE_SCRIPT", "./record-run-event-via-spark-container.sh") +INGEST_MESSAGE_REMOTE_SCRIPT = os.getenv("INGEST_MESSAGE_REMOTE_SCRIPT", "./ingest-message-via-spark-container.sh") +INGEST_MESSAGES_BATCH_REMOTE_SCRIPT = os.getenv( + "INGEST_MESSAGES_BATCH_REMOTE_SCRIPT", + "./ingest-messages-batch-via-spark-container.sh", +) +ASSISTANT_FEEDBACK_REMOTE_SCRIPT = os.getenv( + "ASSISTANT_FEEDBACK_REMOTE_SCRIPT", + "./record-assistant-feedback-via-spark-container.sh", +) +ASSISTANT_FEEDBACK_QUERY_REMOTE_SCRIPT = os.getenv( + "ASSISTANT_FEEDBACK_QUERY_REMOTE_SCRIPT", + "./query-assistant-feedback-via-spark-container.sh", +) 
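+# NOTE: every *_REMOTE_SCRIPT value above and below is a path relative to PROJECTOR_REMOTE_DIR;
+# the run_remote_* helpers below execute them on PROJECTOR_SSH_HOST over SSH using PROJECTOR_SSH_OPTS.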
+ASSISTANT_METRICS_QUERY_REMOTE_SCRIPT = os.getenv( + "ASSISTANT_METRICS_QUERY_REMOTE_SCRIPT", + "./query-assistant-metrics-via-spark-container.sh", +) +ASSISTANT_ACTION_REMOTE_SCRIPT = os.getenv( + "ASSISTANT_ACTION_REMOTE_SCRIPT", + "./record-assistant-action-via-spark-container.sh", +) +ASSISTANT_ACTIONS_QUERY_REMOTE_SCRIPT = os.getenv( + "ASSISTANT_ACTIONS_QUERY_REMOTE_SCRIPT", + "./query-assistant-actions-via-spark-container.sh", +) +IMAP_CHECKPOINT_REMOTE_SCRIPT = os.getenv( + "IMAP_CHECKPOINT_REMOTE_SCRIPT", + "./query-imap-checkpoint-via-spark-container.sh", +) +CREATE_MESSAGES_RELEASE_REMOTE_SCRIPT = os.getenv( + "CREATE_MESSAGES_RELEASE_REMOTE_SCRIPT", + "./create-messages-release-via-spark-container.sh", +) + +app = FastAPI(title=APP_NAME) +UI_DIR = Path(__file__).resolve().parent / "ui" +UI_ASSETS_DIR = UI_DIR / "assets" +if UI_ASSETS_DIR.exists(): + app.mount("/ui/assets", StaticFiles(directory=str(UI_ASSETS_DIR)), name="ui-assets") + + +# --------- models --------- +class ConceptCreate(BaseModel): + canonical_name: str = Field(..., min_length=1) + kind: Optional[str] = None + aliases: List[str] = [] + description: Optional[str] = None + external_ids: Dict[str, str] = {} # {"wikidata":"Q42"} etc. + tags: List[str] = [] + + +class ConceptOut(BaseModel): + concept_id: str + canonical_name: str + kind: Optional[str] = None + aliases: List[str] = [] + external_ids: Dict[str, str] = {} + tags: List[str] = [] + latest_cid: Optional[str] = None + summary: Optional[str] = None + created_at: str + updated_at: str + + +class ProjectionTrigger(BaseModel): + release_name: str = Field(..., min_length=1) + targets: Literal["es", "gremlin", "both"] = "both" + concept_table: Optional[str] = None + dry_run: bool = False + + +class MessageIngestPayload(BaseModel): + thread_id: str = Field(..., min_length=1) + message_id: str = Field(..., min_length=1) + sender: str = Field(..., min_length=1) + channel: str = Field(..., min_length=1) + sent_at: Optional[str] = None + body: str = Field(..., min_length=1) + metadata: Dict[str, Any] = {} + table: str = "lake.db1.messages" + + +class MessageIngestItem(BaseModel): + thread_id: str = Field(..., min_length=1) + message_id: str = Field(..., min_length=1) + sender: str = Field(..., min_length=1) + channel: str = Field(..., min_length=1) + sent_at: Optional[str] = None + body: str = Field(..., min_length=1) + metadata: Dict[str, Any] = {} + + +class MessageIngestBatchPayload(BaseModel): + table: str = "lake.db1.messages" + dedupe_mode: Literal["none", "message_id", "thread_message"] = "none" + messages: List[MessageIngestItem] = Field(default_factory=list) + + +class EmailImapIngestPayload(BaseModel): + host: str = Field(..., min_length=1) + port: int = 993 + use_ssl: bool = True + username: str = Field(..., min_length=1) + password: Optional[str] = None + mailbox: str = "INBOX" + search_criteria: str = "ALL" + max_messages: int = Field(default=50, ge=1, le=500) + table: str = "lake.db1.messages" + dedupe_mode: Literal["none", "message_id", "thread_message"] = "message_id" + channel: str = "email-imap" + incremental: bool = True + since_uid: Optional[int] = None + + +class PollAndProjectPayload(BaseModel): + imap: EmailImapIngestPayload + release_name: Optional[str] = None + release_prefix: str = "rel" + targets: Literal["es", "gremlin", "both"] = "es" + concept_table: str = "lake.db1.messages" + dry_run: bool = False + project_if_no_new: bool = False + + +class AssistantDraftPayload(BaseModel): + task_type: Literal["message", "finance", "gov", "general"] 
= "general" + goal: str = Field(..., min_length=3) + recipient: Optional[str] = None + tone: Optional[str] = "professional" + constraints: List[str] = Field(default_factory=list) + release_name: Optional[str] = None + max_sources: int = Field(default=5, ge=1, le=20) + + +class AssistantDraftSource(BaseModel): + concept_id: str + source_pk: Optional[str] = None + source_table: Optional[str] = None + release_name: Optional[str] = None + score: Optional[float] = None + + +class AssistantDraftResponse(BaseModel): + task_type: str + draft: str + sources: List[AssistantDraftSource] + confidence: float + needs_review: bool + release_name: Optional[str] = None + + +class AssistantPlanPayload(BaseModel): + task_type: Literal["message", "finance", "gov", "general"] = "general" + objective: str = Field(..., min_length=3) + constraints: List[str] = Field(default_factory=list) + release_name: Optional[str] = None + max_sources: int = Field(default=5, ge=1, le=20) + max_steps: int = Field(default=6, ge=1, le=20) + + +class AssistantPlanStep(BaseModel): + step_id: str + title: str + action_type: Literal["research", "draft", "ask_user", "prepare_data", "review"] + requires_approval: bool = False + notes: Optional[str] = None + + +class AssistantPlanResponse(BaseModel): + objective: str + task_type: str + plan: List[AssistantPlanStep] + sources: List[AssistantDraftSource] + needs_review: bool + confidence: float + release_name: Optional[str] = None + + +class AssistantExecuteStepPayload(BaseModel): + task_type: Literal["message", "finance", "gov", "general"] = "general" + objective: str = Field(..., min_length=3) + release_name: Optional[str] = None + plan: List[AssistantPlanStep] + step_id: str = Field(..., min_length=1) + approved: bool = False + manual_confirm_token: Optional[str] = None + + +class AssistantExecuteStepResponse(BaseModel): + action_id: str + step_id: str + status: Literal["blocked", "executed"] + output: Dict[str, Any] + needs_review: bool + + +class AssistantFeedbackPayload(BaseModel): + outcome: Literal["accepted", "edited", "rejected"] + task_type: Literal["message", "finance", "gov", "general"] = "general" + release_name: Optional[str] = None + goal: Optional[str] = None + draft: str = Field(..., min_length=1) + final_text: Optional[str] = None + sources: List[AssistantDraftSource] = Field(default_factory=list) + confidence: Optional[float] = None + needs_review: bool = True + notes: Optional[str] = None + + +class AssistantLearnPayload(BaseModel): + text: str = Field(..., min_length=3) + title: Optional[str] = None + tags: List[str] = Field(default_factory=list) + release_name: Optional[str] = None + metadata: Dict[str, Any] = Field(default_factory=dict) + + +class AssistantChatMessage(BaseModel): + role: Literal["user", "assistant"] + content: str + + +class AssistantChatPayload(BaseModel): + message: str = Field(..., min_length=1) + session_id: Optional[str] = None + release_name: Optional[str] = None + max_sources: int = Field(default=6, ge=1, le=20) + history: List[AssistantChatMessage] = Field(default_factory=list) + temperature_hint: Optional[str] = "balanced" + + +class AssistantChatResponse(BaseModel): + session_id: str + answer: str + sources: List[AssistantDraftSource] + confidence: float + release_name: Optional[str] = None + + +# --------- helpers --------- +def now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _tail(text: str, max_chars: int = 8000) -> str: + if len(text) <= max_chars: + return text + return text[-max_chars:] + + +def 
check_admin_api_key(x_admin_api_key: Optional[str]) -> None: + if not ADMIN_API_KEY: + raise HTTPException(status_code=503, detail="ADMIN_API_KEY is not configured") + if x_admin_api_key != ADMIN_API_KEY: + raise HTTPException(status_code=401, detail="Unauthorized") + +def make_fingerprint(name: str, kind: Optional[str], external_ids: Dict[str, str]) -> str: + norm = (name or "").strip().lower() + k = (kind or "").strip().lower() + ext = "|".join(f"{a}:{b}".lower() for a, b in sorted(external_ids.items())) + raw = f"{norm}|{k}|{ext}" + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def _clean_header_id(v: Optional[str]) -> str: + if not v: + return "" + return v.strip().strip("<>").strip() + + +def _normalize_thread_id(msg_id: str, refs: str, in_reply_to: str, subject: str, sender: str) -> str: + refs_clean = _clean_header_id(refs.split()[-1] if refs else "") + in_reply_clean = _clean_header_id(in_reply_to) + if refs_clean: + return f"thread:{refs_clean}" + if in_reply_clean: + return f"thread:{in_reply_clean}" + seed = f"{subject.strip().lower()}|{sender.strip().lower()}" + if not seed.strip("|"): + seed = msg_id + return "thread:" + hashlib.sha256(seed.encode("utf-8")).hexdigest()[:24] + + +def _extract_body_text(msg: email.message.Message) -> str: + try: + if msg.is_multipart(): + for part in msg.walk(): + ctype = (part.get_content_type() or "").lower() + disp = (part.get("Content-Disposition") or "").lower() + if ctype == "text/plain" and "attachment" not in disp: + payload_obj = part.get_content() + if isinstance(payload_obj, str): + return payload_obj.strip() + if isinstance(payload_obj, bytes): + return payload_obj.decode(part.get_content_charset() or "utf-8", errors="replace").strip() + for part in msg.walk(): + ctype = (part.get_content_type() or "").lower() + if ctype == "text/html": + html_obj = part.get_content() + if isinstance(html_obj, bytes): + html_obj = html_obj.decode(part.get_content_charset() or "utf-8", errors="replace") + if isinstance(html_obj, str): + return html_obj.strip() + return "" + payload_obj = msg.get_content() + if isinstance(payload_obj, str): + return payload_obj.strip() + if isinstance(payload_obj, bytes): + return payload_obj.decode(msg.get_content_charset() or "utf-8", errors="replace").strip() + return "" + except Exception: + return "" + + +def fetch_imap_messages_blocking( + payload: EmailImapIngestPayload, + effective_search_criteria: str, + since_uid: Optional[int], +) -> List[MessageIngestItem]: + password = payload.password or os.getenv("IMAP_PASSWORD", "") + if not password: + raise ValueError("IMAP password missing: provide payload.password or set IMAP_PASSWORD") + + if payload.use_ssl: + client = imaplib.IMAP4_SSL(payload.host, payload.port) + else: + client = imaplib.IMAP4(payload.host, payload.port) + + try: + status, _ = client.login(payload.username, password) + if status != "OK": + raise RuntimeError("IMAP login failed") + status, _ = client.select(payload.mailbox, readonly=True) + if status != "OK": + raise RuntimeError(f"IMAP select mailbox failed: {payload.mailbox}") + + if since_uid is not None: + status, search_data = client.uid("search", None, "UID", f"{int(since_uid) + 1}:*") + else: + status, search_data = client.uid("search", None, effective_search_criteria) + if status != "OK": + raise RuntimeError(f"IMAP search failed: {effective_search_criteria}") + uid_bytes = search_data[0] if search_data else b"" + uid_list = [u for u in uid_bytes.decode("utf-8", errors="replace").split() if u] + if since_uid is not None: + 
filtered: List[str] = [] + for u in uid_list: + try: + if int(u) > int(since_uid): + filtered.append(u) + except Exception: + continue + uid_list = filtered + if not uid_list: + return [] + # For incremental UID windows, process oldest-new first so checkpointing cannot skip gaps. + is_uid_window = since_uid is not None + if is_uid_window: + selected_uids = uid_list[: payload.max_messages] + else: + # For non-incremental scans (e.g. ALL), keep "latest N" behavior. + selected_uids = uid_list[-payload.max_messages :] + + out: List[MessageIngestItem] = [] + for uid in selected_uids: + status, msg_data = client.uid("fetch", uid, "(RFC822)") + if status != "OK" or not msg_data: + continue + raw_bytes = None + for part in msg_data: + if isinstance(part, tuple) and len(part) >= 2 and isinstance(part[1], (bytes, bytearray)): + raw_bytes = bytes(part[1]) + break + if not raw_bytes: + continue + msg = email.message_from_bytes(raw_bytes, policy=policy.default) + + subject = str(msg.get("Subject") or "").strip() + from_raw = str(msg.get("From") or "").strip() + to_raw = str(msg.get("To") or "").strip() + date_raw = str(msg.get("Date") or "").strip() + msg_id_raw = str(msg.get("Message-Id") or msg.get("Message-ID") or "").strip() + refs_raw = str(msg.get("References") or "").strip() + in_reply_raw = str(msg.get("In-Reply-To") or "").strip() + + sender_email = parseaddr(from_raw)[1] or from_raw or "unknown" + msg_id_clean = _clean_header_id(msg_id_raw) + if not msg_id_clean: + seed = f"{uid}|{subject}|{sender_email}|{date_raw}" + msg_id_clean = "imap-" + hashlib.sha256(seed.encode("utf-8")).hexdigest()[:24] + + thread_id = _normalize_thread_id( + msg_id=msg_id_clean, + refs=refs_raw, + in_reply_to=in_reply_raw, + subject=subject, + sender=sender_email, + ) + + sent_at_iso = None + if date_raw: + try: + dt = parsedate_to_datetime(date_raw) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + sent_at_iso = dt.astimezone(timezone.utc).isoformat() + except Exception: + sent_at_iso = None + + body = _extract_body_text(msg) + if not body: + body = f"(no body) {subject}".strip() + + metadata = { + "subject": subject, + "from": from_raw, + "to": to_raw, + "date": date_raw, + "imap_uid": uid, + "mailbox": payload.mailbox, + "host": payload.host, + "username": payload.username, + } + + out.append( + MessageIngestItem( + thread_id=thread_id, + message_id=msg_id_clean, + sender=sender_email, + channel=payload.channel, + sent_at=sent_at_iso, + body=body, + metadata=metadata, + ) + ) + return out + finally: + try: + client.logout() + except Exception: + pass + + +async def run_remote_query_imap_checkpoint( + host: str, + mailbox: str, + username: str, + table: str, +) -> Optional[int]: + parts = [ + IMAP_CHECKPOINT_REMOTE_SCRIPT, + host, + mailbox, + username, + table, + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="IMAP checkpoint query timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + if proc.returncode != 0: + raise HTTPException( + status_code=502, + 
detail={ + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + try: + obj = _extract_json_object_from_text(out) + val = obj.get("max_uid") + if val is None: + return None + return int(val) + except Exception as e: + raise HTTPException( + status_code=502, + detail={ + "message": f"Unable to parse IMAP checkpoint output: {e}", + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + + +async def run_remote_create_messages_release(release_name: str) -> Dict[str, Any]: + parts = [ + CREATE_MESSAGES_RELEASE_REMOTE_SCRIPT, + release_name, + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Create messages release timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "release_name": release_name, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +async def ipfs_add_json(payload: Dict[str, Any]) -> str: + # Kubo: POST /api/v0/add with file content in multipart + data = json.dumps(payload, ensure_ascii=False).encode("utf-8") + files = {"file": ("concept.json", data, "application/json")} + async with httpx.AsyncClient(timeout=30) as h: + r = await h.post(f"{IPFS_API}/api/v0/add", files=files) + r.raise_for_status() + # Response is text lines of JSON; last line contains Hash + # Often single line, but handle both + last = r.text.strip().splitlines()[-1] + obj = json.loads(last) + return obj["Hash"] + +async def ollama_summary(text: str) -> str: + prompt = ( + "Summarize the following concept in 1-2 sentences. " + "Keep it factual and compact.\n\n" + f"{text}" + ) + async with httpx.AsyncClient(timeout=60) as h: + r = await h.post( + f"{OLLAMA_URL}/api/generate", + json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False}, + ) + r.raise_for_status() + return (r.json().get("response") or "").strip() + +async def ollama_embed(text: str) -> List[float]: + async with httpx.AsyncClient(timeout=60) as h: + r = await h.post( + f"{OLLAMA_URL}/api/embeddings", + json={"model": OLLAMA_EMBED_MODEL, "prompt": text}, + ) + r.raise_for_status() + emb = r.json().get("embedding") + if not isinstance(emb, list): + return [] + return emb + +async def es_ensure_index(): + # Minimal mapping: text fields + dense_vector (optional) + # If your ES doesn't support dense_vector, remove it. 
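+    # dims=768 below matches nomic-embed-text (the default OLLAMA_EMBED_MODEL);
+    # change it if a different embedding model with another output dimension is used.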
+ mapping = { + "mappings": { + "properties": { + "concept_id": {"type": "keyword"}, + "canonical_name": {"type": "text"}, + "kind": {"type": "keyword"}, + "aliases": {"type": "text"}, + "tags": {"type": "keyword"}, + "summary": {"type": "text"}, + "latest_cid": {"type": "keyword"}, + "fingerprint": {"type": "keyword"}, + "created_at": {"type": "date"}, + "updated_at": {"type": "date"}, + "embedding": {"type": "dense_vector", "dims": 768, "index": False}, # may vary + } + } + } + async with httpx.AsyncClient(timeout=30) as h: + head = await h.get(f"{ES_URL}/{ES_INDEX}") + if head.status_code == 200: + return + r = await h.put(f"{ES_URL}/{ES_INDEX}", json=mapping) + # If this fails due to dense_vector incompatibility, you can still proceed: + if r.status_code >= 400: + # try without vector + mapping["mappings"]["properties"].pop("embedding", None) + r2 = await h.put(f"{ES_URL}/{ES_INDEX}", json=mapping) + r2.raise_for_status() + +async def es_index(doc: Dict[str, Any]): + async with httpx.AsyncClient(timeout=30) as h: + r = await h.put(f"{ES_URL}/{ES_INDEX}/_doc/{doc['concept_id']}", json=doc) + r.raise_for_status() + +async def es_search(q: str, size: int = 10) -> List[Dict[str, Any]]: + query = { + "size": size, + "query": { + "multi_match": { + "query": q, + "fields": ["canonical_name^3", "aliases^2", "summary", "tags"], + } + }, + } + async with httpx.AsyncClient(timeout=30) as h: + r = await h.post(f"{ES_URL}/{ES_INDEX}/_search", json=query) + r.raise_for_status() + hits = r.json().get("hits", {}).get("hits", []) + return [h["_source"] for h in hits] + + +async def es_search_hits(q: str, size: int = 10, release_name: Optional[str] = None) -> List[Dict[str, Any]]: + must_clause: Dict[str, Any] = { + "multi_match": { + "query": q, + "fields": [ + "display_name^3", + "canonical_name^3", + "description^2", + "text^2", + "summary^2", + "aliases^2", + "tags", + "source_pk^2", + "source_table", + ], + } + } + query: Dict[str, Any] = {"size": size} + if release_name: + release_filter = { + "bool": { + "should": [ + {"term": {"release_name.keyword": release_name}}, + {"term": {"release_name": release_name}}, + {"match_phrase": {"release_name": release_name}}, + ], + "minimum_should_match": 1, + } + } + query["query"] = { + "bool": { + "must": [must_clause], + "filter": [release_filter], + } + } + else: + query["query"] = must_clause + + async with httpx.AsyncClient(timeout=30) as h: + r = await h.post(f"{ES_URL}/{ES_INDEX}/_search", json=query) + r.raise_for_status() + return r.json().get("hits", {}).get("hits", []) + + +async def es_recent_by_release(release_name: str, size: int = 10) -> List[Dict[str, Any]]: + query: Dict[str, Any] = { + "size": size, + "query": { + "bool": { + "filter": [ + { + "bool": { + "should": [ + {"term": {"release_name.keyword": release_name}}, + {"term": {"release_name": release_name}}, + {"match_phrase": {"release_name": release_name}}, + ], + "minimum_should_match": 1, + } + } + ] + } + }, + "sort": [{"updated_at": {"order": "desc", "unmapped_type": "date"}}], + } + async with httpx.AsyncClient(timeout=30) as h: + r = await h.post(f"{ES_URL}/{ES_INDEX}/_search", json=query) + r.raise_for_status() + return r.json().get("hits", {}).get("hits", []) + + +async def es_recent_messages( + size: int = 20, + release_name: Optional[str] = None, + q: Optional[str] = None, +) -> List[Dict[str, Any]]: + filters: List[Dict[str, Any]] = [ + { + "bool": { + "should": [ + {"term": {"concept_type.keyword": "message"}}, + {"term": {"concept_type": "message"}}, + {"term": 
{"kind.keyword": "message"}}, + {"term": {"kind": "message"}}, + ], + "minimum_should_match": 1, + } + } + ] + if release_name: + filters.append( + { + "bool": { + "should": [ + {"term": {"release_name.keyword": release_name}}, + {"term": {"release_name": release_name}}, + {"match_phrase": {"release_name": release_name}}, + ], + "minimum_should_match": 1, + } + } + ) + + must: List[Dict[str, Any]] = [] + if q and q.strip(): + must.append( + { + "multi_match": { + "query": q.strip(), + "fields": [ + "text^3", + "description^2", + "summary^2", + "display_name^2", + "canonical_name^2", + "source_pk^2", + ], + } + } + ) + + query: Dict[str, Any] = { + "size": size, + "query": { + "bool": { + "filter": filters, + "must": must, + } + }, + "sort": [{"updated_at": {"order": "desc", "unmapped_type": "date"}}], + } + async with httpx.AsyncClient(timeout=30) as h: + r = await h.post(f"{ES_URL}/{ES_INDEX}/_search", json=query) + r.raise_for_status() + return r.json().get("hits", {}).get("hits", []) + + +TASK_TRIGGER_RE = re.compile( + r"\b(" + r"please|can you|could you|need to|needs to|todo|to do|follow up|follow-up|" + r"review|fix|send|reply|schedule|book|call|remind|prepare|draft|submit|pay|" + r"deadline|due|let me know|check in|confirm|update me|get back to me" + r")\b", + flags=re.IGNORECASE, +) +TASK_DONE_RE = re.compile(r"\b(done|completed|closed|resolved|fixed)\b", flags=re.IGNORECASE) +DUE_HINT_RE = re.compile( + r"\b(" + r"today|tomorrow|tonight|next week|next month|monday|tuesday|wednesday|thursday|friday|saturday|sunday|" + r"\d{4}-\d{2}-\d{2}|" + r"\d{1,2}:\d{2}" + r")\b", + flags=re.IGNORECASE, +) + +TASK_AI_CACHE_TTL_SEC = int(os.getenv("TASK_AI_CACHE_TTL_SEC", "3600")) +TASK_AI_CACHE_MAX_SIZE = int(os.getenv("TASK_AI_CACHE_MAX_SIZE", "5000")) +TASK_AI_CACHE: Dict[str, Dict[str, Any]] = {} +ASSISTANT_CHAT_MAX_TURNS = int(os.getenv("ASSISTANT_CHAT_MAX_TURNS", "20")) +ASSISTANT_CHAT_SESSIONS: Dict[str, List[Dict[str, str]]] = {} + + +def _split_sentences(text: str) -> List[str]: + raw_parts = re.split(r"[\n\r]+|(?<=[.!?])\s+", text or "") + return [p.strip() for p in raw_parts if p and p.strip()] + + +def _extract_due_hint(text: str) -> Optional[str]: + m = DUE_HINT_RE.search(text or "") + if not m: + return None + return m.group(1) + + +def _extract_who(text: str, default_sender: Optional[str]) -> Optional[str]: + m = re.search(r"\b(?:to|for)\s+([A-Z][a-zA-Z0-9_-]{1,40})\b", text or "") + if m: + return m.group(1) + return default_sender or None + + +def extract_pending_tasks_from_source(src: Dict[str, Any]) -> List[Dict[str, Any]]: + text = str(src.get("text") or src.get("description") or src.get("summary") or "").strip() + if not text: + return [] + + attrs_raw = src.get("attributes_json") + attrs: Dict[str, Any] = {} + if isinstance(attrs_raw, str) and attrs_raw.strip(): + try: + parsed = json.loads(attrs_raw) + if isinstance(parsed, dict): + attrs = parsed + except Exception: + attrs = {} + elif isinstance(attrs_raw, dict): + attrs = attrs_raw + + thread_id = attrs.get("thread_id") + message_id = attrs.get("message_id") or src.get("source_pk") + sender = attrs.get("sender") + sent_at = attrs.get("sent_at") + base_source = f"{src.get('concept_id') or ''}|{message_id or ''}" + + tasks: List[Dict[str, Any]] = [] + for sentence in _split_sentences(text): + if len(sentence) < 8: + continue + looks_like_question = "?" 
in sentence + if not TASK_TRIGGER_RE.search(sentence) and not looks_like_question: + continue + + status = "done" if TASK_DONE_RE.search(sentence) else "pending" + due_hint = _extract_due_hint(sentence) + who = _extract_who(sentence, sender) + task_id = "task-" + hashlib.sha256(f"{base_source}|{sentence.lower()}".encode("utf-8")).hexdigest()[:16] + tasks.append( + { + "task_id": task_id, + "status": status, + "todo": sentence[:400], + "due_hint": due_hint, + "who": who, + "concept_id": src.get("concept_id"), + "source_pk": src.get("source_pk"), + "source_table": src.get("source_table"), + "release_name": src.get("release_name"), + "thread_id": thread_id, + "message_id": message_id, + "sender": sender, + "sent_at": sent_at, + "updated_at": src.get("updated_at"), + } + ) + return tasks + + +def _task_ai_cache_key(src: Dict[str, Any]) -> str: + text = str(src.get("text") or src.get("description") or src.get("summary") or "") + text_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()[:24] + base = "|".join( + [ + str(src.get("concept_id") or ""), + str(src.get("source_pk") or ""), + str(src.get("updated_at") or ""), + text_hash, + OLLAMA_MODEL, + ] + ) + return hashlib.sha256(base.encode("utf-8")).hexdigest() + + +def _task_ai_cache_get(key: str) -> Optional[List[Dict[str, Any]]]: + obj = TASK_AI_CACHE.get(key) + if not obj: + return None + expires_at = float(obj.get("expires_at") or 0) + if expires_at <= time.time(): + TASK_AI_CACHE.pop(key, None) + return None + tasks = obj.get("tasks") + if not isinstance(tasks, list): + return None + return tasks + + +def _task_ai_cache_set(key: str, tasks: List[Dict[str, Any]]) -> None: + # Keep cache bounded in a simple way by evicting oldest-expiry items first. + if len(TASK_AI_CACHE) >= TASK_AI_CACHE_MAX_SIZE: + keys_sorted = sorted(TASK_AI_CACHE.items(), key=lambda kv: float(kv[1].get("expires_at") or 0)) + trim_count = max(1, TASK_AI_CACHE_MAX_SIZE // 10) + for k, _ in keys_sorted[:trim_count]: + TASK_AI_CACHE.pop(k, None) + TASK_AI_CACHE[key] = { + "expires_at": time.time() + TASK_AI_CACHE_TTL_SEC, + "tasks": tasks, + } + + +def build_task_extraction_prompt(src: Dict[str, Any]) -> str: + text = str(src.get("text") or src.get("description") or src.get("summary") or "")[:4000] + attrs_raw = src.get("attributes_json") + attrs = attrs_raw if isinstance(attrs_raw, str) else json.dumps(attrs_raw or {}, ensure_ascii=False) + return ( + "Extract actionable tasks from this message. 
Ignore pure marketing/promotional content.\n" + "Return strict JSON only with shape:\n" + '{"tasks":[{"todo":"...","status":"pending|done","due_hint":"...|null","who":"...|null"}]}\n' + "If no actionable tasks, return {\"tasks\":[]}.\n\n" + f"Message concept_id: {src.get('concept_id')}\n" + f"Source pk: {src.get('source_pk')}\n" + f"Attributes JSON: {attrs}\n" + f"Text:\n{text}\n" + ) + + +async def extract_pending_tasks_from_source_ai(src: Dict[str, Any]) -> List[Dict[str, Any]]: + prompt = build_task_extraction_prompt(src) + raw = await ollama_generate(prompt) + obj = _extract_json_object_from_text(raw) + task_items = obj.get("tasks") + if not isinstance(task_items, list): + return [] + + attrs_raw = src.get("attributes_json") + attrs: Dict[str, Any] = {} + if isinstance(attrs_raw, str) and attrs_raw.strip(): + try: + parsed = json.loads(attrs_raw) + if isinstance(parsed, dict): + attrs = parsed + except Exception: + attrs = {} + elif isinstance(attrs_raw, dict): + attrs = attrs_raw + + thread_id = attrs.get("thread_id") + message_id = attrs.get("message_id") or src.get("source_pk") + sender = attrs.get("sender") + sent_at = attrs.get("sent_at") + base_source = f"{src.get('concept_id') or ''}|{message_id or ''}" + out: List[Dict[str, Any]] = [] + for item in task_items: + if not isinstance(item, dict): + continue + todo = str(item.get("todo") or "").strip() + if len(todo) < 4: + continue + status_raw = str(item.get("status") or "pending").strip().lower() + status = "done" if status_raw == "done" else "pending" + due_hint = item.get("due_hint") + who = item.get("who") + todo_norm = todo[:400] + task_id = "task-" + hashlib.sha256(f"{base_source}|{todo_norm.lower()}".encode("utf-8")).hexdigest()[:16] + out.append( + { + "task_id": task_id, + "status": status, + "todo": todo_norm, + "due_hint": str(due_hint) if due_hint is not None else None, + "who": str(who) if who is not None else sender, + "concept_id": src.get("concept_id"), + "source_pk": src.get("source_pk"), + "source_table": src.get("source_table"), + "release_name": src.get("release_name"), + "thread_id": thread_id, + "message_id": message_id, + "sender": sender, + "sent_at": sent_at, + "updated_at": src.get("updated_at"), + } + ) + return out + + +def build_chat_prompt( + user_message: str, + history: List[Dict[str, str]], + source_docs: List[Dict[str, Any]], + release_name: Optional[str], +) -> str: + history_lines: List[str] = [] + for t in history[-ASSISTANT_CHAT_MAX_TURNS:]: + role = t.get("role", "") + content = (t.get("content", "") or "").strip() + if role in ("user", "assistant") and content: + history_lines.append(f"{role}: {content[:1200]}") + + context_chunks = [] + for d in source_docs: + src = d.get("_source", {}) or {} + context_chunks.append( + "\n".join( + [ + f"concept_id: {src.get('concept_id', '')}", + f"source_pk: {src.get('source_pk', '')}", + f"release_name: {src.get('release_name', '')}", + f"text: {str(src.get('text') or src.get('description') or src.get('summary') or '')[:1200]}", + ] + ) + ) + context = "\n\n---\n\n".join(context_chunks) if context_chunks else "No retrieved context." + hist = "\n".join(history_lines) if history_lines else "(none)" + + return ( + "You are a practical personal assistant. Be concise, factual, and useful.\n" + "Use retrieved context when available. 
If uncertain, say so briefly and ask one clarifying question.\n" + "Do not claim external actions were already performed.\n\n" + f"Release filter: {release_name or '(none)'}\n" + f"Conversation history:\n{hist}\n\n" + f"Retrieved context:\n{context}\n\n" + f"User: {user_message}\n" + "Assistant:" + ) + + +def _append_chat_turn(session_id: str, role: str, content: str) -> None: + turns = ASSISTANT_CHAT_SESSIONS.get(session_id, []) + turns.append({"role": role, "content": content}) + max_items = ASSISTANT_CHAT_MAX_TURNS * 2 + if len(turns) > max_items: + turns = turns[-max_items:] + ASSISTANT_CHAT_SESSIONS[session_id] = turns + + +def build_assistant_prompt(payload: AssistantDraftPayload, source_docs: List[Dict[str, Any]]) -> str: + recipient = payload.recipient or "unspecified recipient" + tone = payload.tone or "professional" + constraints = payload.constraints or [] + constraint_lines = "\n".join(f"- {c}" for c in constraints) if constraints else "- None" + + context_chunks = [] + for d in source_docs: + src = d.get("_source", {}) or {} + context_chunks.append( + "\n".join( + [ + f"concept_id: {src.get('concept_id', '')}", + f"concept_type: {src.get('concept_type', '')}", + f"source_pk: {src.get('source_pk', '')}", + f"source_table: {src.get('source_table', '')}", + f"release_name: {src.get('release_name', '')}", + f"text: {str(src.get('text') or src.get('description') or src.get('summary') or '')[:1000]}", + ] + ) + ) + context = "\n\n---\n\n".join(context_chunks) if context_chunks else "No retrieved context." + + return ( + "You are a careful personal assistant. Draft a response based only on provided context.\n" + "If context is missing, state uncertainty and ask concise follow-up questions.\n" + "Do not claim actions were sent; provide draft-only text.\n\n" + f"Task type: {payload.task_type}\n" + f"Goal: {payload.goal}\n" + f"Recipient: {recipient}\n" + f"Tone: {tone}\n" + f"Constraints:\n{constraint_lines}\n\n" + "Retrieved context:\n" + f"{context}\n\n" + "Output only the draft text." + ) + + +async def ollama_generate(prompt: str) -> str: + async with httpx.AsyncClient(timeout=90) as h: + r = await h.post( + f"{OLLAMA_URL}/api/generate", + json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False}, + ) + r.raise_for_status() + return (r.json().get("response") or "").strip() + + +def fallback_draft_text(payload: AssistantDraftPayload) -> str: + recipient = payload.recipient or "there" + tone = (payload.tone or "").lower() + if payload.task_type == "message": + if "friendly" in tone: + return ( + f"Hi {recipient},\n\n" + "Thanks for your message. I received it and will follow up tomorrow.\n\n" + "Best," + ) + return ( + f"Hello {recipient},\n\n" + "I confirm receipt of your message and will follow up tomorrow.\n\n" + "Regards," + ) + return ( + "Draft:\n" + f"{payload.goal}\n\n" + "I can refine this further once retrieval/model services are available." 
+ ) + + +def _extract_json_object_from_text(text: str) -> Dict[str, Any]: + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end < start: + raise ValueError("No JSON object found in output") + candidate = text[start : end + 1] + obj = json.loads(candidate) + if not isinstance(obj, dict): + raise ValueError("Parsed value is not a JSON object") + return obj + + +def build_assistant_plan_prompt(payload: AssistantPlanPayload, source_docs: List[Dict[str, Any]]) -> str: + constraints = payload.constraints or [] + constraint_lines = "\n".join(f"- {c}" for c in constraints) if constraints else "- None" + context_chunks = [] + for d in source_docs: + src = d.get("_source", {}) or {} + context_chunks.append( + "\n".join( + [ + f"concept_id: {src.get('concept_id', '')}", + f"source_pk: {src.get('source_pk', '')}", + f"source_table: {src.get('source_table', '')}", + f"release_name: {src.get('release_name', '')}", + f"text: {str(src.get('text') or src.get('description') or src.get('summary') or '')[:600]}", + ] + ) + ) + context = "\n\n---\n\n".join(context_chunks) if context_chunks else "No retrieved context." + return ( + "You are a cautious personal assistant planner. Produce an execution plan only; do not execute anything.\n" + "Return valid JSON ONLY with this exact shape:\n" + '{' + '"plan": [' + '{"step_id":"S1","title":"...","action_type":"research|draft|ask_user|prepare_data|review","requires_approval":true|false,"notes":"..."}' + "]" + "}\n" + f"Use at most {payload.max_steps} steps.\n" + "Prefer safe read-only and draft actions first.\n\n" + f"Task type: {payload.task_type}\n" + f"Objective: {payload.objective}\n" + f"Constraints:\n{constraint_lines}\n\n" + "Retrieved context:\n" + f"{context}\n" + ) + + +def fallback_plan(payload: AssistantPlanPayload) -> List[AssistantPlanStep]: + return [ + AssistantPlanStep( + step_id="S1", + title="Gather relevant facts and constraints", + action_type="research", + requires_approval=False, + notes="Review messages/concepts and identify required context.", + ), + AssistantPlanStep( + step_id="S2", + title="Draft a response or action proposal", + action_type="draft", + requires_approval=False, + notes="Produce a concise draft aligned with objective and constraints.", + ), + AssistantPlanStep( + step_id="S3", + title="Request user confirmation before any external action", + action_type="ask_user", + requires_approval=True, + notes="Do not send or execute changes until approved.", + ), + ][: payload.max_steps] + + +def find_plan_step(plan: List[AssistantPlanStep], step_id: str) -> Optional[AssistantPlanStep]: + for s in plan: + if s.step_id == step_id: + return s + return None + + +def is_high_risk_step(step: AssistantPlanStep) -> bool: + text = f"{step.title} {step.notes or ''}".lower() + high_risk_terms = [ + "send", + "submit", + "pay", + "payment", + "transfer", + "wire", + "sign", + "file", + "delete", + "close account", + "change account", + ] + return any(t in text for t in high_risk_terms) + + +def enforce_step_policy(payload: AssistantExecuteStepPayload, step: AssistantPlanStep) -> Optional[str]: + # Plan-declared approval gate. + if step.requires_approval and not payload.approved: + return "Step requires approval but approved=false." + # Extra hard gate for risky external actions. + if is_high_risk_step(step): + if not payload.approved: + return "High-risk step requires approved=true." 
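+        # Even with approved=true, a high-risk step stays blocked without an explicit
+        # manual_confirm_token (checked below).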
+ if not (payload.manual_confirm_token and payload.manual_confirm_token.strip()): + return "High-risk step requires manual_confirm_token." + return None + + +async def execute_plan_step(payload: AssistantExecuteStepPayload, step: AssistantPlanStep) -> Dict[str, Any]: + if step.action_type == "draft": + prompt = ( + "Draft concise text for this approved planning step.\n" + f"Task type: {payload.task_type}\n" + f"Objective: {payload.objective}\n" + f"Step: {step.title}\n" + f"Notes: {step.notes or ''}\n" + "Output only final draft text." + ) + try: + text = await ollama_generate(prompt) + if not text.strip(): + text = f"Draft for step '{step.title}'." + except Exception: + text = f"Draft for step '{step.title}'." + return {"draft": text} + if step.action_type == "research": + return {"note": "Research step acknowledged. Use /search or /assistant/draft for grounded retrieval."} + if step.action_type == "prepare_data": + return {"note": "Prepare-data step acknowledged.", "checklist": ["Collect required inputs", "Normalize format", "Validate completeness"]} + if step.action_type == "review": + return {"note": "Review step requires human review before external action."} + if step.action_type == "ask_user": + return {"question": "Please confirm whether to proceed with the next high-impact action."} + return {"note": "Step recognized but no executor implemented."} + +def _gremlin_submit_blocking(q: str, bindings: Dict[str, Any]) -> Any: + # Create/close client per call so transport loop is owned by this worker thread. + c = gremlin_client.Client( + GREMLIN_URL, + "g", + message_serializer=GraphSONSerializersV3d0(), + ) + try: + return c.submit(q, bindings).all().result() + finally: + c.close() + +async def gremlin_submit(q: str, bindings: Dict[str, Any]) -> Any: + return await asyncio.to_thread(_gremlin_submit_blocking, q, bindings) + + +async def run_remote_projector(payload: ProjectionTrigger) -> Dict[str, Any]: + parts = [ + PROJECTOR_REMOTE_SCRIPT, + "--release-name", payload.release_name, + "--targets", payload.targets, + ] + if payload.concept_table: + parts.extend(["--concept-table", payload.concept_table]) + if payload.dry_run: + parts.append("--dry-run") + + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Projector execution timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + spark_read_done = "[STEP] spark_read_done" in out + projection_done = "[STEP] projection_done" in out + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "spark_read_done": spark_read_done, + "projection_done": projection_done, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +def _b64(s: str) -> str: + return base64.b64encode(s.encode("utf-8")).decode("ascii") + + +async def run_remote_ingest_message(payload: MessageIngestPayload) -> Dict[str, Any]: + sent_at = payload.sent_at or "" + parts = [ + 
INGEST_MESSAGE_REMOTE_SCRIPT, + payload.table, + payload.thread_id, + payload.message_id, + payload.sender, + payload.channel, + sent_at, + _b64(payload.body), + _b64(json.dumps(payload.metadata, ensure_ascii=False)), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Message ingest execution timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +async def run_remote_ingest_messages_batch(payload: MessageIngestBatchPayload) -> Dict[str, Any]: + rows = [] + for m in payload.messages: + rows.append( + { + "thread_id": m.thread_id, + "message_id": m.message_id, + "sender": m.sender, + "channel": m.channel, + "sent_at": m.sent_at, + "body": m.body, + "metadata": m.metadata, + } + ) + if not rows: + return { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": 0, + "rows": 0, + "stdout_tail": "[INFO] No rows to ingest", + "stderr_tail": "", + } + + local_tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") + remote_tmp = f"{PROJECTOR_REMOTE_DIR}/.ingest_messages_{uuid.uuid4().hex}.json" + try: + json.dump(rows, local_tmp, ensure_ascii=False) + local_tmp.flush() + local_tmp.close() + + scp_target = f"{PROJECTOR_SSH_HOST}:{remote_tmp}" + scp_args = [PROJECTOR_SCP_BIN, *shlex.split(PROJECTOR_SCP_OPTS), local_tmp.name, scp_target] + scp_proc = await asyncio.create_subprocess_exec( + *scp_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + scp_stdout, scp_stderr = await asyncio.wait_for(scp_proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + scp_proc.kill() + await scp_proc.wait() + raise HTTPException(status_code=504, detail="Batch payload upload timed out") + if scp_proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": scp_proc.returncode, + "stdout_tail": _tail(scp_stdout.decode("utf-8", errors="replace")), + "stderr_tail": _tail(scp_stderr.decode("utf-8", errors="replace")), + }, + ) + + payload_arg = f"@{remote_tmp}" + parts = [ + INGEST_MESSAGES_BATCH_REMOTE_SCRIPT, + payload.table, + payload.dedupe_mode, + payload_arg, + ] + batch_cmd = " ".join(shlex.quote(p) for p in parts) + command = ( + f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && " + f"({batch_cmd}); rc=$?; rm -f {shlex.quote(remote_tmp)}; exit $rc" + ) + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except 
asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Batch message ingest execution timed out") + finally: + try: + os.unlink(local_tmp.name) + except Exception: + pass + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "rows": len(rows), + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +async def run_remote_assistant_feedback( + feedback_id: str, + payload: AssistantFeedbackPayload, +) -> Dict[str, Any]: + confidence = payload.confidence if payload.confidence is not None else 0.0 + parts = [ + ASSISTANT_FEEDBACK_REMOTE_SCRIPT, + feedback_id, + now_iso(), + payload.outcome, + payload.task_type, + payload.release_name or "", + f"{confidence}", + "true" if payload.needs_review else "false", + _b64(payload.goal or ""), + _b64(payload.draft), + _b64(payload.final_text or ""), + _b64(json.dumps([s.model_dump() for s in payload.sources], ensure_ascii=False)), + _b64(payload.notes or ""), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Assistant feedback execution timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +def _extract_json_array_from_text(text: str) -> List[Dict[str, Any]]: + start = text.find("[") + end = text.rfind("]") + if start == -1 or end == -1 or end < start: + raise ValueError("No JSON array found in output") + candidate = text[start : end + 1] + obj = json.loads(candidate) + if not isinstance(obj, list): + raise ValueError("Parsed value is not a JSON array") + out: List[Dict[str, Any]] = [] + for item in obj: + if isinstance(item, dict): + out.append(item) + return out + + +async def run_remote_query_assistant_feedback( + outcome: Optional[str], + task_type: Optional[str], + release_name: Optional[str], + limit: int, +) -> Dict[str, Any]: + parts = [ + ASSISTANT_FEEDBACK_QUERY_REMOTE_SCRIPT, + outcome or "", + task_type or "", + release_name or "", + str(limit), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Assistant feedback query timed out") + + out 
= stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + if proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + try: + rows = _extract_json_array_from_text(out) + except Exception as e: + raise HTTPException( + status_code=502, + detail={ + "message": f"Unable to parse feedback query output: {e}", + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + return { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "rows": rows, + } + + +async def run_remote_query_assistant_metrics( + task_type: Optional[str], + release_name: Optional[str], + outcome: Optional[str], + group_by: str, + limit: int, +) -> Dict[str, Any]: + parts = [ + ASSISTANT_METRICS_QUERY_REMOTE_SCRIPT, + task_type or "", + release_name or "", + outcome or "", + group_by, + str(limit), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Assistant metrics query timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + if proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + try: + rows = _extract_json_array_from_text(out) + except Exception as e: + raise HTTPException( + status_code=502, + detail={ + "message": f"Unable to parse metrics query output: {e}", + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + return { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "rows": rows, + } + + +async def run_remote_assistant_action( + action_id: str, + payload: AssistantExecuteStepPayload, + step: AssistantPlanStep, + status: str, + output_json: Dict[str, Any], + error_text: Optional[str], +) -> Dict[str, Any]: + parts = [ + ASSISTANT_ACTION_REMOTE_SCRIPT, + action_id, + now_iso(), + payload.task_type, + payload.release_name or "", + _b64(payload.objective), + step.step_id, + _b64(step.title), + step.action_type, + "true" if step.requires_approval else "false", + "true" if payload.approved else "false", + status, + _b64(json.dumps(output_json, ensure_ascii=False)), + _b64(error_text or ""), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Assistant action logging timed out") + + out = stdout.decode("utf-8", errors="replace") + err = 
stderr.decode("utf-8", errors="replace") + result = { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + } + if proc.returncode != 0: + raise HTTPException(status_code=502, detail=result) + return result + + +async def run_remote_query_assistant_actions( + status: Optional[str], + task_type: Optional[str], + release_name: Optional[str], + step_id: Optional[str], + action_type: Optional[str], + limit: int, +) -> Dict[str, Any]: + parts = [ + ASSISTANT_ACTIONS_QUERY_REMOTE_SCRIPT, + status or "", + task_type or "", + release_name or "", + step_id or "", + action_type or "", + str(limit), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise HTTPException(status_code=504, detail="Assistant actions query timed out") + + out = stdout.decode("utf-8", errors="replace") + err = stderr.decode("utf-8", errors="replace") + if proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "exit_code": proc.returncode, + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + try: + rows = _extract_json_array_from_text(out) + except Exception as e: + raise HTTPException( + status_code=502, + detail={ + "message": f"Unable to parse actions query output: {e}", + "stdout_tail": _tail(out), + "stderr_tail": _tail(err), + }, + ) + return { + "host": PROJECTOR_SSH_HOST, + "remote_dir": PROJECTOR_REMOTE_DIR, + "rows": rows, + } + + +async def run_remote_record_run( + run_id: str, + run_type: str, + status: str, + started_at_utc: str, + finished_at_utc: str, + actor: str, + input_json: Dict[str, Any], + output_json: Optional[Dict[str, Any]], + error_text: Optional[str], +) -> None: + parts = [ + RUNS_REMOTE_SCRIPT, + run_id, + run_type, + status, + started_at_utc, + finished_at_utc, + actor, + _b64(json.dumps(input_json, ensure_ascii=False)), + _b64(json.dumps(output_json, ensure_ascii=False) if output_json is not None else ""), + _b64(error_text or ""), + ] + command = f"cd {shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + if proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "message": "Failed to record run in Iceberg", + "host": PROJECTOR_SSH_HOST, + "exit_code": proc.returncode, + "stdout_tail": _tail(stdout.decode("utf-8", errors="replace")), + "stderr_tail": _tail(stderr.decode("utf-8", errors="replace")), + }, + ) + + +async def run_remote_record_event( + run_id: str, + event_type: str, + detail_json: Dict[str, Any], +) -> None: + parts = [ + RUN_EVENTS_REMOTE_SCRIPT, + run_id, + event_type, + now_iso(), + _b64(json.dumps(detail_json, ensure_ascii=False)), + ] + command = f"cd 
{shlex.quote(PROJECTOR_REMOTE_DIR)} && {' '.join(shlex.quote(p) for p in parts)}" + ssh_args = [PROJECTOR_SSH_BIN, *shlex.split(PROJECTOR_SSH_OPTS), PROJECTOR_SSH_HOST, command] + proc = await asyncio.create_subprocess_exec( + *ssh_args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=PROJECTOR_TIMEOUT_SEC) + if proc.returncode != 0: + raise HTTPException( + status_code=502, + detail={ + "message": "Failed to record run event in Iceberg", + "host": PROJECTOR_SSH_HOST, + "exit_code": proc.returncode, + "stdout_tail": _tail(stdout.decode("utf-8", errors="replace")), + "stderr_tail": _tail(stderr.decode("utf-8", errors="replace")), + }, + ) + + +async def record_event_best_effort(run_id: str, event_type: str, detail_json: Dict[str, Any]) -> None: + try: + await run_remote_record_event(run_id, event_type, detail_json) + except Exception as e: + # Event tracing must never break the primary projection flow. + print(f"[WARN] run event logging failed: run_id={run_id} event={event_type} error={e}") + + +# --------- routes --------- +@app.on_event("startup") +async def startup(): + await es_ensure_index() + + +@app.get("/ui", include_in_schema=False) +async def assistant_ui(): + index_html = UI_DIR / "index.html" + if not index_html.exists(): + raise HTTPException(status_code=404, detail="UI not found") + html = index_html.read_text(encoding="utf-8") + css_path = UI_ASSETS_DIR / "styles.css" + js_path = UI_ASSETS_DIR / "app.js" + css = css_path.read_text(encoding="utf-8") if css_path.exists() else "" + js = js_path.read_text(encoding="utf-8") if js_path.exists() else "" + html = html.replace( + '', + f"", + ) + html = html.replace( + '', + f"", + ) + return HTMLResponse(content=html) + + +@app.post("/concepts", response_model=ConceptOut) +async def create_concept(payload: ConceptCreate): + created = now_iso() + updated = created + fingerprint = make_fingerprint(payload.canonical_name, payload.kind, payload.external_ids) + + # Store long content in IPFS (version 1) + content_doc = { + "canonical_name": payload.canonical_name, + "kind": payload.kind, + "aliases": payload.aliases, + "external_ids": payload.external_ids, + "tags": payload.tags, + "description": payload.description, + "created_at": created, + } + latest_cid = await ipfs_add_json(content_doc) + + # LLM extras (optional) + summary = None + embedding: List[float] = [] + base_text = payload.description or payload.canonical_name + if base_text.strip(): + try: + summary = await ollama_summary(base_text) + except Exception: + summary = None + try: + embedding = await ollama_embed(base_text) + except Exception: + embedding = [] + + # Create vertex in JanusGraph (Concept) + # Uses a stable concept_id as your canonical handle + concept_id = (await gremlin_submit( + """ + import java.util.UUID + def id = UUID.randomUUID().toString() + g.addV('Concept') + .property('concept_id', id) + .property('canonical_name', canonical_name) + .property('kind', kind) + .property('aliases', aliases_json) + .property('external_ids', external_ids_json) + .property('tags', tags_json) + .property('fingerprint', fingerprint) + .property('latest_cid', latest_cid) + .property('summary', summary) + .property('created_at', created_at) + .property('updated_at', updated_at) + .values('concept_id') + """, + { + "canonical_name": payload.canonical_name, + "kind": payload.kind or "", + "aliases_json": json.dumps(payload.aliases, ensure_ascii=False), + "external_ids_json": 
json.dumps(payload.external_ids, ensure_ascii=False), + "tags_json": json.dumps(payload.tags, ensure_ascii=False), + "fingerprint": fingerprint, + "latest_cid": latest_cid, + "summary": summary or "", + "created_at": created, + "updated_at": updated, + }, + ))[0] + + out = { + "concept_id": concept_id, + "canonical_name": payload.canonical_name, + "kind": payload.kind, + "aliases": payload.aliases, + "external_ids": payload.external_ids, + "tags": payload.tags, + "latest_cid": latest_cid, + "summary": summary, + "created_at": created, + "updated_at": updated, + } + + # Index in Elasticsearch + doc = dict(out) + doc["fingerprint"] = fingerprint + if embedding: + doc["embedding"] = embedding + await es_index(doc) + + return out + +@app.get("/concepts/{concept_id}", response_model=ConceptOut) +async def get_concept(concept_id: str): + rows = await gremlin_submit( + """ + g.V().hasLabel('Concept').has('concept_id', concept_id) + .project('concept_id','canonical_name','kind','aliases','external_ids','tags','latest_cid','summary','created_at','updated_at') + .by(values('concept_id')) + .by(values('canonical_name')) + .by(values('kind')) + .by(values('aliases')) + .by(values('external_ids')) + .by(values('tags')) + .by(values('latest_cid')) + .by(values('summary')) + .by(values('created_at')) + .by(values('updated_at')) + """, + {"concept_id": concept_id}, + ) + if not rows: + raise HTTPException(status_code=404, detail="Concept not found") + + r = rows[0] + external_ids = {} + try: + external_ids = json.loads(r.get("external_ids") or "{}") + except Exception: + external_ids = {} + + aliases = [] + try: + aliases = json.loads(r.get("aliases") or "[]") + if not isinstance(aliases, list): + aliases = [] + except Exception: + aliases = [] + + tags = [] + try: + tags = json.loads(r.get("tags") or "[]") + if not isinstance(tags, list): + tags = [] + except Exception: + tags = [] + + return { + "concept_id": r.get("concept_id"), + "canonical_name": r.get("canonical_name"), + "kind": (r.get("kind") or None), + "aliases": aliases, + "external_ids": external_ids, + "tags": tags, + "latest_cid": (r.get("latest_cid") or None), + "summary": (r.get("summary") or None), + "created_at": r.get("created_at"), + "updated_at": r.get("updated_at"), + } + +@app.get("/search") +async def search(q: str, size: int = 10): + results = await es_search(q, size=size) + return {"q": q, "results": results} + + +@app.get("/assistant/inbox") +async def assistant_inbox( + release_name: Optional[str] = None, + q: Optional[str] = None, + limit: int = 20, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + bounded_limit = max(1, min(limit, 200)) + hits = await es_recent_messages( + size=bounded_limit, + release_name=release_name, + q=q, + ) + rows: List[Dict[str, Any]] = [] + for h in hits: + src = h.get("_source", {}) or {} + rows.append( + { + "concept_id": src.get("concept_id"), + "source_pk": src.get("source_pk"), + "source_table": src.get("source_table"), + "release_name": src.get("release_name"), + "concept_type": src.get("concept_type") or src.get("kind"), + "display_name": src.get("display_name") or src.get("canonical_name"), + "text": src.get("text"), + "summary": src.get("summary"), + "description": src.get("description"), + "updated_at": src.get("updated_at"), + "score": float(h.get("_score")) if h.get("_score") is not None else None, + } + ) + return { + "count": len(rows), + "filters": { + "release_name": release_name, + "q": q, + "limit": bounded_limit, + }, + "rows": 
rows, + } + + +@app.get("/assistant/tasks") +async def assistant_tasks( + release_name: Optional[str] = None, + q: Optional[str] = None, + only_pending: bool = True, + use_ai: bool = True, + limit: int = 50, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + bounded_limit = max(1, min(limit, 500)) + # Pull more messages than final task limit because each message can yield 0..N tasks. + hits = await es_recent_messages( + size=min(1000, max(100, bounded_limit * 4)), + release_name=release_name, + q=q, + ) + + rows: List[Dict[str, Any]] = [] + seen_task_ids: set[str] = set() + ai_cache_hits = 0 + ai_calls = 0 + for h in hits: + src = h.get("_source", {}) or {} + extracted: List[Dict[str, Any]] = [] + if use_ai: + cache_key = _task_ai_cache_key(src) + cached = _task_ai_cache_get(cache_key) + if cached is not None: + ai_cache_hits += 1 + extracted = cached + else: + ai_calls += 1 + try: + extracted = await extract_pending_tasks_from_source_ai(src) + except Exception as e: + print(f"[WARN] assistant_tasks ai extraction failed: {e}") + extracted = [] + _task_ai_cache_set(cache_key, extracted) + if not extracted: + extracted = extract_pending_tasks_from_source(src) + for t in extracted: + if only_pending and t.get("status") != "pending": + continue + task_id = str(t.get("task_id") or "") + if not task_id or task_id in seen_task_ids: + continue + seen_task_ids.add(task_id) + rows.append(t) + if len(rows) >= bounded_limit: + break + if len(rows) >= bounded_limit: + break + + return { + "count": len(rows), + "filters": { + "release_name": release_name, + "q": q, + "only_pending": only_pending, + "use_ai": use_ai, + "limit": bounded_limit, + "ai_cache_ttl_sec": TASK_AI_CACHE_TTL_SEC, + }, + "stats": { + "messages_scanned": len(hits), + "ai_cache_hits": ai_cache_hits, + "ai_calls": ai_calls, + "ai_cache_size": len(TASK_AI_CACHE), + }, + "rows": rows, + } + + +@app.post("/assistant/learn") +async def assistant_learn(payload: AssistantLearnPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + now = now_iso() + note_id = "note-" + uuid.uuid4().hex[:16] + title = (payload.title or "").strip() + if not title: + title = payload.text.strip().splitlines()[0][:80] + summary = payload.text.strip()[:280] + + doc = { + "concept_id": f"note:{note_id}", + "concept_type": "note", + "display_name": title, + "description": summary, + "text": payload.text, + "source_table": "assistant.learn", + "source_pk": note_id, + "release_name": payload.release_name or "", + "ref_hash": "", + "attributes_json": json.dumps(payload.metadata or {}, ensure_ascii=False, sort_keys=True), + "canonical_name": title, + "kind": "note", + "aliases": [], + "external_ids": {}, + "tags": payload.tags or [], + "latest_cid": None, + "summary": summary, + "created_at": now, + "updated_at": now, + "fingerprint": make_fingerprint(title, "note", {}), + } + await es_index(doc) + return { + "stored": True, + "concept_id": doc["concept_id"], + "release_name": payload.release_name, + "title": title, + "tags": payload.tags, + } + + +@app.post("/assistant/chat", response_model=AssistantChatResponse) +async def assistant_chat(payload: AssistantChatPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + session_id = (payload.session_id or str(uuid.uuid4())).strip() + + history: List[Dict[str, str]] = [] + if payload.history: + history = [{"role": h.role, "content": h.content} for h in payload.history if 
h.content.strip()] + else: + history = ASSISTANT_CHAT_SESSIONS.get(session_id, []) + + hits: List[Dict[str, Any]] = [] + try: + hits = await es_search_hits( + q=payload.message, + size=payload.max_sources, + release_name=payload.release_name, + ) + except Exception as e: + print(f"[WARN] assistant_chat retrieval failed: {e}") + hits = [] + if not hits and payload.release_name: + try: + hits = await es_recent_by_release(payload.release_name, size=payload.max_sources) + except Exception as e: + print(f"[WARN] assistant_chat release fallback failed: {e}") + if not hits: + try: + hits = await es_recent_messages(size=payload.max_sources, release_name=payload.release_name, q=None) + except Exception as e: + print(f"[WARN] assistant_chat inbox fallback failed: {e}") + + prompt = build_chat_prompt( + user_message=payload.message, + history=history, + source_docs=hits, + release_name=payload.release_name, + ) + try: + answer = await ollama_generate(prompt) + if not answer.strip(): + answer = "I don't have enough context to answer confidently. Can you share one more detail?" + except Exception as e: + print(f"[WARN] assistant_chat generation failed: {e}") + answer = "I could not generate a response right now. Please retry." + + sources: List[AssistantDraftSource] = [] + for h in hits: + src = h.get("_source", {}) or {} + sources.append( + AssistantDraftSource( + concept_id=str(src.get("concept_id") or ""), + source_pk=src.get("source_pk"), + source_table=src.get("source_table"), + release_name=src.get("release_name"), + score=float(h.get("_score")) if h.get("_score") is not None else None, + ) + ) + source_count = len([s for s in sources if s.concept_id]) + confidence = 0.35 + if source_count >= 5: + confidence = 0.85 + elif source_count >= 3: + confidence = 0.75 + elif source_count >= 1: + confidence = 0.6 + if len(answer.strip()) < 30: + confidence = min(confidence, 0.45) + + _append_chat_turn(session_id, "user", payload.message) + _append_chat_turn(session_id, "assistant", answer) + + return AssistantChatResponse( + session_id=session_id, + answer=answer, + sources=[s for s in sources if s.concept_id], + confidence=confidence, + release_name=payload.release_name, + ) + + +@app.post("/assistant/draft", response_model=AssistantDraftResponse) +async def assistant_draft(payload: AssistantDraftPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + retrieval_query = " ".join( + [ + payload.goal, + payload.recipient or "", + payload.task_type, + " ".join(payload.constraints), + ] + ).strip() + try: + hits = await es_search_hits( + q=retrieval_query, + size=payload.max_sources, + release_name=payload.release_name, + ) + except Exception as e: + print(f"[WARN] assistant_draft retrieval failed: {e}") + hits = [] + if not hits and payload.release_name: + try: + hits = await es_recent_by_release(payload.release_name, size=payload.max_sources) + except Exception as e: + print(f"[WARN] assistant_draft release fallback retrieval failed: {e}") + if not hits and payload.release_name: + try: + hits = await es_recent_messages( + size=payload.max_sources, + release_name=payload.release_name, + q=None, + ) + except Exception as e: + print(f"[WARN] assistant_draft inbox fallback retrieval failed: {e}") + if not hits: + try: + hits = await es_recent_messages( + size=payload.max_sources, + release_name=None, + q=None, + ) + except Exception as e: + print(f"[WARN] assistant_draft global fallback retrieval failed: {e}") + + prompt = build_assistant_prompt(payload, hits) + 
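+    # Generation step: try the local Ollama model first and fall back to a
+    # deterministic template draft if the call fails or returns empty text.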
used_fallback = False + try: + draft = await ollama_generate(prompt) + if not draft.strip(): + used_fallback = True + draft = fallback_draft_text(payload) + except Exception as e: + print(f"[WARN] assistant_draft generation failed: {e}") + used_fallback = True + draft = fallback_draft_text(payload) + + sources: List[AssistantDraftSource] = [] + for h in hits: + src = h.get("_source", {}) or {} + sources.append( + AssistantDraftSource( + concept_id=str(src.get("concept_id") or ""), + source_pk=src.get("source_pk"), + source_table=src.get("source_table"), + release_name=src.get("release_name"), + score=float(h.get("_score")) if h.get("_score") is not None else None, + ) + ) + + source_count = len([s for s in sources if s.concept_id]) + if source_count >= 5: + confidence = 0.85 + elif source_count >= 3: + confidence = 0.75 + elif source_count >= 1: + confidence = 0.6 + else: + confidence = 0.35 + + if len(draft.strip()) < 40: + confidence = min(confidence, 0.45) + if used_fallback: + confidence = min(confidence, 0.4) + + source_ids = [s.concept_id for s in sources if s.concept_id] + citation_required = confidence < 0.75 or used_fallback + if citation_required and source_ids: + already_cited = any(cid in draft for cid in source_ids) + if not already_cited: + cited = ", ".join(source_ids[:3]) + draft = f"{draft.rstrip()}\n\nSources: {cited}" + + return AssistantDraftResponse( + task_type=payload.task_type, + draft=draft, + sources=[s for s in sources if s.concept_id], + confidence=confidence, + needs_review=True, + release_name=payload.release_name, + ) + + +@app.post("/assistant/plan", response_model=AssistantPlanResponse) +async def assistant_plan(payload: AssistantPlanPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + retrieval_query = " ".join([payload.objective, payload.task_type, " ".join(payload.constraints)]).strip() + try: + hits = await es_search_hits( + q=retrieval_query, + size=payload.max_sources, + release_name=payload.release_name, + ) + except Exception as e: + print(f"[WARN] assistant_plan retrieval failed: {e}") + hits = [] + if not hits and payload.release_name: + try: + hits = await es_recent_by_release(payload.release_name, size=payload.max_sources) + except Exception as e: + print(f"[WARN] assistant_plan release fallback retrieval failed: {e}") + + prompt = build_assistant_plan_prompt(payload, hits) + used_fallback = False + plan_steps: List[AssistantPlanStep] = [] + try: + raw = await ollama_generate(prompt) + parsed = _extract_json_object_from_text(raw) + raw_steps = parsed.get("plan") + if isinstance(raw_steps, list): + allowed_action_types = {"research", "draft", "ask_user", "prepare_data", "review"} + for i, s in enumerate(raw_steps[: payload.max_steps]): + if not isinstance(s, dict): + continue + step_id = str(s.get("step_id") or f"S{i+1}").strip() or f"S{i+1}" + title = str(s.get("title") or "").strip() + if not title: + continue + at = str(s.get("action_type") or "research").strip() + action_type = at if at in allowed_action_types else "research" + requires_approval = bool(s.get("requires_approval", False)) + notes = str(s.get("notes")).strip() if s.get("notes") is not None else None + plan_steps.append( + AssistantPlanStep( + step_id=step_id, + title=title, + action_type=action_type, # type: ignore[arg-type] + requires_approval=requires_approval, + notes=notes, + ) + ) + except Exception as e: + print(f"[WARN] assistant_plan generation failed: {e}") + used_fallback = True + + if not plan_steps: + used_fallback 
= True + plan_steps = fallback_plan(payload) + + sources: List[AssistantDraftSource] = [] + for h in hits: + src = h.get("_source", {}) or {} + sources.append( + AssistantDraftSource( + concept_id=str(src.get("concept_id") or ""), + source_pk=src.get("source_pk"), + source_table=src.get("source_table"), + release_name=src.get("release_name"), + score=float(h.get("_score")) if h.get("_score") is not None else None, + ) + ) + + source_count = len([s for s in sources if s.concept_id]) + if source_count >= 5: + confidence = 0.85 + elif source_count >= 3: + confidence = 0.75 + elif source_count >= 1: + confidence = 0.6 + else: + confidence = 0.35 + if used_fallback: + confidence = min(confidence, 0.45) + + return AssistantPlanResponse( + objective=payload.objective, + task_type=payload.task_type, + plan=plan_steps, + sources=[s for s in sources if s.concept_id], + needs_review=True, + confidence=confidence, + release_name=payload.release_name, + ) + + +@app.post("/assistant/execute-step", response_model=AssistantExecuteStepResponse) +async def assistant_execute_step(payload: AssistantExecuteStepPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + action_id = str(uuid.uuid4()) + started_at_utc = now_iso() + run_input = payload.model_dump() + await record_event_best_effort(action_id, "started", {"step_id": payload.step_id, "approved": payload.approved}) + + step = find_plan_step(payload.plan, payload.step_id) + if step is None: + error_text = f"Step '{payload.step_id}' not found in plan." + await run_remote_assistant_action( + action_id=action_id, + payload=payload, + step=AssistantPlanStep( + step_id=payload.step_id, + title="missing-step", + action_type="review", + requires_approval=True, + notes=error_text, + ), + status="blocked", + output_json={"reason": error_text}, + error_text=error_text, + ) + await run_remote_record_run( + run_id=action_id, + run_type="assistant_execute_step", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json={"reason": error_text}, + error_text=error_text, + ) + raise HTTPException(status_code=400, detail=error_text) + + policy_error = enforce_step_policy(payload, step) + if policy_error: + out = {"policy_blocked": True, "reason": policy_error} + await record_event_best_effort(action_id, "policy_blocked", {"step_id": step.step_id, "reason": policy_error}) + await run_remote_assistant_action( + action_id=action_id, + payload=payload, + step=step, + status="blocked", + output_json=out, + error_text=policy_error, + ) + await run_remote_record_run( + run_id=action_id, + run_type="assistant_execute_step", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=out, + error_text=policy_error, + ) + return AssistantExecuteStepResponse( + action_id=action_id, + step_id=step.step_id, + status="blocked", + output=out, + needs_review=True, + ) + + try: + output = await execute_plan_step(payload, step) + await record_event_best_effort(action_id, "executed", {"step_id": step.step_id, "action_type": step.action_type}) + await run_remote_assistant_action( + action_id=action_id, + payload=payload, + step=step, + status="executed", + output_json=output, + error_text=None, + ) + await run_remote_record_run( + run_id=action_id, + run_type="assistant_execute_step", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + 
actor="admin-api", + input_json=run_input, + output_json=output, + error_text=None, + ) + await record_event_best_effort(action_id, "recorded", {"status": "succeeded"}) + return AssistantExecuteStepResponse( + action_id=action_id, + step_id=step.step_id, + status="executed", + output=output, + needs_review=True, + ) + except Exception as e: + err_text = str(e) + await run_remote_assistant_action( + action_id=action_id, + payload=payload, + step=step, + status="blocked", + output_json={"error": err_text}, + error_text=err_text, + ) + await run_remote_record_run( + run_id=action_id, + run_type="assistant_execute_step", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(action_id, "recorded", {"status": "failed"}) + raise HTTPException(status_code=500, detail=err_text) + + +@app.post("/assistant/feedback") +async def assistant_feedback(payload: AssistantFeedbackPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + run_id = str(uuid.uuid4()) + run_input = payload.model_dump() + started_at_utc = now_iso() + await record_event_best_effort( + run_id, + "started", + {"input": {"outcome": payload.outcome, "task_type": payload.task_type, "release_name": payload.release_name}}, + ) + try: + result = await run_remote_assistant_feedback(run_id, payload) + await record_event_best_effort( + run_id, + "feedback_recorded", + {"outcome": payload.outcome, "task_type": payload.task_type}, + ) + await run_remote_record_run( + run_id=run_id, + run_type="assistant_feedback", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + run_type="assistant_feedback", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="assistant_feedback", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + + +@app.get("/assistant/feedback") +async def assistant_feedback_list( + outcome: Optional[Literal["accepted", "edited", "rejected"]] = None, + task_type: Optional[Literal["message", "finance", "gov", "general"]] = None, + release_name: Optional[str] = None, + limit: int = 50, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + bounded_limit = max(1, min(limit, 500)) + result = await run_remote_query_assistant_feedback( + outcome=outcome, + task_type=task_type, + release_name=release_name, + limit=bounded_limit, + ) + rows = result["rows"] + return { + "count": len(rows), + "filters": { + "outcome": outcome, + "task_type": task_type, + "release_name": release_name, + "limit": 
bounded_limit, + }, + "rows": rows, + } + + +@app.get("/assistant/metrics") +async def assistant_metrics( + task_type: Optional[Literal["message", "finance", "gov", "general"]] = None, + release_name: Optional[str] = None, + outcome: Optional[Literal["accepted", "edited", "rejected"]] = None, + group_by: Literal["task_type", "release_name", "both"] = "both", + limit: int = 100, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + bounded_limit = max(1, min(limit, 1000)) + result = await run_remote_query_assistant_metrics( + task_type=task_type, + release_name=release_name, + outcome=outcome, + group_by=group_by, + limit=bounded_limit, + ) + rows = result["rows"] + return { + "count": len(rows), + "filters": { + "task_type": task_type, + "release_name": release_name, + "outcome": outcome, + "group_by": group_by, + "limit": bounded_limit, + }, + "rows": rows, + } + + +@app.get("/assistant/actions") +async def assistant_actions( + status: Optional[Literal["blocked", "executed"]] = None, + task_type: Optional[Literal["message", "finance", "gov", "general"]] = None, + release_name: Optional[str] = None, + step_id: Optional[str] = None, + action_type: Optional[Literal["research", "draft", "ask_user", "prepare_data", "review"]] = None, + limit: int = 50, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + bounded_limit = max(1, min(limit, 500)) + result = await run_remote_query_assistant_actions( + status=status, + task_type=task_type, + release_name=release_name, + step_id=step_id, + action_type=action_type, + limit=bounded_limit, + ) + rows = result["rows"] + return { + "count": len(rows), + "filters": { + "status": status, + "task_type": task_type, + "release_name": release_name, + "step_id": step_id, + "action_type": action_type, + "limit": bounded_limit, + }, + "rows": rows, + } + + +@app.post("/admin/project-release") +async def project_release(payload: ProjectionTrigger, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + run_id = str(uuid.uuid4()) + run_input = payload.model_dump() + started_at_utc = now_iso() + await record_event_best_effort(run_id, "started", {"input": run_input}) + try: + result = await run_remote_projector(payload) + if result.get("spark_read_done"): + await record_event_best_effort(run_id, "spark_read_done", {"release_name": payload.release_name}) + if result.get("projection_done"): + await record_event_best_effort( + run_id, + "projection_done", + {"targets": payload.targets, "dry_run": payload.dry_run}, + ) + await run_remote_record_run( + run_id=run_id, + run_type="projection", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + run_type="projection", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="projection", + 
status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + + +@app.post("/admin/ingest-email-imap") +async def ingest_email_imap(payload: EmailImapIngestPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + run_id = str(uuid.uuid4()) + started_at_utc = now_iso() + run_input = payload.model_dump() + if "password" in run_input: + run_input["password"] = "***" + + effective_since_uid = payload.since_uid + if effective_since_uid is None and payload.incremental: + effective_since_uid = await run_remote_query_imap_checkpoint( + host=payload.host, + mailbox=payload.mailbox, + username=payload.username, + table=payload.table, + ) + if effective_since_uid is not None: + effective_search_criteria = f"UID {int(effective_since_uid) + 1}:*" + else: + effective_search_criteria = payload.search_criteria + + await record_event_best_effort( + run_id, + "started", + { + "input": { + "host": payload.host, + "mailbox": payload.mailbox, + "search_criteria": effective_search_criteria, + "since_uid": effective_since_uid, + "max_messages": payload.max_messages, + "table": payload.table, + "dedupe_mode": payload.dedupe_mode, + } + }, + ) + + try: + items = await asyncio.to_thread( + fetch_imap_messages_blocking, + payload, + effective_search_criteria, + effective_since_uid, + ) + max_uid_fetched: Optional[int] = None + for m in items: + uid_raw = m.metadata.get("imap_uid") + try: + uid_int = int(uid_raw) + except Exception: + continue + if max_uid_fetched is None or uid_int > max_uid_fetched: + max_uid_fetched = uid_int + batch_payload = MessageIngestBatchPayload( + table=payload.table, + dedupe_mode=payload.dedupe_mode, + messages=items, + ) + ingest_result = await run_remote_ingest_messages_batch(batch_payload) + result = { + "incremental": payload.incremental, + "since_uid": effective_since_uid, + "search_criteria_used": effective_search_criteria, + "max_uid_fetched": max_uid_fetched, + "fetched_messages": len(items), + "ingested_rows_requested": len(items), + "ingest_result": ingest_result, + } + await record_event_best_effort( + run_id, + "ingest_done", + {"fetched_messages": len(items), "table": payload.table, "max_uid_fetched": max_uid_fetched}, + ) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_email_imap", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_email_imap", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="ingest_email_imap", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await 
record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + + +@app.post("/admin/poll-and-project") +async def poll_and_project(payload: PollAndProjectPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + if payload.imap.table != "lake.db1.messages": + raise HTTPException( + status_code=400, + detail="poll-and-project currently supports only table lake.db1.messages", + ) + + run_id = str(uuid.uuid4()) + started_at_utc = now_iso() + run_input = payload.model_dump() + if run_input.get("imap", {}).get("password"): + run_input["imap"]["password"] = "***" + + await record_event_best_effort( + run_id, + "started", + { + "input": { + "imap": { + "host": payload.imap.host, + "mailbox": payload.imap.mailbox, + "incremental": payload.imap.incremental, + "since_uid": payload.imap.since_uid, + "max_messages": payload.imap.max_messages, + "table": payload.imap.table, + }, + "targets": payload.targets, + "dry_run": payload.dry_run, + "project_if_no_new": payload.project_if_no_new, + } + }, + ) + + try: + ingest = await ingest_email_imap(payload.imap, x_admin_api_key) + ingest_result = ingest["result"] + fetched_messages = int(ingest_result.get("fetched_messages", 0)) + await record_event_best_effort( + run_id, + "ingest_done", + { + "ingest_run_id": ingest.get("run_id"), + "fetched_messages": fetched_messages, + "max_uid_fetched": ingest_result.get("max_uid_fetched"), + }, + ) + + if fetched_messages <= 0 and not payload.project_if_no_new: + result = { + "ingest_run_id": ingest.get("run_id"), + "ingest_result": ingest_result, + "skipped": True, + "reason": "No new messages", + } + await run_remote_record_run( + run_id=run_id, + run_type="poll_and_project", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + + if payload.release_name: + release_name = payload.release_name + else: + ts = datetime.now(timezone.utc).strftime("%Y-%m-%d_%H%M%S") + release_name = f"{payload.release_prefix}_{ts}_messages-auto" + + create_result = await run_remote_create_messages_release(release_name) + await record_event_best_effort( + run_id, + "release_created", + {"release_name": release_name}, + ) + + projection = ProjectionTrigger( + release_name=release_name, + targets=payload.targets, + concept_table=payload.concept_table, + dry_run=payload.dry_run, + ) + projection_result = await run_remote_projector(projection) + await record_event_best_effort( + run_id, + "projection_done", + {"release_name": release_name, "targets": payload.targets}, + ) + + result = { + "ingest_run_id": ingest.get("run_id"), + "ingest_result": ingest_result, + "release": create_result, + "projection": projection_result, + "release_name": release_name, + "skipped": False, + } + await run_remote_record_run( + run_id=run_id, + run_type="poll_and_project", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + 
run_type="poll_and_project", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="poll_and_project", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + + +@app.post("/admin/ingest-message") +async def ingest_message(payload: MessageIngestPayload, x_admin_api_key: Optional[str] = Header(default=None)): + check_admin_api_key(x_admin_api_key) + run_id = str(uuid.uuid4()) + run_input = payload.model_dump() + started_at_utc = now_iso() + await record_event_best_effort( + run_id, + "started", + {"input": {"table": payload.table, "message_id": payload.message_id, "thread_id": payload.thread_id}}, + ) + try: + result = await run_remote_ingest_message(payload) + await record_event_best_effort( + run_id, + "ingest_done", + {"table": payload.table, "message_id": payload.message_id, "thread_id": payload.thread_id}, + ) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_message", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, "recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_message", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="ingest_message", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + + +@app.post("/admin/ingest-messages-batch") +async def ingest_messages_batch( + payload: MessageIngestBatchPayload, + x_admin_api_key: Optional[str] = Header(default=None), +): + check_admin_api_key(x_admin_api_key) + if not payload.messages: + raise HTTPException(status_code=400, detail="messages must contain at least 1 item") + + run_id = str(uuid.uuid4()) + run_input = payload.model_dump() + started_at_utc = now_iso() + await record_event_best_effort( + run_id, + "started", + {"input": {"table": payload.table, "rows": len(payload.messages)}}, + ) + try: + result = await run_remote_ingest_messages_batch(payload) + await record_event_best_effort( + run_id, + "ingest_done", + {"table": payload.table, "rows": len(payload.messages)}, + ) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_messages_batch", + status="succeeded", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=result, + error_text=None, + ) + await record_event_best_effort(run_id, 
"recorded", {"status": "succeeded"}) + return {"run_id": run_id, "result": result} + except HTTPException as e: + err_text = e.detail if isinstance(e.detail, str) else json.dumps(e.detail, ensure_ascii=False) + await run_remote_record_run( + run_id=run_id, + run_type="ingest_messages_batch", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=err_text, + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise + except Exception as e: + await run_remote_record_run( + run_id=run_id, + run_type="ingest_messages_batch", + status="failed", + started_at_utc=started_at_utc, + finished_at_utc=now_iso(), + actor="admin-api", + input_json=run_input, + output_json=None, + error_text=str(e), + ) + await record_event_best_effort(run_id, "recorded", {"status": "failed"}) + raise diff --git a/connectivity_check.py b/connectivity_check.py new file mode 100644 index 0000000..33ede62 --- /dev/null +++ b/connectivity_check.py @@ -0,0 +1,126 @@ +import os +import sys +import json +import requests +from dotenv import load_dotenv + +# Optional: only needed for Gremlin websocket test +try: + import websocket + HAS_WEBSOCKET = True +except ImportError: + HAS_WEBSOCKET = False + + +def ok(msg): + print(f"[ OK ] {msg}") + + +def fail(msg): + print(f"[FAIL] {msg}") + + +def load_env(): + load_dotenv() + ok("Loaded .env file") + + +def test_http(name, url, path="", method="GET", json_body=None): + full_url = url.rstrip("/") + path + try: + resp = requests.request( + method, + full_url, + json=json_body, + timeout=5, + ) + if resp.status_code < 400: + ok(f"{name} reachable ({resp.status_code}) → {full_url}") + return True + else: + fail(f"{name} error ({resp.status_code}) → {full_url}") + except Exception as e: + fail(f"{name} unreachable → {full_url} ({e})") + return False + + +def test_gremlin_ws(url): + if not HAS_WEBSOCKET: + fail("Gremlin test skipped (websocket-client not installed)") + return False + + try: + ws = websocket.create_connection(url, timeout=5) + ws.close() + ok(f"Gremlin websocket reachable → {url}") + return True + except Exception as e: + fail(f"Gremlin websocket unreachable → {url} ({e})") + return False + + +def main(): + load_env() + + GREMLIN_URL = os.getenv("GREMLIN_URL", "ws://localhost:8182/gremlin") + ES_URL = os.getenv("ES_URL", "http://localhost:9200") + ES_INDEX = os.getenv("ES_INDEX", "concepts") + IPFS_API = os.getenv("IPFS_API", "http://localhost:5001") + OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434") + OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1:8b") + OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text") + + print("\n=== Connectivity checks ===\n") + + # Gremlin + test_gremlin_ws(GREMLIN_URL) + + # Elasticsearch root + test_http("Elasticsearch", ES_URL) + + # Elasticsearch index existence + test_http( + "Elasticsearch index", + ES_URL, + path=f"/{ES_INDEX}", + method="HEAD", + ) + + # IPFS (Kubo) + test_http( + "IPFS API", + IPFS_API, + path="/api/v0/version", + method="POST", + ) + + # Ollama base + test_http( + "Ollama", + OLLAMA_URL, + path="/api/tags", + ) + + # Ollama model availability (best-effort) + try: + resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5) + models = [m["name"] for m in resp.json().get("models", [])] + + if OLLAMA_MODEL in models: + ok(f"Ollama model available → {OLLAMA_MODEL}") + else: + fail(f"Ollama model NOT found → {OLLAMA_MODEL}") + + if OLLAMA_EMBED_MODEL in models: + 
ok(f"Ollama embed model available → {OLLAMA_EMBED_MODEL}") + else: + fail(f"Ollama embed model NOT found → {OLLAMA_EMBED_MODEL}") + + except Exception as e: + fail(f"Ollama model check failed ({e})") + + print("\n=== Done ===\n") + + +if __name__ == "__main__": + main() diff --git a/create-messages-release-via-spark-container.sh b/create-messages-release-via-spark-container.sh new file mode 100755 index 0000000..5b8a15c --- /dev/null +++ b/create-messages-release-via-spark-container.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +RELEASE_NAME="${1:-rel_$(date -u +%Y-%m-%d)_messages-v1}" +TABLE="${MESSAGES_TABLE:-lake.db1.messages}" +MANIFEST_LOCAL="${2:-./manifests/${RELEASE_NAME}.json}" +DESCRIPTION="${RELEASE_DESCRIPTION:-Messages release for ${TABLE}}" +CREATED_BY="${RELEASE_CREATED_BY:-${USER:-unknown}}" +NESSIE_URI="${NESSIE_URI:-http://nessie:19120/api/v2}" +RELEASES_TABLE="${RELEASES_TABLE:-lake.db1.releases_v2}" + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" + +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./create_release_manifest.py}" +SCRIPT_REMOTE="/tmp/create_release_manifest.py" +MANIFEST_REMOTE="/tmp/${RELEASE_NAME}.json" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "create_release_manifest.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +mkdir -p "$(dirname "$MANIFEST_LOCAL")" + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --release-name "$RELEASE_NAME" \ + --table "$TABLE" \ + --nessie-uri "$NESSIE_URI" \ + --manifest-out "$MANIFEST_REMOTE" \ + --description "$DESCRIPTION" \ + --created-by "$CREATED_BY" \ + --releases-table "$RELEASES_TABLE" + +docker cp "$CONTAINER_NAME":"$MANIFEST_REMOTE" "$MANIFEST_LOCAL" + +echo "[DONE] Saved manifest: $MANIFEST_LOCAL" diff --git a/create-messages-table-via-spark-container.sh b/create-messages-table-via-spark-container.sh new file mode 100755 index 0000000..9cd958e --- /dev/null +++ b/create-messages-table-via-spark-container.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Creates Iceberg table for assistant message ingest. 
+# Default table: lake.db1.messages + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +MESSAGES_TABLE="${MESSAGES_TABLE:-lake.db1.messages}" + +SQL=" +CREATE NAMESPACE IF NOT EXISTS lake.db1; + +CREATE TABLE IF NOT EXISTS ${MESSAGES_TABLE} ( + thread_id STRING, + message_id STRING, + sender STRING, + channel STRING, + sent_at TIMESTAMP, + body STRING, + metadata_json STRING +) +USING iceberg +PARTITIONED BY (days(sent_at)); +" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-sql \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + -e "$SQL" diff --git a/create_release_manifest.py b/create_release_manifest.py new file mode 100644 index 0000000..168cb20 --- /dev/null +++ b/create_release_manifest.py @@ -0,0 +1,279 @@ +import argparse +import hashlib +import json +import os +import urllib.error +import urllib.parse +import urllib.request +from datetime import datetime, timezone + +from pyspark.sql import SparkSession +from pyspark.sql import types as T + + +def now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') + + +def http_json(method: str, url: str, payload: dict | None = None) -> dict: + data = json.dumps(payload).encode("utf-8") if payload is not None else None + req = urllib.request.Request(url, data=data, method=method) + req.add_header("Content-Type", "application/json") + with urllib.request.urlopen(req, timeout=30) as resp: + body = resp.read().decode("utf-8") + return json.loads(body) if body else {} + + +def get_ref(nessie_uri: str, ref_name: str) -> dict | None: + try: + return http_json("GET", f"{nessie_uri.rstrip('/')}/trees/{urllib.parse.quote(ref_name, safe='')}") + except urllib.error.HTTPError as e: + if e.code == 404: + return None + raise + + +def extract_ref_hash(ref_obj: dict) -> str: + # Nessie responses can vary by endpoint/version: + # - {"type":"BRANCH","name":"main","hash":"..."} + # - {"reference":{"type":"BRANCH","name":"main","hash":"..."}} + if isinstance(ref_obj.get("hash"), str) and ref_obj["hash"]: + return ref_obj["hash"] + reference = ref_obj.get("reference") + if isinstance(reference, dict) and isinstance(reference.get("hash"), str) and reference["hash"]: + return reference["hash"] + raise KeyError("hash") + + +def ensure_tag(nessie_uri: str, tag_name: str) -> dict: + existing = get_ref(nessie_uri, tag_name) + if existing is not None: + return existing + + main_ref = http_json("GET", f"{nessie_uri.rstrip('/')}/trees/main") + payload = { + "type": "BRANCH", + "name": "main", + "hash": extract_ref_hash(main_ref), + } + query = urllib.parse.urlencode({"name": tag_name, "type": "TAG"}) + http_json("POST", f"{nessie_uri.rstrip('/')}/trees?{query}", payload) + created = get_ref(nessie_uri, tag_name) + if created is None: + raise RuntimeError(f"Tag creation appeared to succeed but tag '{tag_name}' is not retrievable") + return created + + +def create_registry_table_if_missing(spark: SparkSession, releases_table: str) -> None: + spark.sql( + f""" + CREATE TABLE IF NOT EXISTS {releases_table} ( + release_name STRING, + ref_type STRING, + ref_name STRING, + ref_hash STRING, + 
created_at_utc STRING, + ingested_at_utc STRING, + table_identifier STRING, + snapshot_id BIGINT, + metadata_location STRING, + manifest_sha256 STRING, + manifest_json STRING + ) USING iceberg + """ + ) + + +def _to_utc_datetime(value: str): + # Accept ISO strings with 'Z' suffix. + return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(timezone.utc) + + +def _convert_value_for_type(field: T.StructField, value): + if value is None: + return None + dt = field.dataType + if isinstance(dt, T.StringType): + return str(value) + if isinstance(dt, T.LongType): + return int(value) + if isinstance(dt, T.IntegerType): + return int(value) + if isinstance(dt, T.ShortType): + return int(value) + if isinstance(dt, T.ByteType): + return int(value) + if isinstance(dt, T.BooleanType): + return bool(value) + if isinstance(dt, T.FloatType): + return float(value) + if isinstance(dt, T.DoubleType): + return float(value) + if isinstance(dt, T.TimestampType): + if isinstance(value, datetime): + return value + return _to_utc_datetime(str(value)) + if isinstance(dt, T.DateType): + if isinstance(value, datetime): + return value.date() + return _to_utc_datetime(str(value)).date() + # Leave unsupported/complex types as-is; Spark can still validate and fail clearly. + return value + + +def append_registry_row( + spark: SparkSession, + releases_table: str, + release_name: str, + ref_type: str, + ref_name: str, + ref_hash: str, + created_at_utc: str, + ingested_at_utc: str, + table_identifier: str, + snapshot_id: int, + metadata_location: str, + manifest_sha256: str, + manifest_json: str, + created_by: str, + description: str, +) -> None: + target_schema = spark.table(releases_table).schema + base_values = { + "release_name": release_name, + "ref_type": ref_type, + "ref_name": ref_name, + "ref_hash": ref_hash, + "created_at_utc": created_at_utc, + "ingested_at_utc": ingested_at_utc, + "table_identifier": table_identifier, + "snapshot_id": int(snapshot_id), + "metadata_location": metadata_location, + "manifest_sha256": manifest_sha256, + "manifest_json": manifest_json, + "created_by": created_by, + "description": description, + "release_description": description, + } + + row_values = [] + missing_required = [] + for field in target_schema.fields: + name = field.name + if name in base_values: + value = _convert_value_for_type(field, base_values[name]) + row_values.append(value) + continue + if field.nullable: + row_values.append(None) + continue + missing_required.append(name) + + if missing_required: + raise RuntimeError( + "Cannot append to registry table " + f"{releases_table}. 
Missing required columns with no known mapping: {', '.join(missing_required)}" + ) + + df = spark.createDataFrame([tuple(row_values)], schema=target_schema) + df.writeTo(releases_table).append() + + +def main() -> None: + p = argparse.ArgumentParser(description="Create a release tag + manifest + registry row for a table.") + p.add_argument("--release-name", required=True) + p.add_argument("--table", default="lake.db1.messages") + p.add_argument("--nessie-uri", default=os.getenv("NESSIE_URI", "http://nessie:19120/api/v2")) + p.add_argument("--manifest-out", required=True) + p.add_argument("--description", default="Messages release") + p.add_argument("--created-by", default=os.getenv("USER", "unknown")) + p.add_argument("--releases-table", default=os.getenv("RELEASES_TABLE", "lake.db1.releases_v2")) + p.add_argument("--skip-registry", action="store_true") + args = p.parse_args() + + created_at = now_iso() + tag_ref = ensure_tag(args.nessie_uri, args.release_name) + ref_hash = extract_ref_hash(tag_ref) + + spark = SparkSession.builder.appName("create-release-manifest").getOrCreate() + + snap_row = spark.sql( + f"SELECT snapshot_id FROM {args.table}.snapshots ORDER BY committed_at DESC LIMIT 1" + ).collect() + if not snap_row: + raise RuntimeError(f"No snapshots found for table {args.table}") + snapshot_id = int(snap_row[0]["snapshot_id"]) + + meta_row = spark.sql( + f"SELECT file AS metadata_location FROM {args.table}.metadata_log_entries ORDER BY timestamp DESC LIMIT 1" + ).collect() + if not meta_row: + raise RuntimeError(f"No metadata log entries found for table {args.table}") + metadata_location = str(meta_row[0]["metadata_location"]) + + manifest = { + "schema_version": "lakehouse-release-manifest/v1", + "release": { + "name": args.release_name, + "created_at_utc": created_at, + "created_by": args.created_by, + "description": args.description, + }, + "nessie": { + "uri": args.nessie_uri, + "ref": { + "type": "tag", + "name": args.release_name, + "hash": ref_hash, + }, + }, + "tables": [ + { + "identifier": args.table, + "format": "iceberg", + "current_snapshot_id": snapshot_id, + "metadata_location": metadata_location, + } + ], + } + + manifest_json = json.dumps(manifest, ensure_ascii=False, indent=2) + manifest_sha256 = hashlib.sha256(manifest_json.encode("utf-8")).hexdigest() + + os.makedirs(os.path.dirname(args.manifest_out) or ".", exist_ok=True) + with open(args.manifest_out, "w", encoding="utf-8") as f: + f.write(manifest_json) + + if not args.skip_registry: + create_registry_table_if_missing(spark, args.releases_table) + append_registry_row( + spark=spark, + releases_table=args.releases_table, + release_name=args.release_name, + ref_type="tag", + ref_name=args.release_name, + ref_hash=ref_hash, + created_at_utc=created_at, + ingested_at_utc=now_iso(), + table_identifier=args.table, + snapshot_id=snapshot_id, + metadata_location=metadata_location, + manifest_sha256=manifest_sha256, + manifest_json=manifest_json, + created_by=args.created_by, + description=args.description, + ) + + print(f"[INFO] release_name={args.release_name}") + print(f"[INFO] table={args.table}") + print(f"[INFO] ref_hash={ref_hash}") + print(f"[INFO] snapshot_id={snapshot_id}") + print(f"[INFO] manifest_out={args.manifest_out}") + if args.skip_registry: + print("[INFO] registry=skipped") + else: + print(f"[INFO] registry_table={args.releases_table}") + + +if __name__ == "__main__": + main() diff --git a/docker/projector/Dockerfile b/docker/projector/Dockerfile new file mode 100644 index 0000000..032e8f9 --- 
/dev/null +++ b/docker/projector/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.11-slim + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + SPARK_LOCAL_HOSTNAME=localhost \ + SPARK_LOCAL_IP=127.0.0.1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends default-jre-headless ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements-projector.txt /app/requirements-projector.txt +RUN pip install --upgrade pip && pip install -r /app/requirements-projector.txt + +COPY release_projector.py /app/release_projector.py + +ENTRYPOINT ["python", "/app/release_projector.py"] diff --git a/docker/projector/README.md b/docker/projector/README.md new file mode 100644 index 0000000..09489d5 --- /dev/null +++ b/docker/projector/README.md @@ -0,0 +1,41 @@ +# Projector Container + +Build on `lakehouse-core`: + +```bash +docker build -t jecio/release-projector:0.1 -f docker/projector/Dockerfile /tmp/jecio +``` + +Dry-run: + +```bash +docker run --rm --network host \ + -e NESSIE_URI=http://lakehouse-core:19120/api/v2 \ + -e NESSIE_WAREHOUSE=s3a://lakehouse/warehouse \ + -e S3_ENDPOINT=http://lakehouse-core:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -v /tmp:/work \ + jecio/release-projector:0.1 \ + --manifest-file /work/rel_2026-02-14_docs-v1.json \ + --concept-table lake.db1.docs \ + --dry-run +``` + +Publish projection: + +```bash +docker run --rm --network host \ + -e NESSIE_URI=http://lakehouse-core:19120/api/v2 \ + -e NESSIE_WAREHOUSE=s3a://lakehouse/warehouse \ + -e S3_ENDPOINT=http://lakehouse-core:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -e GREMLIN_URL=ws://janus.rakeroots.lan:8182/gremlin \ + -e ES_URL=http://janus.rakeroots.lan:9200 \ + -e ES_INDEX=concepts \ + -v /tmp:/work \ + jecio/release-projector:0.1 \ + --manifest-file /work/rel_2026-02-14_docs-v1.json \ + --concept-table lake.db1.docs +``` diff --git a/ingest-message-via-spark-container.sh b/ingest-message-via-spark-container.sh new file mode 100755 index 0000000..e4d4fa5 --- /dev/null +++ b/ingest-message-via-spark-container.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +TABLE="${1:-lake.db1.messages}" +THREAD_ID="${2:-}" +MESSAGE_ID="${3:-}" +SENDER="${4:-}" +CHANNEL="${5:-}" +SENT_AT="${6:-}" +BODY_B64="${7:-}" +METADATA_B64="${8:-}" + +if [[ -z "$THREAD_ID" || -z "$MESSAGE_ID" || -z "$SENDER" || -z "$CHANNEL" || -z "$BODY_B64" ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" + +BODY="$(printf '%s' "$BODY_B64" | base64 -d)" +METADATA_JSON="{}" +if [[ -n "$METADATA_B64" ]]; then + METADATA_JSON="$(printf '%s' "$METADATA_B64" | base64 -d)" +fi + +sql_escape() { + printf "%s" "$1" | sed "s/'/''/g" +} + +THREAD_ID_ESC="$(sql_escape "$THREAD_ID")" +MESSAGE_ID_ESC="$(sql_escape "$MESSAGE_ID")" +SENDER_ESC="$(sql_escape "$SENDER")" +CHANNEL_ESC="$(sql_escape "$CHANNEL")" +BODY_ESC="$(sql_escape "$BODY")" +METADATA_ESC="$(sql_escape "$METADATA_JSON")" + +if [[ -n "$SENT_AT" ]]; then + SENT_AT_EXPR="TIMESTAMP '$(sql_escape "$SENT_AT")'" +else + SENT_AT_EXPR="current_timestamp()" +fi + +SQL="INSERT INTO 
${TABLE} (thread_id, message_id, sender, channel, sent_at, body, metadata_json) VALUES ('${THREAD_ID_ESC}', '${MESSAGE_ID_ESC}', '${SENDER_ESC}', '${CHANNEL_ESC}', ${SENT_AT_EXPR}, '${BODY_ESC}', '${METADATA_ESC}')" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-sql \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + -e "$SQL" + +echo "[DONE] Inserted message_id=${MESSAGE_ID} thread_id=${THREAD_ID} into ${TABLE}" diff --git a/ingest-messages-batch-via-spark-container.sh b/ingest-messages-batch-via-spark-container.sh new file mode 100755 index 0000000..265d19e --- /dev/null +++ b/ingest-messages-batch-via-spark-container.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail + +TABLE="${1:-lake.db1.messages}" +DEDUPE_MODE="${2:-none}" +PAYLOAD_B64="${3:-}" + +if [[ -z "$PAYLOAD_B64" ]]; then + echo "Usage: $0 <table> <dedupe_mode> <payload_b64|@payload_file>
" >&2 + exit 1 +fi + +if [[ "$DEDUPE_MODE" != "none" && "$DEDUPE_MODE" != "message_id" && "$DEDUPE_MODE" != "thread_message" ]]; then + echo "Invalid dedupe_mode: $DEDUPE_MODE (expected none|message_id|thread_message)" >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" + +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./ingest_messages_batch.py}" +SCRIPT_REMOTE="/tmp/ingest_messages_batch.py" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "ingest_messages_batch.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +SPARK_ARGS=( + --table "$TABLE" + --dedupe-mode "$DEDUPE_MODE" +) + +if [[ "${PAYLOAD_B64:0:1}" == "@" ]]; then + PAYLOAD_FILE_HOST="${PAYLOAD_B64:1}" + if [[ ! -f "$PAYLOAD_FILE_HOST" ]]; then + echo "Payload file not found: $PAYLOAD_FILE_HOST" >&2 + exit 1 + fi + PAYLOAD_FILE_REMOTE="/opt/spark/work-dir/ingest_messages_payload.json" + docker cp "$PAYLOAD_FILE_HOST" "$CONTAINER_NAME":"$PAYLOAD_FILE_REMOTE" + # Ensure spark user can read the file regardless of ownership from docker cp. + docker exec -u 0 "$CONTAINER_NAME" /bin/sh -lc "chmod 644 '$PAYLOAD_FILE_REMOTE' || true" + SPARK_ARGS+=(--payload-file "$PAYLOAD_FILE_REMOTE") +else + SPARK_ARGS+=(--payload-b64 "$PAYLOAD_B64") +fi + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + "${SPARK_ARGS[@]}" diff --git a/ingest_messages_batch.py b/ingest_messages_batch.py new file mode 100644 index 0000000..a50fecd --- /dev/null +++ b/ingest_messages_batch.py @@ -0,0 +1,139 @@ +import argparse +import base64 +import json +from datetime import datetime, timezone +from typing import Any, Dict, List + +from pyspark.sql import SparkSession, types as T + + +def now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def decode_payload(payload_b64: str) -> List[Dict[str, Any]]: + raw = base64.b64decode(payload_b64.encode("ascii")).decode("utf-8") + data = json.loads(raw) + if not isinstance(data, list): + raise ValueError("Payload must decode to a JSON array") + out: List[Dict[str, Any]] = [] + for i, row in enumerate(data): + if not isinstance(row, dict): + raise ValueError(f"Row {i} must be a JSON object") + out.append(row) + return out + + +def normalize_rows(rows: List[Dict[str, Any]]) -> List[tuple]: + norm: List[tuple] = [] + for i, r in enumerate(rows): + thread_id = str(r.get("thread_id") or "").strip() + message_id = str(r.get("message_id") or "").strip() + sender = str(r.get("sender") or "").strip() + channel = str(r.get("channel") or "").strip() + body = str(r.get("body") or "").strip() + if not thread_id or not message_id or not sender or not channel or not body: + raise ValueError( + f"Row {i} missing required fields. 
" + "Required: thread_id, message_id, sender, channel, body" + ) + + sent_at_raw = r.get("sent_at") + sent_at = str(sent_at_raw).strip() if sent_at_raw is not None else "" + metadata = r.get("metadata", {}) + if not isinstance(metadata, dict): + metadata = {} + metadata_json = json.dumps(metadata, ensure_ascii=False, sort_keys=True) + norm.append((thread_id, message_id, sender, channel, sent_at, body, metadata_json)) + return norm + + +def main() -> None: + p = argparse.ArgumentParser(description="Batch ingest messages into Iceberg table") + p.add_argument("--table", required=True) + p.add_argument( + "--dedupe-mode", + choices=["none", "message_id", "thread_message"], + default="none", + help="Optional dedupe strategy against existing target rows", + ) + p.add_argument("--payload-b64") + p.add_argument("--payload-file") + args = p.parse_args() + + if not args.payload_b64 and not args.payload_file: + raise ValueError("Provide either --payload-b64 or --payload-file") + if args.payload_b64 and args.payload_file: + raise ValueError("Provide only one of --payload-b64 or --payload-file") + + if args.payload_file: + with open(args.payload_file, "r", encoding="utf-8") as f: + file_data = json.load(f) + if not isinstance(file_data, list): + raise ValueError("--payload-file must contain a JSON array") + rows = normalize_rows(file_data) + else: + rows = normalize_rows(decode_payload(args.payload_b64 or "")) + if not rows: + print("[INFO] No rows supplied; nothing to ingest.") + return + + spark = SparkSession.builder.appName("ingest-messages-batch").getOrCreate() + + schema = T.StructType( + [ + T.StructField("thread_id", T.StringType(), False), + T.StructField("message_id", T.StringType(), False), + T.StructField("sender", T.StringType(), False), + T.StructField("channel", T.StringType(), False), + T.StructField("sent_at_raw", T.StringType(), True), + T.StructField("body", T.StringType(), False), + T.StructField("metadata_json", T.StringType(), False), + ] + ) + df = spark.createDataFrame(rows, schema=schema) + df.createOrReplaceTempView("_batch_messages") + + base_select = """ + SELECT + b.thread_id, + b.message_id, + b.sender, + b.channel, + CASE + WHEN b.sent_at_raw IS NULL OR TRIM(b.sent_at_raw) = '' THEN current_timestamp() + ELSE CAST(b.sent_at_raw AS TIMESTAMP) + END AS sent_at, + b.body, + b.metadata_json + FROM _batch_messages b + """ + if args.dedupe_mode == "none": + insert_select = base_select + elif args.dedupe_mode == "message_id": + insert_select = ( + base_select + + f" LEFT ANTI JOIN {args.table} t ON b.message_id = t.message_id" + ) + else: + insert_select = ( + base_select + + f" LEFT ANTI JOIN {args.table} t ON b.thread_id = t.thread_id AND b.message_id = t.message_id" + ) + + spark.sql( + f""" + INSERT INTO {args.table} (thread_id, message_id, sender, channel, sent_at, body, metadata_json) + {insert_select} + """ + ) + + print(f"[INFO] rows_in={len(rows)}") + print(f"[INFO] dedupe_mode={args.dedupe_mode}") + print(f"[INFO] table={args.table}") + print(f"[INFO] ingested_at_utc={now_iso()}") + print(f"[DONE] Batch ingest finished for {args.table}") + + +if __name__ == "__main__": + main() diff --git a/manifests/rel_2026-02-14_docs-v1.json b/manifests/rel_2026-02-14_docs-v1.json new file mode 100644 index 0000000..8539695 --- /dev/null +++ b/manifests/rel_2026-02-14_docs-v1.json @@ -0,0 +1,42 @@ +{ + "schema_version": "lakehouse-release-manifest/v1", + "release": { + "name": "rel_2026-02-14_docs-v1", + "created_at_utc": "2026-02-14T09:48:38Z", + "created_by": "niklas", + 
"description": "First tagged release for lake.db1.docs" + }, + "nessie": { + "uri": "http://lakehouse-core:19120/api/v2", + "ref": { + "type": "tag", + "name": "rel_2026-02-14_docs-v1", + "hash": "1b16b4c4f6e99d43a27a21712aab319c1840a415f36bc6bebb2c9d2a89f09ef0" + } + }, + "warehouse": { + "bucket": "lakehouse", + "warehouse_path": "s3a://lakehouse/warehouse", + "s3_endpoint": "http://lakehouse-core:9000", + "region": "us-east-1" + }, + "tables": [ + { + "identifier": "lake.db1.docs", + "format": "iceberg", + "current_snapshot_id": 4212875880010474311, + "metadata_location": "s3a://lakehouse/warehouse/db1/docs_2693aab9-54ea-43a8-892b-a922fdfc063a/metadata/00001-64f23fb4-2cb3-45c5-9c20-e6c91c9d73ef.metadata.json" + } + ], + "projection": { + "enabled": false, + "projection_id": null, + "targets": [] + }, + "artifacts": { + "ipfs": { + "pinned": false, + "cid": null + } + } +} diff --git a/query-assistant-actions-via-spark-container.sh b/query-assistant-actions-via-spark-container.sh new file mode 100755 index 0000000..e900f5a --- /dev/null +++ b/query-assistant-actions-via-spark-container.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +STATUS="${1:-}" +TASK_TYPE="${2:-}" +RELEASE_NAME="${3:-}" +STEP_ID="${4:-}" +ACTION_TYPE="${5:-}" +LIMIT="${6:-50}" +ACTION_TABLE="${ACTION_TABLE:-lake.db1.assistant_actions}" + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./query_assistant_actions.py}" +SCRIPT_REMOTE="/tmp/query_assistant_actions.py" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "query_assistant_actions.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$ACTION_TABLE" \ + --status "$STATUS" \ + --task-type "$TASK_TYPE" \ + --release-name "$RELEASE_NAME" \ + --step-id "$STEP_ID" \ + --action-type "$ACTION_TYPE" \ + --limit "$LIMIT" diff --git a/query-assistant-feedback-via-spark-container.sh b/query-assistant-feedback-via-spark-container.sh new file mode 100755 index 0000000..4cc6e01 --- /dev/null +++ b/query-assistant-feedback-via-spark-container.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTCOME="${1:-}" +TASK_TYPE="${2:-}" +RELEASE_NAME="${3:-}" +LIMIT="${4:-50}" +FEEDBACK_TABLE="${FEEDBACK_TABLE:-lake.db1.assistant_feedback}" + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./query_assistant_feedback.py}" +SCRIPT_REMOTE="/tmp/query_assistant_feedback.py" + +if [[ ! 
-f "$SCRIPT_LOCAL" ]]; then + echo "query_assistant_feedback.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$FEEDBACK_TABLE" \ + --outcome "$OUTCOME" \ + --task-type "$TASK_TYPE" \ + --release-name "$RELEASE_NAME" \ + --limit "$LIMIT" diff --git a/query-assistant-metrics-via-spark-container.sh b/query-assistant-metrics-via-spark-container.sh new file mode 100755 index 0000000..4e07cb6 --- /dev/null +++ b/query-assistant-metrics-via-spark-container.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +TASK_TYPE="${1:-}" +RELEASE_NAME="${2:-}" +OUTCOME="${3:-}" +GROUP_BY="${4:-both}" +LIMIT="${5:-100}" +FEEDBACK_TABLE="${FEEDBACK_TABLE:-lake.db1.assistant_feedback}" + +if [[ "$GROUP_BY" != "task_type" && "$GROUP_BY" != "release_name" && "$GROUP_BY" != "both" ]]; then + echo "Invalid group_by: $GROUP_BY (expected task_type|release_name|both)" >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./query_assistant_metrics.py}" +SCRIPT_REMOTE="/tmp/query_assistant_metrics.py" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "query_assistant_metrics.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$FEEDBACK_TABLE" \ + --task-type "$TASK_TYPE" \ + --release-name "$RELEASE_NAME" \ + --outcome "$OUTCOME" \ + --group-by "$GROUP_BY" \ + --limit "$LIMIT" diff --git a/query-imap-checkpoint-via-spark-container.sh b/query-imap-checkpoint-via-spark-container.sh new file mode 100755 index 0000000..2a084af --- /dev/null +++ b/query-imap-checkpoint-via-spark-container.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -euo pipefail + +HOST="${1:-}" +MAILBOX="${2:-}" +USERNAME="${3:-}" +TABLE="${4:-lake.db1.messages}" + +if [[ -z "$HOST" || -z "$MAILBOX" || -z "$USERNAME" ]]; then + echo "Usage: $0 [table]" >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./query_imap_checkpoint.py}" +SCRIPT_REMOTE="/tmp/query_imap_checkpoint.py" + +if [[ ! 
-f "$SCRIPT_LOCAL" ]]; then + echo "query_imap_checkpoint.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$TABLE" \ + --host "$HOST" \ + --mailbox "$MAILBOX" \ + --username "$USERNAME" diff --git a/query_assistant_actions.py b/query_assistant_actions.py new file mode 100644 index 0000000..d107f79 --- /dev/null +++ b/query_assistant_actions.py @@ -0,0 +1,45 @@ +import argparse +import json +import os + +from pyspark.sql import SparkSession +from pyspark.sql import functions as F + + +def main() -> None: + p = argparse.ArgumentParser(description="Query assistant actions") + p.add_argument("--table", default=os.getenv("ACTION_TABLE", "lake.db1.assistant_actions")) + p.add_argument("--status", default="") + p.add_argument("--task-type", default="") + p.add_argument("--release-name", default="") + p.add_argument("--step-id", default="") + p.add_argument("--action-type", default="") + p.add_argument("--limit", type=int, default=50) + args = p.parse_args() + + spark = SparkSession.builder.appName("query-assistant-actions").getOrCreate() + df = spark.table(args.table) + + if args.status: + df = df.where(F.col("status") == args.status) + if args.task_type: + df = df.where(F.col("task_type") == args.task_type) + if args.release_name: + df = df.where(F.col("release_name") == args.release_name) + if args.step_id: + df = df.where(F.col("step_id") == args.step_id) + if args.action_type: + df = df.where(F.col("action_type") == args.action_type) + + rows = ( + df.orderBy(F.col("created_at_utc").desc_nulls_last()) + .limit(max(1, min(args.limit, 500))) + .collect() + ) + + out = [r.asDict(recursive=True) for r in rows] + print(json.dumps(out, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/query_assistant_feedback.py b/query_assistant_feedback.py new file mode 100644 index 0000000..d40c782 --- /dev/null +++ b/query_assistant_feedback.py @@ -0,0 +1,43 @@ +import argparse +import json +import os + +from pyspark.sql import SparkSession +from pyspark.sql import functions as F + + +def main() -> None: + p = argparse.ArgumentParser(description="Query assistant feedback rows") + p.add_argument("--table", default=os.getenv("FEEDBACK_TABLE", "lake.db1.assistant_feedback")) + p.add_argument("--outcome", default="") + p.add_argument("--task-type", default="") + p.add_argument("--release-name", default="") + p.add_argument("--limit", type=int, default=50) + args = p.parse_args() + + spark = SparkSession.builder.appName("query-assistant-feedback").getOrCreate() + df = spark.table(args.table) + + if args.outcome: + df = df.where(F.col("outcome") == args.outcome) + if args.task_type: + df = df.where(F.col("task_type") == args.task_type) + if args.release_name: + df = df.where(F.col("release_name") == args.release_name) + + rows = ( + df.orderBy(F.col("created_at_utc").desc_nulls_last()) + .limit(max(1, min(args.limit, 500))) + .collect() + ) + + out = [] + for r in rows: + item = r.asDict(recursive=True) + out.append(item) + + print(json.dumps(out, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/query_assistant_metrics.py b/query_assistant_metrics.py new file mode 100644 index 0000000..04c0cd5 --- /dev/null +++ 
b/query_assistant_metrics.py @@ -0,0 +1,57 @@ +import argparse +import json +import os + +from pyspark.sql import SparkSession +from pyspark.sql import functions as F + + +def main() -> None: + p = argparse.ArgumentParser(description="Query assistant feedback metrics") + p.add_argument("--table", default=os.getenv("FEEDBACK_TABLE", "lake.db1.assistant_feedback")) + p.add_argument("--task-type", default="") + p.add_argument("--release-name", default="") + p.add_argument("--outcome", default="") + p.add_argument("--group-by", choices=["task_type", "release_name", "both"], default="both") + p.add_argument("--limit", type=int, default=100) + args = p.parse_args() + + spark = SparkSession.builder.appName("query-assistant-metrics").getOrCreate() + df = spark.table(args.table) + + if args.task_type: + df = df.where(F.col("task_type") == args.task_type) + if args.release_name: + df = df.where(F.col("release_name") == args.release_name) + if args.outcome: + df = df.where(F.col("outcome") == args.outcome) + + if args.group_by == "task_type": + group_cols = [F.col("task_type")] + elif args.group_by == "release_name": + group_cols = [F.col("release_name")] + else: + group_cols = [F.col("task_type"), F.col("release_name")] + + agg = ( + df.groupBy(*group_cols) + .agg( + F.count(F.lit(1)).alias("total"), + F.sum(F.when(F.col("outcome") == "accepted", F.lit(1)).otherwise(F.lit(0))).alias("accepted"), + F.sum(F.when(F.col("outcome") == "edited", F.lit(1)).otherwise(F.lit(0))).alias("edited"), + F.sum(F.when(F.col("outcome") == "rejected", F.lit(1)).otherwise(F.lit(0))).alias("rejected"), + F.avg(F.col("confidence")).alias("avg_confidence"), + ) + .withColumn("accept_rate", F.when(F.col("total") > 0, F.col("accepted") / F.col("total")).otherwise(F.lit(0.0))) + .withColumn("edit_rate", F.when(F.col("total") > 0, F.col("edited") / F.col("total")).otherwise(F.lit(0.0))) + .withColumn("reject_rate", F.when(F.col("total") > 0, F.col("rejected") / F.col("total")).otherwise(F.lit(0.0))) + .orderBy(F.col("total").desc(), *[c.asc() for c in group_cols]) + .limit(max(1, min(args.limit, 1000))) + ) + + rows = [r.asDict(recursive=True) for r in agg.collect()] + print(json.dumps(rows, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/query_imap_checkpoint.py b/query_imap_checkpoint.py new file mode 100644 index 0000000..07eab14 --- /dev/null +++ b/query_imap_checkpoint.py @@ -0,0 +1,43 @@ +import argparse +import json +import os + +from pyspark.sql import SparkSession +from pyspark.sql import functions as F + + +def main() -> None: + p = argparse.ArgumentParser(description="Query latest IMAP UID checkpoint from messages table") + p.add_argument("--table", default=os.getenv("MESSAGES_TABLE", "lake.db1.messages")) + p.add_argument("--host", required=True) + p.add_argument("--mailbox", required=True) + p.add_argument("--username", required=True) + args = p.parse_args() + + spark = SparkSession.builder.appName("query-imap-checkpoint").getOrCreate() + df = spark.table(args.table) + + md = F.col("metadata_json") + uid_col = F.get_json_object(md, "$.imap_uid") + host_col = F.get_json_object(md, "$.host") + mailbox_col = F.get_json_object(md, "$.mailbox") + username_col = F.get_json_object(md, "$.username") + + filtered = ( + df.where(F.col("channel") == "email-imap") + .where(host_col == args.host) + .where(mailbox_col == args.mailbox) + .where((username_col == args.username) | username_col.isNull() | (username_col == "")) + .where(uid_col.isNotNull()) + ) + + row = 
filtered.select(F.max(uid_col.cast("long")).alias("max_uid")).collect() + max_uid = None + if row and row[0]["max_uid"] is not None: + max_uid = int(row[0]["max_uid"]) + + print(json.dumps({"max_uid": max_uid}, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/record-assistant-action-via-spark-container.sh b/record-assistant-action-via-spark-container.sh new file mode 100755 index 0000000..ef3760e --- /dev/null +++ b/record-assistant-action-via-spark-container.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail + +ACTION_TABLE="${ACTION_TABLE:-lake.db1.assistant_actions}" +ACTION_ID="${1:-}" +CREATED_AT_UTC="${2:-}" +TASK_TYPE="${3:-}" +RELEASE_NAME="${4:-}" +OBJECTIVE_B64="${5:-}" +STEP_ID="${6:-}" +STEP_TITLE_B64="${7:-}" +ACTION_TYPE="${8:-}" +REQUIRES_APPROVAL="${9:-false}" +APPROVED="${10:-false}" +STATUS="${11:-}" +OUTPUT_B64="${12:-}" +ERROR_B64="${13:-}" + +if [[ -z "$ACTION_ID" || -z "$CREATED_AT_UTC" || -z "$TASK_TYPE" || -z "$STEP_ID" || -z "$ACTION_TYPE" || -z "$STATUS" ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./write_assistant_action.py}" +SCRIPT_REMOTE="/tmp/write_assistant_action.py" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "write_assistant_action.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$ACTION_TABLE" \ + --action-id "$ACTION_ID" \ + --created-at-utc "$CREATED_AT_UTC" \ + --task-type "$TASK_TYPE" \ + --release-name "$RELEASE_NAME" \ + --objective-b64 "$OBJECTIVE_B64" \ + --step-id "$STEP_ID" \ + --step-title-b64 "$STEP_TITLE_B64" \ + --action-type "$ACTION_TYPE" \ + --requires-approval "$REQUIRES_APPROVAL" \ + --approved "$APPROVED" \ + --status "$STATUS" \ + --output-b64 "$OUTPUT_B64" \ + --error-b64 "$ERROR_B64" + +echo "[DONE] Recorded assistant action ${ACTION_ID} into ${ACTION_TABLE}" diff --git a/record-assistant-feedback-via-spark-container.sh b/record-assistant-feedback-via-spark-container.sh new file mode 100755 index 0000000..c3d8e5e --- /dev/null +++ b/record-assistant-feedback-via-spark-container.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -euo pipefail + +FEEDBACK_TABLE="${FEEDBACK_TABLE:-lake.db1.assistant_feedback}" +FEEDBACK_ID="${1:-}" +CREATED_AT_UTC="${2:-}" +OUTCOME="${3:-}" +TASK_TYPE="${4:-}" +RELEASE_NAME="${5:-}" +CONFIDENCE="${6:-0}" +NEEDS_REVIEW="${7:-true}" +GOAL_B64="${8:-}" +DRAFT_B64="${9:-}" +FINAL_B64="${10:-}" +SOURCES_B64="${11:-}" +NOTES_B64="${12:-}" + +if [[ -z "$FEEDBACK_ID" || -z "$CREATED_AT_UTC" || -z "$OUTCOME" || -z "$TASK_TYPE" ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" 
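+# The package coordinates below mirror the other *-via-spark-container.sh helpers; set SPARK_PACKAGES to override them if needed.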
+PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./write_assistant_feedback.py}" +SCRIPT_REMOTE="/tmp/write_assistant_feedback.py" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "write_assistant_feedback.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "$SCRIPT_REMOTE" \ + --table "$FEEDBACK_TABLE" \ + --feedback-id "$FEEDBACK_ID" \ + --created-at-utc "$CREATED_AT_UTC" \ + --outcome "$OUTCOME" \ + --task-type "$TASK_TYPE" \ + --release-name "$RELEASE_NAME" \ + --confidence "$CONFIDENCE" \ + --needs-review "$NEEDS_REVIEW" \ + --goal-b64 "$GOAL_B64" \ + --draft-b64 "$DRAFT_B64" \ + --final-b64 "$FINAL_B64" \ + --sources-b64 "$SOURCES_B64" \ + --notes-b64 "$NOTES_B64" + +echo "[DONE] Recorded assistant feedback ${FEEDBACK_ID} into ${FEEDBACK_TABLE}" diff --git a/record-run-event-via-spark-container.sh b/record-run-event-via-spark-container.sh new file mode 100755 index 0000000..b5e1a51 --- /dev/null +++ b/record-run-event-via-spark-container.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Args: +# 1 run_id +# 2 event_type +# 3 event_at_utc +# 4 detail_json_b64 +RUN_ID="${1:-}" +EVENT_TYPE="${2:-}" +EVENT_AT_UTC="${3:-}" +DETAIL_JSON_B64="${4:-}" + +if [[ -z "$RUN_ID" || -z "$EVENT_TYPE" || -z "$EVENT_AT_UTC" ]]; then + echo "usage: $0 " >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +RUN_EVENTS_TABLE="${RUN_EVENTS_TABLE:-lake.db1.run_events}" + +decode_b64() { + local s="$1" + if [[ -z "$s" ]]; then + printf "" + return + fi + printf '%s' "$s" | base64 -d +} + +escape_sql() { + sed "s/'/''/g" +} + +DETAIL_JSON="$(decode_b64 "$DETAIL_JSON_B64" | escape_sql)" +RUN_ID_ESC="$(printf '%s' "$RUN_ID" | escape_sql)" +EVENT_TYPE_ESC="$(printf '%s' "$EVENT_TYPE" | escape_sql)" +EVENT_AT_ESC="$(printf '%s' "$EVENT_AT_UTC" | escape_sql)" + +SQL=" +CREATE TABLE IF NOT EXISTS ${RUN_EVENTS_TABLE} ( + run_id STRING, + event_type STRING, + event_at_utc STRING, + detail_json STRING, + ingested_at_utc STRING +) USING iceberg; + +INSERT INTO ${RUN_EVENTS_TABLE} VALUES ( + '${RUN_ID_ESC}', + '${EVENT_TYPE_ESC}', + '${EVENT_AT_ESC}', + '${DETAIL_JSON}', + '${EVENT_AT_ESC}' +); +" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-sql \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + -e "$SQL" diff --git a/record-run-via-spark-container.sh b/record-run-via-spark-container.sh new file mode 100755 index 0000000..62788cf --- /dev/null +++ b/record-run-via-spark-container.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Args: +# 1 run_id +# 2 run_type +# 3 status +# 4 started_at_utc +# 5 finished_at_utc (or empty) +# 6 actor +# 7 input_json_b64 +# 8 
output_json_b64 +# 9 error_text_b64 +RUN_ID="${1:-}" +RUN_TYPE="${2:-}" +STATUS="${3:-}" +STARTED_AT_UTC="${4:-}" +FINISHED_AT_UTC="${5:-}" +ACTOR="${6:-}" +INPUT_JSON_B64="${7:-}" +OUTPUT_JSON_B64="${8:-}" +ERROR_TEXT_B64="${9:-}" + +if [[ -z "$RUN_ID" || -z "$RUN_TYPE" || -z "$STATUS" || -z "$STARTED_AT_UTC" ]]; then + echo "usage: $0 " >&2 + exit 1 +fi + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" +RUNS_TABLE="${RUNS_TABLE:-lake.db1.runs}" + +decode_b64() { + local s="$1" + if [[ -z "$s" ]]; then + printf "" + return + fi + printf '%s' "$s" | base64 -d +} + +escape_sql() { + sed "s/'/''/g" +} + +INPUT_JSON="$(decode_b64 "$INPUT_JSON_B64" | escape_sql)" +OUTPUT_JSON="$(decode_b64 "$OUTPUT_JSON_B64" | escape_sql)" +ERROR_TEXT="$(decode_b64 "$ERROR_TEXT_B64" | escape_sql)" +RUN_ID_ESC="$(printf '%s' "$RUN_ID" | escape_sql)" +RUN_TYPE_ESC="$(printf '%s' "$RUN_TYPE" | escape_sql)" +STATUS_ESC="$(printf '%s' "$STATUS" | escape_sql)" +STARTED_ESC="$(printf '%s' "$STARTED_AT_UTC" | escape_sql)" +FINISHED_ESC="$(printf '%s' "$FINISHED_AT_UTC" | escape_sql)" +ACTOR_ESC="$(printf '%s' "$ACTOR" | escape_sql)" + +SQL=" +CREATE TABLE IF NOT EXISTS ${RUNS_TABLE} ( + run_id STRING, + run_type STRING, + status STRING, + started_at_utc STRING, + finished_at_utc STRING, + actor STRING, + input_json STRING, + output_json STRING, + error_text STRING, + ingested_at_utc STRING +) USING iceberg; + +INSERT INTO ${RUNS_TABLE} VALUES ( + '${RUN_ID_ESC}', + '${RUN_TYPE_ESC}', + '${STATUS_ESC}', + '${STARTED_ESC}', + '${FINISHED_ESC}', + '${ACTOR_ESC}', + '${INPUT_JSON}', + '${OUTPUT_JSON}', + '${ERROR_TEXT}', + '${STARTED_ESC}' +); +" + +docker exec \ + -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-sql \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + -e "$SQL" diff --git a/release_projector.py b/release_projector.py new file mode 100644 index 0000000..1274e9d --- /dev/null +++ b/release_projector.py @@ -0,0 +1,607 @@ +import argparse +import hashlib +import json +import os +import urllib.error +import urllib.request +from datetime import date, datetime, timezone +from typing import Any, Dict, List, Optional + +try: + from dotenv import load_dotenv +except Exception: + load_dotenv = None + + +DEFAULT_SPARK_PACKAGES = ( + "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1," + "org.apache.iceberg:iceberg-aws-bundle:1.10.1," + "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5" +) + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def parse_json_maybe(value: Any, expected_type: type, fallback: Any) -> Any: + if value is None: + return fallback + if isinstance(value, expected_type): + return value + if isinstance(value, str): + try: + parsed = json.loads(value) + if isinstance(parsed, expected_type): + return parsed + except Exception: + return fallback + return fallback + + +def first_str(row: Dict[str, Any], keys: List[str]) -> Optional[str]: + for key in keys: + val = row.get(key) + if isinstance(val, str) and val.strip(): + return val.strip() + return None + + +def to_iso(value: Any) -> Optional[str]: + if isinstance(value, 
datetime): + return value.isoformat() + if isinstance(value, date): + return datetime.combine(value, datetime.min.time(), timezone.utc).isoformat() + if isinstance(value, str) and value.strip(): + return value.strip() + return None + + +def make_fingerprint(name: str, kind: Optional[str], external_ids: Dict[str, str]) -> str: + norm = (name or "").strip().lower() + kind_norm = (kind or "").strip().lower() + ext = "|".join(f"{k}:{v}".lower() for k, v in sorted(external_ids.items())) + raw = f"{norm}|{kind_norm}|{ext}" + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def load_manifest(path: str) -> Dict[str, Any]: + with open(path, "r", encoding="utf-8") as f: + raw = json.load(f) + + if isinstance(raw, dict): + manifest_json = raw.get("manifest_json") + if isinstance(manifest_json, str): + try: + parsed = json.loads(manifest_json) + if isinstance(parsed, dict): + return parsed + except Exception: + pass + return raw + + if isinstance(raw, list) and raw and isinstance(raw[0], dict): + manifest_json = raw[0].get("manifest_json") + if isinstance(manifest_json, str): + parsed = json.loads(manifest_json) + if isinstance(parsed, dict): + return parsed + + raise ValueError("Manifest file must contain a manifest object or releases_v2 row with manifest_json.") + + +def infer_manifest_ref(manifest: Dict[str, Any]) -> Optional[str]: + nessie = manifest.get("nessie") + if isinstance(nessie, dict): + ref_obj = nessie.get("ref") + if isinstance(ref_obj, dict): + ref_name = ref_obj.get("name") + if isinstance(ref_name, str) and ref_name.strip(): + return ref_name.strip() + tag = nessie.get("tag") + if isinstance(tag, str) and tag.strip(): + return tag.strip() + + release_obj = manifest.get("release") + if isinstance(release_obj, dict): + release_name = release_obj.get("name") + if isinstance(release_name, str) and release_name.strip(): + return release_name.strip() + + for key in ("nessie_tag", "tag", "release_name"): + val = manifest.get(key) + if isinstance(val, str) and val.strip(): + return val.strip() + + return None + + +def extract_table_identifiers(manifest: Dict[str, Any]) -> List[str]: + out: List[str] = [] + tables = manifest.get("tables") + if isinstance(tables, list): + for t in tables: + if not isinstance(t, dict): + continue + ident = t.get("table_identifier") or t.get("identifier") or t.get("table") + if isinstance(ident, str) and ident.strip(): + out.append(ident.strip()) + + if out: + return out + + rows = manifest.get("rows") + if isinstance(rows, list): + for row in rows: + if not isinstance(row, dict): + continue + ident = row.get("table_identifier") + if isinstance(ident, str) and ident.strip(): + out.append(ident.strip()) + + return out + + +def infer_concept_table(tables: List[str]) -> Optional[str]: + for t in tables: + lower = t.lower() + if "concept" in lower: + return t + return tables[0] if tables else None + + +def load_manifest_from_registry( + spark: Any, + catalog: str, + release_name: str, + releases_table: Optional[str] = None, +) -> Dict[str, Any]: + from pyspark.sql import functions as F + + table = releases_table or os.getenv("RELEASES_TABLE", "db1.releases_v2") + if table.count(".") == 1: + table = f"{catalog}.{table}" + + row = ( + spark.table(table) + .where(F.col("release_name") == release_name) + .orderBy(F.col("ingested_at_utc").desc_nulls_last()) + .select("manifest_json") + .limit(1) + .collect() + ) + if not row: + raise ValueError(f"Release '{release_name}' not found in registry table {table}.") + + manifest_json = row[0]["manifest_json"] + if 
not isinstance(manifest_json, str) or not manifest_json.strip(): + raise ValueError(f"Release '{release_name}' has empty manifest_json in {table}.") + + manifest = json.loads(manifest_json) + if not isinstance(manifest, dict): + raise ValueError(f"Release '{release_name}' manifest_json is not a JSON object.") + return manifest + + +def build_spark(ref: str): + try: + from pyspark.sql import SparkSession + except Exception as e: + raise RuntimeError( + "pyspark is not installed. Install it or run this with spark-submit." + ) from e + + catalog = os.getenv("SPARK_CATALOG", "lake") + + builder = ( + SparkSession.builder.appName("release-projector") + .config( + "spark.sql.extensions", + "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions," + "org.projectnessie.spark.extensions.NessieSparkSessionExtensions", + ) + .config("spark.jars.packages", os.getenv("SPARK_PACKAGES", DEFAULT_SPARK_PACKAGES)) + .config(f"spark.sql.catalog.{catalog}", "org.apache.iceberg.spark.SparkCatalog") + .config(f"spark.sql.catalog.{catalog}.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog") + .config(f"spark.sql.catalog.{catalog}.uri", os.getenv("NESSIE_URI", "http://lakehouse-core:19120/api/v2")) + .config(f"spark.sql.catalog.{catalog}.ref", ref) + .config( + f"spark.sql.catalog.{catalog}.warehouse", + os.getenv("NESSIE_WAREHOUSE", "s3a://lakehouse/warehouse"), + ) + .config(f"spark.sql.catalog.{catalog}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") + .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3_ENDPOINT", "http://lakehouse-core:9000")) + .config("spark.hadoop.fs.s3a.path.style.access", os.getenv("S3_PATH_STYLE", "true")) + .config( + "spark.hadoop.fs.s3a.access.key", + os.getenv("AWS_ACCESS_KEY_ID", os.getenv("MINIO_ROOT_USER", "minioadmin")), + ) + .config( + "spark.hadoop.fs.s3a.secret.key", + os.getenv("AWS_SECRET_ACCESS_KEY", os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")), + ) + ) + + spark_master = os.getenv("SPARK_MASTER") + if spark_master: + builder = builder.master(spark_master) + + return builder.getOrCreate(), catalog + + +def ensure_es_index(es_url: str, es_index: str) -> None: + mapping = { + "mappings": { + "properties": { + "concept_id": {"type": "keyword"}, + "concept_type": {"type": "keyword"}, + "display_name": {"type": "text"}, + "description": {"type": "text"}, + "text": {"type": "text"}, + "source_table": {"type": "keyword"}, + "source_pk": {"type": "keyword"}, + "release_name": {"type": "keyword"}, + "ref_hash": {"type": "keyword"}, + "attributes_json": {"type": "text"}, + "canonical_name": {"type": "text"}, + "kind": {"type": "keyword"}, + "aliases": {"type": "text"}, + "tags": {"type": "keyword"}, + "summary": {"type": "text"}, + "latest_cid": {"type": "keyword"}, + "fingerprint": {"type": "keyword"}, + "created_at": {"type": "date"}, + "updated_at": {"type": "date"}, + } + } + } + url = f"{es_url.rstrip('/')}/{es_index}" + req_get = urllib.request.Request(url, method="GET") + try: + with urllib.request.urlopen(req_get, timeout=30) as resp: + if 200 <= resp.status < 300: + return + except urllib.error.HTTPError as e: + if e.code != 404: + raise + + body = json.dumps(mapping).encode("utf-8") + req_put = urllib.request.Request(url, data=body, method="PUT") + req_put.add_header("Content-Type", "application/json") + with urllib.request.urlopen(req_put, timeout=30) as resp: + if resp.status >= 400: + raise RuntimeError(f"Failed to create ES index {es_index}: HTTP {resp.status}") + + +def es_upsert(es_url: str, es_index: str, doc: Dict[str, Any]) -> None: + url = 
f"{es_url.rstrip('/')}/{es_index}/_doc/{doc['concept_id']}" + body = json.dumps(doc, default=str).encode("utf-8") + req = urllib.request.Request(url, data=body, method="PUT") + req.add_header("Content-Type", "application/json") + with urllib.request.urlopen(req, timeout=30) as resp: + if resp.status >= 400: + raise RuntimeError(f"Failed ES upsert for {doc['concept_id']}: HTTP {resp.status}") + + +def gremlin_upsert(gremlin_url: str, concept: Dict[str, Any]) -> None: + from gremlin_python.driver import client as gremlin_client + from gremlin_python.driver.serializer import GraphSONSerializersV3d0 + + created_at = concept.get("created_at") or utc_now_iso() + updated_at = concept.get("updated_at") or utc_now_iso() + + query = """ + g.V().hasLabel('Concept').has('concept_id', concept_id).fold() + .coalesce( + unfold(), + addV('Concept').property('concept_id', concept_id).property('created_at', created_at) + ) + .property('canonical_name', canonical_name) + .property('kind', kind) + .property('concept_type', concept_type) + .property('display_name', display_name) + .property('description', description) + .property('text', text) + .property('source_table', source_table) + .property('source_pk', source_pk) + .property('release_name', release_name) + .property('ref_hash', ref_hash) + .property('attributes_json', attributes_json) + .property('aliases', aliases_json) + .property('external_ids', external_ids_json) + .property('tags', tags_json) + .property('fingerprint', fingerprint) + .property('latest_cid', latest_cid) + .property('summary', summary) + .property('updated_at', updated_at) + .values('concept_id') + """ + + c = gremlin_client.Client( + gremlin_url, + "g", + message_serializer=GraphSONSerializersV3d0(), + ) + try: + c.submit( + query, + { + "concept_id": concept["concept_id"], + "canonical_name": concept.get("canonical_name") or "", + "kind": concept.get("kind") or "", + "concept_type": concept.get("concept_type") or "", + "display_name": concept.get("display_name") or "", + "description": concept.get("description") or "", + "text": concept.get("text") or "", + "source_table": concept.get("source_table") or "", + "source_pk": concept.get("source_pk") or "", + "release_name": concept.get("release_name") or "", + "ref_hash": concept.get("ref_hash") or "", + "attributes_json": concept.get("attributes_json") or "{}", + "aliases_json": json.dumps(concept.get("aliases", []), ensure_ascii=False), + "external_ids_json": json.dumps(concept.get("external_ids", {}), ensure_ascii=False), + "tags_json": json.dumps(concept.get("tags", []), ensure_ascii=False), + "fingerprint": concept["fingerprint"], + "latest_cid": concept.get("latest_cid") or "", + "summary": concept.get("summary") or "", + "created_at": created_at, + "updated_at": updated_at, + }, + ).all().result() + finally: + c.close() + + +def _infer_concept_type(row: Dict[str, Any], source_table: Optional[str]) -> str: + explicit = first_str(row, ["concept_type", "kind", "type"]) + if explicit: + return explicit.lower() + lower_table = (source_table or "").lower() + if "messages" in lower_table: + return "message" + if "docs" in lower_table or "documents" in lower_table: + return "document" + if "message_id" in row: + return "message" + if "doc_id" in row or "document_id" in row: + return "document" + return "entity" + + +def _source_pk(row: Dict[str, Any]) -> Optional[str]: + return first_str(row, ["source_pk", "message_id", "doc_id", "document_id", "id", "uuid"]) + + +def row_to_concept( + row: Dict[str, Any], + source_table: 
Optional[str], + release_name: Optional[str], + ref_hash: Optional[str], +) -> Optional[Dict[str, Any]]: + concept_type = _infer_concept_type(row, source_table) + source_pk = _source_pk(row) + display_name = first_str( + row, + [ + "display_name", + "canonical_name", + "title", + "name", + "subject", + "doc_name", + "document_name", + ], + ) + if not display_name and source_pk: + display_name = f"{concept_type}:{source_pk}" + if not display_name: + display_name = first_str(row, ["body", "text", "content"]) + if display_name: + display_name = display_name[:120] + if not display_name: + return None + + external_ids = parse_json_maybe(row.get("external_ids"), dict, {}) + aliases = parse_json_maybe(row.get("aliases"), list, []) + tags = parse_json_maybe(row.get("tags"), list, []) + + kind = first_str(row, ["kind", "type", "doc_type", "document_type"]) or concept_type + + concept_id = first_str(row, ["concept_id", "doc_id", "document_id", "id", "uuid"]) + if not concept_id and source_pk: + concept_id = f"{concept_type}:{source_pk}" + if not isinstance(concept_id, str) or not concept_id.strip(): + concept_id = hashlib.sha256( + f"{concept_type}|{display_name}|{json.dumps(external_ids, sort_keys=True)}".encode("utf-8") + ).hexdigest() + + description = first_str(row, ["description", "summary", "abstract"]) + if not description: + body = first_str(row, ["content", "text", "body"]) + if body: + description = body[:512] + + text = first_str(row, ["text", "content", "body"]) + if not text: + text = description + + # Keep typed attributes stable and searchable without exploding ES mapping. + attributes_obj = row + + return { + "concept_id": concept_id, + "concept_type": concept_type, + "display_name": display_name, + "description": description, + "text": text, + "source_table": source_table, + "source_pk": source_pk, + "release_name": release_name, + "ref_hash": ref_hash, + "attributes_json": json.dumps(attributes_obj, ensure_ascii=False, default=str, sort_keys=True), + "canonical_name": display_name, + "kind": kind, + "aliases": aliases, + "external_ids": external_ids, + "tags": tags, + "latest_cid": first_str(row, ["latest_cid", "cid", "ipfs_cid"]), + "summary": description, + "created_at": to_iso(row.get("created_at")) or utc_now_iso(), + "updated_at": to_iso(row.get("updated_at")) or utc_now_iso(), + "fingerprint": make_fingerprint(display_name, concept_type, external_ids), + } + + +def project_release( + manifest_file: Optional[str], + release_name: Optional[str], + concept_table: Optional[str], + nessie_ref: Optional[str], + releases_ref: Optional[str], + dry_run: bool, + targets: str, +) -> None: + if not manifest_file and not release_name: + raise ValueError("Provide either --manifest-file or --release-name.") + + manifest: Optional[Dict[str, Any]] = load_manifest(manifest_file) if manifest_file else None + + # Release-name mode: lookup manifest on registry ref (usually main), then project on release tag. 
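+ # The releases_v2 registry is read on RELEASES_REF (default "main"), while the release rows must be read on the release tag,
+ # so the Spark session is rebuilt against the tag whenever the two refs differ.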
+ if manifest is None and release_name: + registry_ref = releases_ref or os.getenv("RELEASES_REF", "main") + spark, catalog = build_spark(registry_ref) + manifest = load_manifest_from_registry(spark, catalog, release_name) + ref = nessie_ref or infer_manifest_ref(manifest) or release_name + if ref != registry_ref: + spark.stop() + spark, catalog = build_spark(ref) + else: + ref = nessie_ref or (infer_manifest_ref(manifest) if manifest else None) or release_name + if not ref: + raise ValueError("Unable to infer Nessie ref/tag; pass --nessie-ref explicitly.") + spark, catalog = build_spark(ref) + + table_identifiers: List[str] = extract_table_identifiers(manifest) if manifest else [] + table = concept_table or (infer_concept_table(table_identifiers) if manifest else None) + if not table: + raise ValueError("Unable to infer concept table; pass --concept-table explicitly.") + + if table.count(".") == 1: + table = f"{catalog}.{table}" + + print(f"[INFO] Using Nessie ref/tag: {ref}") + print(f"[INFO] Reading table: {table}") + + release_name_effective = None + ref_hash = None + if manifest: + rel = manifest.get("release") + if isinstance(rel, dict): + rel_name = rel.get("name") + if isinstance(rel_name, str) and rel_name.strip(): + release_name_effective = rel_name.strip() + nes = manifest.get("nessie") + if isinstance(nes, dict): + ref_obj = nes.get("ref") + if isinstance(ref_obj, dict): + h = ref_obj.get("hash") + if isinstance(h, str) and h.strip(): + ref_hash = h.strip() + if not release_name_effective and release_name and isinstance(release_name, str) and release_name.strip(): + release_name_effective = release_name.strip() + + df = spark.table(table) + rows = [r.asDict(recursive=True) for r in df.collect()] + concepts = [c for c in (row_to_concept(r, table, release_name_effective, ref_hash) for r in rows) if c] + + print(f"[INFO] Read {len(rows)} rows, {len(concepts)} valid concepts") + print("[STEP] spark_read_done") + if dry_run: + print("[INFO] Dry-run enabled. No writes performed.") + return + + use_es = targets in ("both", "es") + use_gremlin = targets in ("both", "gremlin") + print(f"[INFO] Projection targets: {targets}") + + gremlin_url = os.getenv("GREMLIN_URL", "ws://localhost:8182/gremlin") + es_url = os.getenv("ES_URL", "http://localhost:9200") + es_index = os.getenv("ES_INDEX", "concepts") + + if use_es: + ensure_es_index(es_url, es_index) + + success = 0 + failures = 0 + gremlin_missing = False + es_missing = False + for concept in concepts: + try: + wrote_any = False + if use_gremlin and not gremlin_missing: + try: + gremlin_upsert(gremlin_url, concept) + wrote_any = True + except ModuleNotFoundError as e: + gremlin_missing = True + print(f"[WARN] Gremlin dependency missing ({e}). Continuing with ES only.") + except Exception as e: + print(f"[WARN] Gremlin upsert failed for {concept.get('concept_id')}: {e}") + + if use_es and not es_missing: + try: + es_upsert(es_url, es_index, concept) + wrote_any = True + except ModuleNotFoundError as e: + es_missing = True + print(f"[WARN] ES dependency missing ({e}). 
Continuing with Gremlin only.") + except Exception as e: + print(f"[WARN] ES upsert failed for {concept.get('concept_id')}: {e}") + + if wrote_any: + success += 1 + else: + failures += 1 + print(f"[WARN] No projection target succeeded for {concept.get('concept_id')}") + except Exception as e: + failures += 1 + print(f"[WARN] Failed concept {concept.get('concept_id')}: {e}") + + print("[STEP] projection_done") + print(f"[DONE] Projected {success} concepts ({failures} failed)") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Project a lakehouse release into JanusGraph + Elasticsearch.") + p.add_argument("--manifest-file", help="Path to release manifest JSON") + p.add_argument("--release-name", help="Release name to load from releases_v2 registry") + p.add_argument("--concept-table", help="Full Iceberg table identifier holding concepts") + p.add_argument("--nessie-ref", help="Nessie branch/tag to read from (defaults to manifest tag)") + p.add_argument("--releases-ref", help="Nessie ref used to read releases_v2 (default: main)") + p.add_argument( + "--targets", + choices=["es", "gremlin", "both"], + default="both", + help="Projection targets to write (default: both)", + ) + p.add_argument("--dry-run", action="store_true", help="Read and validate only") + return p.parse_args() + + +def main() -> None: + if load_dotenv is not None: + load_dotenv() + args = parse_args() + project_release( + manifest_file=args.manifest_file, + release_name=args.release_name, + concept_table=args.concept_table, + nessie_ref=args.nessie_ref, + releases_ref=args.releases_ref, + dry_run=args.dry_run, + targets=args.targets, + ) + + +if __name__ == "__main__": + main() diff --git a/requirements-app.txt b/requirements-app.txt new file mode 100644 index 0000000..1c6f48f --- /dev/null +++ b/requirements-app.txt @@ -0,0 +1,8 @@ +fastapi>=0.115,<1.0 +uvicorn[standard]>=0.32,<1.0 +pydantic>=2.9,<3.0 +httpx>=0.28,<1.0 +gremlinpython>=3.7,<4.0 +python-dotenv>=1.0,<2.0 +requests>=2.32,<3.0 +websocket-client>=1.8,<2.0 diff --git a/requirements-projector.txt b/requirements-projector.txt new file mode 100644 index 0000000..ac3ff30 --- /dev/null +++ b/requirements-projector.txt @@ -0,0 +1,4 @@ +pyspark==3.5.8 +python-dotenv>=1.0,<2.0 +httpx>=0.28,<1.0 +gremlinpython>=3.7,<4.0 diff --git a/run-projector-standard.sh b/run-projector-standard.sh new file mode 100755 index 0000000..45cd079 --- /dev/null +++ b/run-projector-standard.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Canonical projector command for lakehouse-core. 
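+# Defaults for MANIFEST_FILE, CONCEPT_TABLE, TARGETS and RELEASE_NAME can also be supplied as environment variables; the flags parsed below take precedence.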
+# Usage: +# ./run-projector-standard.sh # publish (both targets) +# ./run-projector-standard.sh --dry-run # validate only +# ./run-projector-standard.sh --targets es # ES-only publish +# ./run-projector-standard.sh --release-name rel_2026-02-14_docs-v1 + +MANIFEST_FILE="${MANIFEST_FILE:-./manifests/rel_2026-02-14_docs-v1.json}" +CONCEPT_TABLE="${CONCEPT_TABLE:-lake.db1.docs}" +TARGETS="${TARGETS:-both}" +RELEASE_NAME="${RELEASE_NAME:-}" +MODE="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) + MODE="--dry-run" + shift + ;; + --targets) + TARGETS="${2:-}" + if [[ -z "$TARGETS" ]]; then + echo "--targets requires one of: es|gremlin|both" >&2 + exit 1 + fi + shift 2 + ;; + --manifest-file) + MANIFEST_FILE="${2:-}" + if [[ -z "$MANIFEST_FILE" ]]; then + echo "--manifest-file requires a value" >&2 + exit 1 + fi + shift 2 + ;; + --release-name) + RELEASE_NAME="${2:-}" + if [[ -z "$RELEASE_NAME" ]]; then + echo "--release-name requires a value" >&2 + exit 1 + fi + shift 2 + ;; + --concept-table) + CONCEPT_TABLE="${2:-}" + if [[ -z "$CONCEPT_TABLE" ]]; then + echo "--concept-table requires a value" >&2 + exit 1 + fi + shift 2 + ;; + *) + echo "Unknown argument: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$TARGETS" != "es" && "$TARGETS" != "gremlin" && "$TARGETS" != "both" ]]; then + echo "Invalid --targets value: $TARGETS (expected es|gremlin|both)" >&2 + exit 1 +fi + +./run-projector-via-spark-container.sh "$MANIFEST_FILE" "$CONCEPT_TABLE" "$MODE" "$TARGETS" "$RELEASE_NAME" diff --git a/run-projector-via-spark-container.sh b/run-projector-via-spark-container.sh new file mode 100755 index 0000000..8cad8f7 --- /dev/null +++ b/run-projector-via-spark-container.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -euo pipefail + +MANIFEST_FILE="${1:-/tmp/rel_2026-02-14_docs-v1.json}" +CONCEPT_TABLE="${2:-lake.db1.docs}" +MODE="${3:-}" +TARGETS="${4:-both}" +RELEASE_NAME="${5:-${RELEASE_NAME:-}}" + +CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}" +SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}" +PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}" + +SCRIPT_LOCAL="${SCRIPT_LOCAL:-./release_projector.py}" +SCRIPT_REMOTE="/tmp/release_projector.py" +MANIFEST_REMOTE="/tmp/$(basename "$MANIFEST_FILE")" + +if [[ ! -f "$SCRIPT_LOCAL" ]]; then + echo "release_projector.py not found at: $SCRIPT_LOCAL" >&2 + exit 1 +fi + +if [[ -z "$RELEASE_NAME" && ! 
-f "$MANIFEST_FILE" ]]; then + echo "manifest file not found: $MANIFEST_FILE (or provide release name arg5)" >&2 + exit 1 +fi + +docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE" +if [[ -f "$MANIFEST_FILE" ]]; then + docker cp "$MANIFEST_FILE" "$CONTAINER_NAME":"$MANIFEST_REMOTE" +fi + +ARGS=( + "$SCRIPT_REMOTE" + "--concept-table" "$CONCEPT_TABLE" + "--targets" "$TARGETS" +) + +if [[ -n "$RELEASE_NAME" ]]; then + ARGS+=("--release-name" "$RELEASE_NAME") +else + ARGS+=("--manifest-file" "$MANIFEST_REMOTE") +fi + +if [[ -n "$MODE" ]]; then + ARGS+=("$MODE") +fi + +docker exec -e AWS_REGION="${AWS_REGION:-us-east-1}" \ + -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \ + -e NESSIE_URI="${NESSIE_URI:-http://lakehouse-core:19120/api/v2}" \ + -e NESSIE_WAREHOUSE="${NESSIE_WAREHOUSE:-s3a://lakehouse/warehouse}" \ + -e S3_ENDPOINT="${S3_ENDPOINT:-http://lakehouse-core:9000}" \ + -e AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-minioadmin}" \ + -e AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-minioadmin}" \ + -e GREMLIN_URL="${GREMLIN_URL:-ws://janus.rakeroots.lan:8182/gremlin}" \ + -e ES_URL="${ES_URL:-http://janus.rakeroots.lan:9200}" \ + -e ES_INDEX="${ES_INDEX:-concepts}" \ + "$CONTAINER_NAME" \ + /opt/spark/bin/spark-submit \ + --properties-file "$SPARK_PROPS" \ + --packages "$PACKAGES" \ + "${ARGS[@]}" diff --git a/setup_local_env.sh b/setup_local_env.sh new file mode 100755 index 0000000..4940cee --- /dev/null +++ b/setup_local_env.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +VENV_DIR="${1:-.venv}" + +python3 -m venv "$VENV_DIR" +"$VENV_DIR/bin/pip" install --upgrade pip +"$VENV_DIR/bin/pip" install -r requirements-app.txt -r requirements-projector.txt + +echo "Environment ready: $VENV_DIR" +echo "Activate with: source $VENV_DIR/bin/activate" diff --git a/ui/assets/app.js b/ui/assets/app.js new file mode 100644 index 0000000..7830555 --- /dev/null +++ b/ui/assets/app.js @@ -0,0 +1,215 @@ +function getConfig() { + return { + apiKey: document.getElementById("apiKey").value.trim(), + releaseName: document.getElementById("releaseName").value.trim(), + }; +} + +function saveConfig() { + const cfg = getConfig(); + cfg.chatSessionId = document.getElementById("chatSessionId").value.trim(); + localStorage.setItem("assistant_ui_cfg", JSON.stringify(cfg)); +} + +function loadConfig() { + try { + const raw = localStorage.getItem("assistant_ui_cfg"); + if (!raw) return; + const cfg = JSON.parse(raw); + document.getElementById("apiKey").value = cfg.apiKey || ""; + document.getElementById("releaseName").value = cfg.releaseName || ""; + document.getElementById("chatSessionId").value = cfg.chatSessionId || "main"; + } catch (_) {} +} + +async function apiGet(path, params) { + const cfg = getConfig(); + const url = new URL(path, window.location.origin); + Object.entries(params || {}).forEach(([k, v]) => { + if (v !== null && v !== undefined && String(v).length > 0) url.searchParams.set(k, String(v)); + }); + const r = await fetch(url, { + headers: { "X-Admin-Api-Key": cfg.apiKey }, + }); + if (!r.ok) throw new Error(await r.text()); + return r.json(); +} + +async function apiPost(path, payload) { + const cfg = getConfig(); + const r = await fetch(path, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Admin-Api-Key": cfg.apiKey, + }, + body: JSON.stringify(payload), + }); + if (!r.ok) throw new Error(await r.text()); + return r.json(); +} + +function renderRows(target, rows, formatter) { + target.innerHTML = ""; + if (!rows || rows.length === 0) { 
+    target.innerHTML = '<div>No rows.</div>'; + return; + } + rows.forEach((row) => { + const el = document.createElement("div"); + el.className = "row"; + el.innerHTML = formatter(row); + target.appendChild(el); + }); +} + +async function loadInbox() { + const cfg = getConfig(); + const q = document.getElementById("inboxQuery").value.trim(); + const out = document.getElementById("inboxResults"); + out.innerHTML = '<div>Loading...</div>'; + try { + const data = await apiGet("/assistant/inbox", { release_name: cfg.releaseName, q, limit: 20 }); + renderRows(out, data.rows || [], (r) => { + const text = (r.text || r.summary || r.description || "").slice(0, 280); + return ` + <div>${r.display_name || r.concept_id || "message"}</div> + <div>${text || "(no text)"}</div> + <div class="meta">${r.source_pk || ""} | ${r.release_name || ""}</div> + `; + }); + } catch (e) { + out.innerHTML = `<div>Error: ${String(e)}</div>`; + } +}
+ +async function loadTasks() { + const cfg = getConfig(); + const onlyPending = document.getElementById("onlyPending").checked; + const out = document.getElementById("taskResults"); + out.innerHTML = '<div>Loading...</div>'; + try { + const data = await apiGet("/assistant/tasks", { + release_name: cfg.releaseName, + only_pending: onlyPending, + limit: 30, + }); + renderRows(out, data.rows || [], (r) => { + const safeTodo = (r.todo || "").replace(/"/g, "&quot;"); + return ` + <div>${r.todo || "(empty task)"}</div> + <div class="meta">status=${r.status} | due=${r.due_hint || "-"} | who=${r.who || "-"}</div> + <div class="meta">source=${r.source_pk || ""} | release=${r.release_name || ""}</div> + <button class="use-goal" data-goal="${safeTodo}">Use as goal</button> + `; + }); + document.querySelectorAll(".use-goal").forEach((btn) => { + btn.addEventListener("click", () => { + const goal = btn.getAttribute("data-goal") || ""; + document.getElementById("goalText").value = goal; + }); + }); + } catch (e) { + out.innerHTML = `<div>Error: ${String(e)}</div>
`; + } +} + +async function makeDraft() { + const cfg = getConfig(); + const goal = document.getElementById("goalText").value.trim(); + const recipient = document.getElementById("recipient").value.trim(); + const out = document.getElementById("draftOutput"); + if (!goal) { + out.textContent = "Provide goal text first."; + return; + } + out.textContent = "Generating..."; + try { + const data = await apiPost("/assistant/draft", { + task_type: "message", + goal, + recipient: recipient || null, + tone: "friendly-professional", + constraints: ["keep it concise"], + release_name: cfg.releaseName || null, + max_sources: 5, + }); + const sourceLine = (data.sources || []).map((s) => s.concept_id).filter(Boolean).slice(0, 5).join(", "); + out.textContent = `${data.draft || ""}\n\nconfidence=${data.confidence}\nneeds_review=${data.needs_review}\nsources=${sourceLine}`; + } catch (e) { + out.textContent = `Error: ${String(e)}`; + } +} + +async function saveLearn() { + const cfg = getConfig(); + const title = document.getElementById("learnTitle").value.trim(); + const tags = document.getElementById("learnTags").value + .split(",") + .map((x) => x.trim()) + .filter(Boolean); + const text = document.getElementById("learnText").value.trim(); + const out = document.getElementById("learnOutput"); + if (!text) { + out.textContent = "Provide note text first."; + return; + } + out.textContent = "Saving..."; + try { + const data = await apiPost("/assistant/learn", { + text, + title: title || null, + tags, + release_name: cfg.releaseName || null, + }); + out.textContent = `saved=${data.stored}\nconcept_id=${data.concept_id}\ntitle=${data.title}`; + document.getElementById("learnText").value = ""; + } catch (e) { + out.textContent = `Error: ${String(e)}`; + } +} + +function appendChat(role, text, meta) { + const target = document.getElementById("chatTranscript"); + const el = document.createElement("div"); + el.className = "row"; + el.innerHTML = ` +
<div class="meta">${role}</div> + <div>${(text || "").replace(/\n/g, "<br>")}</div> + ${meta ? `<div class="meta">${meta}</div>
` : ""} + `; + target.prepend(el); +} + +async function sendChat() { + const cfg = getConfig(); + const sessionInput = document.getElementById("chatSessionId"); + const session_id = (sessionInput.value || "main").trim(); + sessionInput.value = session_id; + const messageEl = document.getElementById("chatMessage"); + const message = messageEl.value.trim(); + if (!message) return; + appendChat("user", message, `session=${session_id}`); + messageEl.value = ""; + try { + const data = await apiPost("/assistant/chat", { + session_id, + message, + release_name: cfg.releaseName || null, + max_sources: 6, + }); + const sourceLine = (data.sources || []).map((s) => s.concept_id).filter(Boolean).slice(0, 4).join(", "); + appendChat("assistant", data.answer || "", `confidence=${data.confidence} | sources=${sourceLine || "-"}`); + } catch (e) { + appendChat("assistant", `Error: ${String(e)}`, ""); + } +} + +document.getElementById("saveConfig").addEventListener("click", saveConfig); +document.getElementById("loadInbox").addEventListener("click", loadInbox); +document.getElementById("loadTasks").addEventListener("click", loadTasks); +document.getElementById("makeDraft").addEventListener("click", makeDraft); +document.getElementById("saveLearn").addEventListener("click", saveLearn); +document.getElementById("sendChat").addEventListener("click", sendChat); + +loadConfig(); diff --git a/ui/assets/styles.css b/ui/assets/styles.css new file mode 100644 index 0000000..fd4c6f4 --- /dev/null +++ b/ui/assets/styles.css @@ -0,0 +1,124 @@ +:root { + --bg: #f2f4f5; + --panel: #ffffff; + --ink: #182126; + --muted: #5c6770; + --line: #dde4e8; + --accent: #0f766e; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + font-family: "IBM Plex Sans", "Segoe UI", sans-serif; + color: var(--ink); + background: linear-gradient(165deg, #e9eff2 0%, #f8fafb 100%); +} + +.layout { + max-width: 1100px; + margin: 0 auto; + padding: 18px; + display: grid; + gap: 14px; +} + +.topbar { + background: var(--panel); + border: 1px solid var(--line); + border-radius: 10px; + padding: 12px; + display: flex; + justify-content: space-between; + align-items: center; + gap: 12px; +} + +.topbar h1, +.panel h2 { + margin: 0; + font-size: 18px; +} + +.panel { + background: var(--panel); + border: 1px solid var(--line); + border-radius: 10px; + padding: 12px; +} + +.panel-header { + display: flex; + justify-content: space-between; + align-items: center; + gap: 12px; + margin-bottom: 8px; +} + +.controls { + display: flex; + gap: 8px; + align-items: center; + flex-wrap: wrap; +} + +input, +textarea, +button { + font: inherit; +} + +input, +textarea { + border: 1px solid var(--line); + border-radius: 7px; + padding: 8px; + background: #fff; +} + +button { + border: 1px solid #0d5f59; + background: var(--accent); + color: #fff; + border-radius: 7px; + padding: 8px 10px; + cursor: pointer; +} + +button:hover { + filter: brightness(0.95); +} + +.list { + display: grid; + gap: 8px; +} + +.row { + border: 1px solid var(--line); + border-radius: 8px; + padding: 8px; +} + +.row .meta { + color: var(--muted); + font-size: 12px; + margin-top: 4px; +} + +.output { + white-space: pre-wrap; + border: 1px solid var(--line); + border-radius: 8px; + padding: 10px; + min-height: 96px; + background: #fbfdfe; +} + +#chatTranscript { + max-height: 360px; + overflow: auto; +} diff --git a/ui/index.html b/ui/index.html new file mode 100644 index 0000000..03f8194 --- /dev/null +++ b/ui/index.html @@ -0,0 +1,82 @@ + + + + + + Jecio Assistant Console + + + +
+ [ui/index.html body: markup not recoverable from this capture. The page renders the "Assistant Console" topbar (API key, release name, chat session id) and panels for Inbox, Pending Tasks, Draft, Learn, and Chat, wired to the element ids that ui/assets/app.js expects.]
+ + + + diff --git a/write_assistant_action.py b/write_assistant_action.py new file mode 100644 index 0000000..a297a80 --- /dev/null +++ b/write_assistant_action.py @@ -0,0 +1,106 @@ +import argparse +import json +import base64 + +from pyspark.sql import SparkSession, types as T + + +def d(s: str) -> str: + if not s: + return "" + return base64.b64decode(s.encode("ascii")).decode("utf-8") + + +def main() -> None: + p = argparse.ArgumentParser(description="Write assistant action row via Spark DataFrame") + p.add_argument("--table", required=True) + p.add_argument("--action-id", required=True) + p.add_argument("--created-at-utc", required=True) + p.add_argument("--task-type", required=True) + p.add_argument("--release-name", default="") + p.add_argument("--objective-b64", default="") + p.add_argument("--step-id", required=True) + p.add_argument("--step-title-b64", default="") + p.add_argument("--action-type", required=True) + p.add_argument("--requires-approval", default="false") + p.add_argument("--approved", default="false") + p.add_argument("--status", required=True) + p.add_argument("--output-b64", default="") + p.add_argument("--error-b64", default="") + args = p.parse_args() + + requires_approval = str(args.requires_approval).lower() == "true" + approved = str(args.approved).lower() == "true" + objective = d(args.objective_b64) + step_title = d(args.step_title_b64) + output_json = d(args.output_b64) + error_text = d(args.error_b64) + if not output_json: + output_json = "{}" + try: + json.loads(output_json) + except Exception: + output_json = "{}" + + spark = SparkSession.builder.appName("write-assistant-action").getOrCreate() + spark.sql( + f""" + CREATE TABLE IF NOT EXISTS {args.table} ( + action_id STRING, + created_at_utc STRING, + task_type STRING, + release_name STRING, + objective STRING, + step_id STRING, + step_title STRING, + action_type STRING, + requires_approval BOOLEAN, + approved BOOLEAN, + status STRING, + output_json STRING, + error_text STRING + ) USING iceberg + """ + ) + + schema = T.StructType( + [ + T.StructField("action_id", T.StringType(), False), + T.StructField("created_at_utc", T.StringType(), False), + T.StructField("task_type", T.StringType(), False), + T.StructField("release_name", T.StringType(), True), + T.StructField("objective", T.StringType(), True), + T.StructField("step_id", T.StringType(), False), + T.StructField("step_title", T.StringType(), True), + T.StructField("action_type", T.StringType(), False), + T.StructField("requires_approval", T.BooleanType(), False), + T.StructField("approved", T.BooleanType(), False), + T.StructField("status", T.StringType(), False), + T.StructField("output_json", T.StringType(), True), + T.StructField("error_text", T.StringType(), True), + ] + ) + row = [ + ( + args.action_id, + args.created_at_utc, + args.task_type, + args.release_name or "", + objective, + args.step_id, + step_title, + args.action_type, + requires_approval, + approved, + args.status, + output_json, + error_text, + ) + ] + df = spark.createDataFrame(row, schema=schema) + df.writeTo(args.table).append() + print(f"[DONE] Recorded assistant action {args.action_id} into {args.table}") + + +if __name__ == "__main__": + main() diff --git a/write_assistant_feedback.py b/write_assistant_feedback.py new file mode 100644 index 0000000..6051c17 --- /dev/null +++ b/write_assistant_feedback.py @@ -0,0 +1,103 @@ +import argparse +import base64 +import json + +from pyspark.sql import SparkSession, types as T + + +def d(s: str) -> str: + if not s: + return "" + 
return base64.b64decode(s.encode("ascii")).decode("utf-8") + + +def main() -> None: + p = argparse.ArgumentParser(description="Write assistant feedback row via Spark DataFrame") + p.add_argument("--table", required=True) + p.add_argument("--feedback-id", required=True) + p.add_argument("--created-at-utc", required=True) + p.add_argument("--outcome", required=True) + p.add_argument("--task-type", required=True) + p.add_argument("--release-name", default="") + p.add_argument("--confidence", type=float, default=0.0) + p.add_argument("--needs-review", default="true") + p.add_argument("--goal-b64", default="") + p.add_argument("--draft-b64", default="") + p.add_argument("--final-b64", default="") + p.add_argument("--sources-b64", default="") + p.add_argument("--notes-b64", default="") + args = p.parse_args() + + needs_review = str(args.needs_review).lower() == "true" + goal = d(args.goal_b64) + draft_text = d(args.draft_b64) + final_text = d(args.final_b64) + sources_json = d(args.sources_b64) + notes = d(args.notes_b64) + if not sources_json: + sources_json = "[]" + # Validate JSON shape but keep raw string in table. + try: + json.loads(sources_json) + except Exception: + sources_json = "[]" + + spark = SparkSession.builder.appName("write-assistant-feedback").getOrCreate() + spark.sql( + f""" + CREATE TABLE IF NOT EXISTS {args.table} ( + feedback_id STRING, + created_at_utc STRING, + outcome STRING, + task_type STRING, + release_name STRING, + confidence DOUBLE, + needs_review BOOLEAN, + goal STRING, + draft_text STRING, + final_text STRING, + sources_json STRING, + notes STRING + ) USING iceberg + """ + ) + + schema = T.StructType( + [ + T.StructField("feedback_id", T.StringType(), False), + T.StructField("created_at_utc", T.StringType(), False), + T.StructField("outcome", T.StringType(), False), + T.StructField("task_type", T.StringType(), False), + T.StructField("release_name", T.StringType(), True), + T.StructField("confidence", T.DoubleType(), True), + T.StructField("needs_review", T.BooleanType(), False), + T.StructField("goal", T.StringType(), True), + T.StructField("draft_text", T.StringType(), True), + T.StructField("final_text", T.StringType(), True), + T.StructField("sources_json", T.StringType(), True), + T.StructField("notes", T.StringType(), True), + ] + ) + row = [ + ( + args.feedback_id, + args.created_at_utc, + args.outcome, + args.task_type, + args.release_name or "", + float(args.confidence), + needs_review, + goal, + draft_text, + final_text, + sources_json, + notes, + ) + ] + df = spark.createDataFrame(row, schema=schema) + df.writeTo(args.table).append() + print(f"[DONE] Recorded assistant feedback {args.feedback_id} into {args.table}") + + +if __name__ == "__main__": + main()
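
For reference, a minimal invocation sketch for `write_assistant_feedback.py` (the `write_assistant_action.py` call is analogous). This is an assumption-laden example, not part of the repo: it reuses the `spark` container, properties file, and package list from `run-projector-via-spark-container.sh`, the table name `lake.db1.assistant_feedback` is invented for illustration, and the free-text flags are base64-encoded because the script decodes every `*-b64` argument with its `d()` helper (GNU `base64 -w0` shown).

```bash
# Sketch only: table name and sample values are illustrative assumptions.
GOAL_B64=$(printf '%s' "Draft a reply about the delayed shipment" | base64 -w0)
DRAFT_B64=$(printf '%s' "Hi, thanks for the update on the shipment..." | base64 -w0)

docker cp write_assistant_feedback.py spark:/tmp/write_assistant_feedback.py
docker exec spark /opt/spark/bin/spark-submit \
  --properties-file /opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf \
  --packages 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5' \
  /tmp/write_assistant_feedback.py \
  --table lake.db1.assistant_feedback \
  --feedback-id fb-0001 \
  --created-at-utc "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --outcome accepted \
  --task-type message \
  --release-name rel_2026-02-14_docs-v1 \
  --confidence 0.8 \
  --needs-review false \
  --goal-b64 "$GOAL_B64" \
  --draft-b64 "$DRAFT_B64" \
  --sources-b64 "$(printf '%s' '[]' | base64 -w0)"
```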