#!/usr/bin/env bash
#
# Submit a one-off Spark batch job that ingests a base64-encoded message
# payload (or a payload file) into an Iceberg table, by copying the driver
# script into a running Spark container and invoking spark-submit there.
#
# Usage:
#   ingest_messages_batch.sh [table] [dedupe_mode] <payload_b64 | @payload_file>
#
# Environment overrides:
#   SPARK_CONTAINER_NAME  docker container running Spark      (default: spark)
#   SPARK_PROPS           spark-submit properties file path in the container
#   SPARK_PACKAGES        --packages coordinate list
#   SCRIPT_LOCAL          local path to ingest_messages_batch.py
#   AWS_REGION / AWS_DEFAULT_REGION  forwarded into the container
set -euo pipefail

TABLE="${1:-lake.db1.messages}"
DEDUPE_MODE="${2:-none}"
PAYLOAD_B64="${3:-}"

# Payload is mandatory; without it there is nothing to ingest.
if [[ -z "$PAYLOAD_B64" ]]; then
  echo "Usage: $0 <table> <dedupe_mode> <payload_b64 | @payload_file>" >&2
  exit 1
fi

# Validate dedupe mode against the values the driver script understands.
if [[ "$DEDUPE_MODE" != "none" && "$DEDUPE_MODE" != "message_id" && "$DEDUPE_MODE" != "thread_message" ]]; then
  echo "Invalid dedupe_mode: $DEDUPE_MODE (expected none|message_id|thread_message)" >&2
  exit 1
fi

CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}"
SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}"
PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}"
SCRIPT_LOCAL="${SCRIPT_LOCAL:-./ingest_messages_batch.py}"
SCRIPT_REMOTE="/tmp/ingest_messages_batch.py"

if [[ ! -f "$SCRIPT_LOCAL" ]]; then
  echo "ingest_messages_batch.py not found at: $SCRIPT_LOCAL" >&2
  exit 1
fi

# Ship the driver script into the container so spark-submit can see it.
docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE"

# Build driver arguments as an array so values with spaces survive intact.
SPARK_ARGS=(
  --table "$TABLE"
  --dedupe-mode "$DEDUPE_MODE"
)

# A leading '@' means the third argument names a host-side payload file
# rather than an inline base64 string; copy it into the container.
if [[ "${PAYLOAD_B64:0:1}" == "@" ]]; then
  PAYLOAD_FILE_HOST="${PAYLOAD_B64:1}"
  if [[ ! -f "$PAYLOAD_FILE_HOST" ]]; then
    echo "Payload file not found: $PAYLOAD_FILE_HOST" >&2
    exit 1
  fi
  PAYLOAD_FILE_REMOTE="/opt/spark/work-dir/ingest_messages_payload.json"
  docker cp "$PAYLOAD_FILE_HOST" "$CONTAINER_NAME":"$PAYLOAD_FILE_REMOTE"
  # Ensure spark user can read the file regardless of ownership from docker cp.
  docker exec -u 0 "$CONTAINER_NAME" /bin/sh -lc "chmod 644 '$PAYLOAD_FILE_REMOTE' || true"
  SPARK_ARGS+=(--payload-file "$PAYLOAD_FILE_REMOTE")
else
  SPARK_ARGS+=(--payload-b64 "$PAYLOAD_B64")
fi

# Run the batch job inside the container, forwarding AWS region settings
# for the Iceberg/S3 bundle.
docker exec \
  -e AWS_REGION="${AWS_REGION:-us-east-1}" \
  -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \
  "$CONTAINER_NAME" \
  /opt/spark/bin/spark-submit \
  --properties-file "$SPARK_PROPS" \
  --packages "$PACKAGES" \
  "$SCRIPT_REMOTE" \
  "${SPARK_ARGS[@]}"