#!/usr/bin/env bash
#
# Submit a batch of JSON messages to an Iceberg table by running a PySpark
# ingest script (ingest_messages_batch.py) inside a Spark Docker container.
#
# Usage:
#   $0 <table> <dedupe_mode:none|message_id|thread_message> \
#      <payload_b64_json_array|@/path/to/payload.json>
#
# Arguments:
#   $1  Fully-qualified Iceberg table name (default: lake.db1.messages).
#   $2  Dedupe mode: none | message_id | thread_message (default: none).
#   $3  Payload — either a base64-encoded JSON array, or '@<host path>' to a
#       JSON file that will be copied into the container. Required.
#
# Environment:
#   SPARK_CONTAINER_NAME  Container to exec into (default: spark).
#   SPARK_PROPS           spark-submit properties file inside the container.
#   SPARK_PACKAGES        Maven coordinates passed to --packages.
#   SCRIPT_LOCAL          Host path to ingest_messages_batch.py.
#   AWS_REGION / AWS_DEFAULT_REGION  Forwarded into the container
#                                    (default: us-east-1).

set -euo pipefail

TABLE="${1:-lake.db1.messages}"
DEDUPE_MODE="${2:-none}"
PAYLOAD_B64="${3:-}"

# The payload is the only argument without a usable default.
if [[ -z "$PAYLOAD_B64" ]]; then
  echo "Usage: $0 <table> <dedupe_mode:none|message_id|thread_message> <payload_b64_json_array|@/path/to/payload.json>" >&2
  exit 1
fi

# Validate dedupe mode up front so spark-submit never starts with a bad flag.
case "$DEDUPE_MODE" in
  none|message_id|thread_message) ;;
  *)
    echo "Invalid dedupe_mode: $DEDUPE_MODE (expected none|message_id|thread_message)" >&2
    exit 1
    ;;
esac

CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}"
SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}"
PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}"

SCRIPT_LOCAL="${SCRIPT_LOCAL:-./ingest_messages_batch.py}"
SCRIPT_REMOTE="/tmp/ingest_messages_batch.py"

if [[ ! -f "$SCRIPT_LOCAL" ]]; then
  echo "ingest_messages_batch.py not found at: $SCRIPT_LOCAL" >&2
  exit 1
fi

# Refresh the PySpark driver script inside the container on every run.
docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE"

# Arguments passed through to ingest_messages_batch.py; built as an array so
# values with spaces survive word-splitting.
SPARK_ARGS=(
  --table "$TABLE"
  --dedupe-mode "$DEDUPE_MODE"
)

# A leading '@' means the payload is a host-side JSON file rather than an
# inline base64 string; copy it into the container and pass its path instead.
if [[ "${PAYLOAD_B64:0:1}" == "@" ]]; then
  PAYLOAD_FILE_HOST="${PAYLOAD_B64:1}"
  if [[ ! -f "$PAYLOAD_FILE_HOST" ]]; then
    echo "Payload file not found: $PAYLOAD_FILE_HOST" >&2
    exit 1
  fi
  PAYLOAD_FILE_REMOTE="/opt/spark/work-dir/ingest_messages_payload.json"
  docker cp "$PAYLOAD_FILE_HOST" "$CONTAINER_NAME":"$PAYLOAD_FILE_REMOTE"
  # Ensure spark user can read the file regardless of ownership from docker cp.
  docker exec -u 0 "$CONTAINER_NAME" /bin/sh -lc "chmod 644 '$PAYLOAD_FILE_REMOTE' || true"
  SPARK_ARGS+=(--payload-file "$PAYLOAD_FILE_REMOTE")
else
  SPARK_ARGS+=(--payload-b64 "$PAYLOAD_B64")
fi

# Run spark-submit inside the container, forwarding AWS region settings so
# the Iceberg AWS bundle can reach S3/Glue endpoints.
docker exec \
  -e AWS_REGION="${AWS_REGION:-us-east-1}" \
  -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \
  "$CONTAINER_NAME" \
  /opt/spark/bin/spark-submit \
    --properties-file "$SPARK_PROPS" \
    --packages "$PACKAGES" \
    "$SCRIPT_REMOTE" \
    "${SPARK_ARGS[@]}"