# jecio/query-assistant-metrics-via-spark-container.sh
#!/usr/bin/env bash
#
# Copy query_assistant_metrics.py into a running Spark container and execute
# it with spark-submit (Iceberg + Nessie runtime packages) against the
# assistant feedback table.
#
# Usage:
#   query-assistant-metrics-via-spark-container.sh [TASK_TYPE] [RELEASE_NAME] [OUTCOME] [GROUP_BY] [LIMIT]
#
# Positional arguments (all optional):
#   TASK_TYPE     value forwarded to --task-type    (default: empty = no filter)
#   RELEASE_NAME  value forwarded to --release-name (default: empty = no filter)
#   OUTCOME       value forwarded to --outcome      (default: empty = no filter)
#   GROUP_BY      one of task_type|release_name|both (default: both)
#   LIMIT         row limit, non-negative integer    (default: 100)
#
# Environment overrides:
#   FEEDBACK_TABLE        table passed to --table (default: lake.db1.assistant_feedback)
#   SPARK_CONTAINER_NAME  docker container to exec into (default: spark)
#   SPARK_PROPS           spark-submit properties file *inside the container*
#   SPARK_PACKAGES        --packages coordinates for spark-submit
#   SCRIPT_LOCAL          local path to query_assistant_metrics.py (default: ./query_assistant_metrics.py)
#   AWS_REGION / AWS_DEFAULT_REGION  forwarded into the container (default: us-east-1)
set -euo pipefail

# Print a message to stderr and exit non-zero.
die() { printf '%s\n' "$*" >&2; exit 1; }

TASK_TYPE="${1:-}"
RELEASE_NAME="${2:-}"
OUTCOME="${3:-}"
GROUP_BY="${4:-both}"
LIMIT="${5:-100}"
FEEDBACK_TABLE="${FEEDBACK_TABLE:-lake.db1.assistant_feedback}"

# Validate cheap, local inputs before paying the docker/spark-submit startup cost.
if [[ "$GROUP_BY" != "task_type" && "$GROUP_BY" != "release_name" && "$GROUP_BY" != "both" ]]; then
  die "Invalid group_by: $GROUP_BY (expected task_type|release_name|both)"
fi
if [[ ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  die "Invalid limit: $LIMIT (expected a non-negative integer)"
fi

CONTAINER_NAME="${SPARK_CONTAINER_NAME:-spark}"
SPARK_PROPS="${SPARK_PROPS:-/opt/lakehouse/spark-conf/lakehouse-spark-defaults.conf}"
PACKAGES="${SPARK_PACKAGES:-org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1,org.apache.iceberg:iceberg-aws-bundle:1.10.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.5}"
SCRIPT_LOCAL="${SCRIPT_LOCAL:-./query_assistant_metrics.py}"
SCRIPT_REMOTE="/tmp/query_assistant_metrics.py"

[[ -f "$SCRIPT_LOCAL" ]] || die "query_assistant_metrics.py not found at: $SCRIPT_LOCAL"

# Fail fast with a clear message when the container is stopped or missing;
# otherwise 'docker cp' / 'docker exec' produce less obvious errors.
if [[ "$(docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null)" != "true" ]]; then
  die "Spark container is not running: $CONTAINER_NAME"
fi

docker cp "$SCRIPT_LOCAL" "$CONTAINER_NAME":"$SCRIPT_REMOTE"
docker exec \
  -e AWS_REGION="${AWS_REGION:-us-east-1}" \
  -e AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" \
  "$CONTAINER_NAME" \
  /opt/spark/bin/spark-submit \
  --properties-file "$SPARK_PROPS" \
  --packages "$PACKAGES" \
  "$SCRIPT_REMOTE" \
  --table "$FEEDBACK_TABLE" \
  --task-type "$TASK_TYPE" \
  --release-name "$RELEASE_NAME" \
  --outcome "$OUTCOME" \
  --group-by "$GROUP_BY" \
  --limit "$LIMIT"