#!/usr/bin/env bash
#
# ai_answer_eval.sh — offline evaluation harness for the AI-answer feature.
# Mocks the graph API (amduat_api_call) and the Ollama HTTP endpoint (curl),
# then asserts that app_ai_answer_json / app_ai_answer_text produce grounded
# output. Prints "ai_answer_eval.sh: PASS" on success; exits non-zero on
# the first failed expectation.

set -euo pipefail

# Repository root, resolved relative to this script's own location so the
# harness works regardless of the caller's working directory.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# shellcheck source=/dev/null
source "${ROOT_DIR}/src/app_v2.sh"
# Ensure the jq binary is available; the whole harness depends on it for
# JSON assertions. Exits 2 (environment error, distinct from assertion
# failures which exit 1) when jq is missing.
require_jq() {
  command -v jq >/dev/null 2>&1 && return 0
  echo "ai_answer_eval.sh: jq is required" >&2
  exit 2
}
# Report a failed expectation on stderr and abort the harness.
#   $1 - human-readable description of what went wrong
fail() {
  printf 'ai_answer_eval.sh: FAIL: %s\n' "$1" >&2
  exit 1
}
# Environment preconditions: jq must be installed and the application must
# be initialized before any mocked calls are issued.
require_jq
app_init

# Scratch directory for mock fixtures (prompt / retrieve captures).
tmp_dir="$(mktemp -d /tmp/ai-answer-eval.XXXXXX)"
# Remove the scratch directory. Registered on EXIT so fixtures are cleaned
# up on success, on assertion failure, and on early exit alike.
cleanup() {
  # '--' guards rm against option-like expansions of the path.
  rm -rf -- "${tmp_dir}"
}
trap cleanup EXIT
# Fixture files the mocks write so the assertions below can inspect them:
#   prompt.txt       - prompt forwarded to the mocked Ollama endpoint
#   retrieve.path    - API path captured from the mocked retrieve call
#   retrieve.payload - JSON body captured from the mocked retrieve call
mock_prompt_file="${tmp_dir}/prompt.txt"
mock_retrieve_path_file="${tmp_dir}/retrieve.path"
mock_retrieve_payload_file="${tmp_dir}/retrieve.payload"
# Test double for the real amduat_api_call: serves canned graph-API
# responses and records what the retrieve endpoint was asked for.
#   $1 - HTTP method
#   $2 - API path
#   $3 - optional request body (JSON)
# Sets AMDUAT_LAST_STATUS and AMDUAT_LAST_BODY; returns 1 for any route
# this harness does not mock.
amduat_api_call() {
  local req_method="$1"
  local req_path="$2"
  local req_body="${3:-}"

  case "${req_method} ${req_path}" in
    'GET /v2/graph/schema/predicates')
      AMDUAT_LAST_STATUS="200"
      AMDUAT_LAST_BODY='{"predicates":[{"predicate_ref":"ref-ms-within-domain","alias":"ms.within_domain"}]}'
      ;;
    'POST /v2/graph/retrieve')
      # Record the call so the assertions can verify path and payload.
      printf '%s' "${req_path}" > "${mock_retrieve_path_file}"
      printf '%s' "${req_body}" > "${mock_retrieve_payload_file}"
      AMDUAT_LAST_STATUS="200"
      if [[ "${MOCK_NO_EDGES:-0}" == "1" ]]; then
        # Strict-grounding scenario: nodes are present but no edges.
        AMDUAT_LAST_BODY='{"nodes":[{"name":"doc:1"},{"name":"topic:alpha"}],"edges":[]}'
      else
        AMDUAT_LAST_BODY='{"nodes":[{"name":"doc:1","concept_ref":"ref-doc1"},{"name":"topic:alpha","concept_ref":"ref-topic-alpha"}],"edges":[{"subject_ref":"ref-doc1","predicate_ref":"ref-ms-within-domain","object_ref":"ref-topic-alpha","edge_ref":"ref-edge-1"}]}'
      fi
      ;;
    *)
      AMDUAT_LAST_STATUS="404"
      AMDUAT_LAST_BODY='{"error":"not mocked"}'
      return 1
      ;;
  esac
  return 0
}
# Test double for curl: intercepts the Ollama /api/generate request,
# captures the prompt field of the JSON request body into a fixture file,
# and emits a canned model completion on stdout. Any other endpoint is
# treated as a harness failure (via fail).
curl() {
  local request_body=""
  local url=""

  # Walk the argv curl would have received, keeping only the pieces the
  # harness asserts on; every other flag is ignored.
  while (( $# > 0 )); do
    case "$1" in
      --data-binary)
        request_body="$2"
        shift 2
        ;;
      http://*|https://*)
        url="$1"
        shift
        ;;
      *)
        shift
        ;;
    esac
  done

  if [[ "${url}" != "${OLLAMA_HOST}/api/generate" ]]; then
    fail "unexpected curl endpoint: ${url}"
  fi

  # Persist the prompt so later assertions can inspect its contents.
  prompt="$(printf '%s' "${request_body}" | jq -r '.prompt')"
  printf '%s' "${prompt}" > "${mock_prompt_file}"
  printf '%s\n' '{"model":"mock-model","response":"Grounded answer from mock model."}'
}
# --- Happy path: JSON answer with grounded evidence -------------------------

json_out="$(app_ai_answer_json "doc:1" "What domain is doc:1 in?" "ms.within_domain")" || fail "app_ai_answer_json failed"
printf '%s' "${json_out}" | jq -e '.response == "Grounded answer from mock model."' >/dev/null || fail "unexpected response payload"
printf '%s' "${json_out}" | jq -e '.evidence | length == 1' >/dev/null || fail "missing evidence"
printf '%s' "${json_out}" | jq -e '.grounding.has_evidence == true' >/dev/null || fail "grounding.has_evidence should be true"
printf '%s' "${json_out}" | jq -e '.evidence[0].subject == "doc:1" and .evidence[0].predicate == "ms.within_domain" and .evidence[0].object == "topic:alpha"' >/dev/null \
  || fail "evidence triplet mismatch"
printf '%s' "${json_out}" | jq -e '.evidence[0].predicate_name == "ms.within_domain"' >/dev/null || fail "predicate_name should resolve from schema"

# The retrieve mock must have been hit, on the right path, with the
# expected roots and goal predicates in its payload.
[[ -f "${mock_retrieve_path_file}" ]] || fail "retrieve call was not made"
[[ "$(cat "${mock_retrieve_path_file}")" == "/v2/graph/retrieve" ]] || fail "retrieve path mismatch"

retrieve_payload="$(cat "${mock_retrieve_payload_file}")"
printf '%s' "${retrieve_payload}" | jq -e '.roots == ["doc:1"]' >/dev/null || fail "roots payload mismatch"
printf '%s' "${retrieve_payload}" | jq -e '.goal_predicates == ["ms.within_domain"]' >/dev/null || fail "goal_predicates payload mismatch"

# The prompt sent to the model must embed both the question and the
# retrieved graph edge as context.
prompt_text="$(cat "${mock_prompt_file}")"
[[ "${prompt_text}" == *"Question:"* ]] || fail "prompt missing question label"
[[ "${prompt_text}" == *"What domain is doc:1 in?"* ]] || fail "prompt missing question"
[[ "${prompt_text}" == *"doc:1 --ms.within_domain--> topic:alpha"* ]] || fail "prompt missing graph edge context"

# --- Plain-text variant -----------------------------------------------------

text_out="$(app_ai_answer_text "doc:1" "What domain is doc:1 in?" "ms.within_domain")" || fail "app_ai_answer_text failed"
[[ "${text_out}" == "Grounded answer from mock model." ]] || fail "text output mismatch"

# --- Strict grounding: no supporting edges + require-evidence must fail -----

set +e
MOCK_NO_EDGES=1 strict_out="$(app_ai_answer_json "doc:1" "What domain is doc:1 in?" "ms.within_domain" "1")"
strict_rc=$?
set -e
[[ "${strict_rc}" -ne 0 ]] || fail "expected non-zero for --require-evidence with no supporting edges"
printf '%s' "${strict_out}" | jq -e '.done_reason == "no_evidence"' >/dev/null || fail "expected done_reason no_evidence"
printf '%s' "${strict_out}" | jq -e '.grounding.require_evidence == true and .grounding.has_evidence == false' >/dev/null \
  || fail "expected strict grounding flags"

echo "ai_answer_eval.sh: PASS"