amduat-api/tests/ai_answer_eval.sh
2026-02-08 00:07:35 +01:00

119 lines
4.6 KiB
Bash
Executable file

#!/usr/bin/env bash
# Eval for the AI answer flow: exercises app_ai_answer_json/app_ai_answer_text
# from src/app_v2.sh against mocked amduat API and curl/Ollama dependencies.
set -euo pipefail
# Repo root, resolved relative to this script's own location (tests/..).
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# shellcheck source=/dev/null
source "${ROOT_DIR}/src/app_v2.sh"
# Guard: every assertion below shells out to jq, so bail out early
# (exit 2) when it is not installed rather than failing mid-run.
require_jq() {
command -v jq >/dev/null 2>&1 && return 0
echo "ai_answer_eval.sh: jq is required" >&2
exit 2
}
# Report an assertion failure on stderr and abort the whole suite.
fail() {
printf '%s\n' "ai_answer_eval.sh: FAIL: $1" >&2
exit 1
}
# Check prerequisites, then initialize the app under test.
require_jq
app_init
# Scratch directory for the mock artifacts; removed by the EXIT trap.
tmp_dir="$(mktemp -d /tmp/ai-answer-eval.XXXXXX)"
cleanup() {
rm -rf "${tmp_dir}"
}
trap cleanup EXIT
# Files the mocks write so the assertions can inspect what was sent:
# the final LLM prompt, and the path/payload of the retrieve request.
mock_prompt_file="${tmp_dir}/prompt.txt"
mock_retrieve_path_file="${tmp_dir}/retrieve.path"
mock_retrieve_payload_file="${tmp_dir}/retrieve.payload"
# Mock of the amduat API client. Serves canned responses for the two
# endpoints the answer flow uses, and records the retrieve request so
# the assertions below can inspect what was asked for. Any other
# endpoint yields a 404 body and a non-zero return.
amduat_api_call() {
local verb="$1"
local route="$2"
local payload="${3:-}"
case "${verb} ${route}" in
"GET /v2/graph/schema/predicates")
AMDUAT_LAST_STATUS="200"
AMDUAT_LAST_BODY='{"predicates":[{"predicate_ref":"ref-ms-within-domain","alias":"ms.within_domain"}]}'
;;
"POST /v2/graph/retrieve")
# Capture path and payload for later assertions.
printf '%s' "${route}" > "${mock_retrieve_path_file}"
printf '%s' "${payload}" > "${mock_retrieve_payload_file}"
AMDUAT_LAST_STATUS="200"
# MOCK_NO_EDGES=1 simulates a graph with no supporting edges.
if [[ "${MOCK_NO_EDGES:-0}" == "1" ]]; then
AMDUAT_LAST_BODY='{"nodes":[{"name":"doc:1"},{"name":"topic:alpha"}],"edges":[]}'
else
AMDUAT_LAST_BODY='{"nodes":[{"name":"doc:1","concept_ref":"ref-doc1"},{"name":"topic:alpha","concept_ref":"ref-topic-alpha"}],"edges":[{"subject_ref":"ref-doc1","predicate_ref":"ref-ms-within-domain","object_ref":"ref-topic-alpha","edge_ref":"ref-edge-1"}]}'
fi
;;
*)
AMDUAT_LAST_STATUS="404"
AMDUAT_LAST_BODY='{"error":"not mocked"}'
return 1
;;
esac
return 0
}
# Mock of curl. Intercepts the Ollama /api/generate call, extracts the
# .prompt field from the JSON body, records it for later assertions,
# and emits a canned model response on stdout. Any endpoint other than
# ${OLLAMA_HOST}/api/generate aborts the suite via fail().
# Fix: 'prompt' is now function-local; previously it leaked into the
# global scope of the script.
curl() {
local body=""
local endpoint=""
local prompt
# Minimal arg scan: we only care about the request body and the URL;
# every other flag curl would receive is ignored.
while [[ $# -gt 0 ]]; do
case "$1" in
--data-binary)
body="$2"
shift 2
;;
http://*|https://*)
endpoint="$1"
shift
;;
*)
shift
;;
esac
done
[[ "${endpoint}" == "${OLLAMA_HOST}/api/generate" ]] || fail "unexpected curl endpoint: ${endpoint}"
# Persist the prompt so assertions can check question/context content.
prompt="$(printf '%s' "${body}" | jq -r '.prompt')"
printf '%s' "${prompt}" > "${mock_prompt_file}"
printf '%s\n' '{"model":"mock-model","response":"Grounded answer from mock model."}'
}
# --- Happy path: the graph has a supporting edge, answer is grounded. ---
json_out="$(app_ai_answer_json "doc:1" "What domain is doc:1 in?" "ms.within_domain")" || fail "app_ai_answer_json failed"
# The mocked model response must be passed through verbatim.
printf '%s' "${json_out}" | jq -e '.response == "Grounded answer from mock model."' >/dev/null || fail "unexpected response payload"
printf '%s' "${json_out}" | jq -e '.evidence | length == 1' >/dev/null || fail "missing evidence"
printf '%s' "${json_out}" | jq -e '.grounding.has_evidence == true' >/dev/null || fail "grounding.has_evidence should be true"
# The single evidence item must be the doc->topic triplet from the mock.
printf '%s' "${json_out}" | jq -e '.evidence[0].subject == "doc:1" and .evidence[0].predicate == "ms.within_domain" and .evidence[0].object == "topic:alpha"' >/dev/null \
|| fail "evidence triplet mismatch"
# predicate_name should be resolved via the schema predicates alias.
printf '%s' "${json_out}" | jq -e '.evidence[0].predicate_name == "ms.within_domain"' >/dev/null || fail "predicate_name should resolve from schema"
# The retrieve mock recorded the request; verify path and payload shape.
[[ -f "${mock_retrieve_path_file}" ]] || fail "retrieve call was not made"
[[ "$(cat "${mock_retrieve_path_file}")" == "/v2/graph/retrieve" ]] || fail "retrieve path mismatch"
retrieve_payload="$(cat "${mock_retrieve_payload_file}")"
printf '%s' "${retrieve_payload}" | jq -e '.roots == ["doc:1"]' >/dev/null || fail "roots payload mismatch"
printf '%s' "${retrieve_payload}" | jq -e '.goal_predicates == ["ms.within_domain"]' >/dev/null || fail "goal_predicates payload mismatch"
# The curl mock captured the prompt; it must embed the question and the
# rendered graph edge as context.
prompt_text="$(cat "${mock_prompt_file}")"
[[ "${prompt_text}" == *"Question:"* ]] || fail "prompt missing question label"
[[ "${prompt_text}" == *"What domain is doc:1 in?"* ]] || fail "prompt missing question"
[[ "${prompt_text}" == *"doc:1 --ms.within_domain--> topic:alpha"* ]] || fail "prompt missing graph edge context"
# Text variant must return just the model response, no JSON envelope.
text_out="$(app_ai_answer_text "doc:1" "What domain is doc:1 in?" "ms.within_domain")" || fail "app_ai_answer_text failed"
[[ "${text_out}" == "Grounded answer from mock model." ]] || fail "text output mismatch"
# --- Strict path: with no supporting edges, require-evidence must fail. ---
# set +e/-e brackets the expected-failure call so the non-zero status can
# be captured instead of aborting the script.
set +e
# Scope MOCK_NO_EDGES to this single invocation by prefixing it to the
# command inside the substitution. The previous form
# 'MOCK_NO_EDGES=1 strict_out="$(...)"' was a plain assignment list, so
# MOCK_NO_EDGES=1 stayed set in the current shell for every later command.
strict_out="$(MOCK_NO_EDGES=1 app_ai_answer_json "doc:1" "What domain is doc:1 in?" "ms.within_domain" "1")"
strict_rc=$?
set -e
[[ "${strict_rc}" -ne 0 ]] || fail "expected non-zero for --require-evidence with no supporting edges"
# Even on failure the JSON envelope must explain why it refused.
printf '%s' "${strict_out}" | jq -e '.done_reason == "no_evidence"' >/dev/null || fail "expected done_reason no_evidence"
printf '%s' "${strict_out}" | jq -e '.grounding.require_evidence == true and .grounding.has_evidence == false' >/dev/null \
|| fail "expected strict grounding flags"
echo "ai_answer_eval.sh: PASS"