From 0667cee17d92d19be894fbd44b4b96eb8f61df7b Mon Sep 17 00:00:00 2001
From: Carl Niklas Rydberg
Date: Sat, 17 Jan 2026 16:55:46 +0100
Subject: [PATCH] Add bloom filters to ASL index segments

---
Review notes (text in this section is ignored by git-am):
* build_bloom now rejects a NULL records array and always leaves *out_bloom
  empty on failure, so the caller's follow-up amduat_octets_free() cannot
  release the same buffer twice.
* bloom add/maybe_contains reject a filter length whose bit count would
  overflow size_t instead of comparing the wrapped product against zero.
* read_file no longer treats malloc(0) returning NULL as a failure.
* Line breaks, indentation and the <...> targets of added #include lines
  were reconstructed from a whitespace-collapsed copy of this patch. The
  #include targets on context lines and the author e-mail could not be
  recovered and are left as found, and the blob ids on the "index" lines
  predate these changes. Regenerate with git format-patch from a real tree
  before applying.

 CMakeLists.txt                                |   1 +
 include/amduat/asl/index_bloom.h              |  40 +++++
 .../asl_store_index_fs/asl_store_index_fs.c   |  79 +++++++++
 src/near_core/asl/index_bloom.c               | 167 ++++++++++++++++++
 tests/asl/test_asl_store_index_fs.c           | 140 +++++++++++++++
 5 files changed, 427 insertions(+)
 create mode 100644 include/amduat/asl/index_bloom.h
 create mode 100644 src/near_core/asl/index_bloom.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f70a48..b3fe118 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,6 +63,7 @@ set(AMDUAT_ASL_SRCS
   src/kernel/asl/core.c
   src/near_core/asl/artifact_io.c
   src/near_core/asl/io.c
+  src/near_core/asl/index_bloom.c
   src/near_core/asl/index_snapshot.c
   src/near_core/asl/index_replay.c
   src/near_core/asl/parse.c
diff --git a/include/amduat/asl/index_bloom.h b/include/amduat/asl/index_bloom.h
new file mode 100644
index 0000000..4a97dcd
--- /dev/null
+++ b/include/amduat/asl/index_bloom.h
@@ -0,0 +1,40 @@
+#ifndef AMDUAT_ASL_INDEX_BLOOM_H
+#define AMDUAT_ASL_INDEX_BLOOM_H
+
+#include "amduat/asl/core.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Size of a segment bloom filter and number of bits probed per entry. */
+enum {
+  AMDUAT_ASL_INDEX_BLOOM_BYTES = 256,
+  AMDUAT_ASL_INDEX_BLOOM_HASHES = 4
+};
+
+/* Allocates a zero-filled filter of AMDUAT_ASL_INDEX_BLOOM_BYTES bytes into
+ * *out_bloom. Returns false if out_bloom is NULL or allocation fails. */
+bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom);
+
+/* Sets the filter bits for (hash_id, digest). Returns false if bloom is empty
+ * or the digest is NULL with non-zero length or longer than UINT16_MAX. */
+bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
+                                amduat_hash_id_t hash_id,
+                                amduat_octets_t digest);
+
+/* Returns false only when (hash_id, digest) was definitely never added; an
+ * empty filter or an unhashable digest conservatively returns true. */
+bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
+                                           amduat_hash_id_t hash_id,
+                                           amduat_octets_t digest);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* AMDUAT_ASL_INDEX_BLOOM_H */
diff --git a/src/adapters/asl_store_index_fs/asl_store_index_fs.c b/src/adapters/asl_store_index_fs/asl_store_index_fs.c
index 3eb049c..e5695b4 100644
--- a/src/adapters/asl_store_index_fs/asl_store_index_fs.c
+++ b/src/adapters/asl_store_index_fs/asl_store_index_fs.c
@@ -1,6 +1,7 @@
 #include "amduat/asl/asl_store_index_fs.h"
 
 #include "asl_store_index_fs_layout.h"
+#include "amduat/asl/index_bloom.h"
 #include "amduat/asl/index_snapshot.h"
 #include "amduat/asl/index_replay.h"
 #include "amduat/asl/ref_derive.h"
@@ -1028,6 +1029,69 @@ static bool amduat_asl_store_index_fs_record_matches(
   return memcmp(digest, ref.digest.data, record->digest_len) == 0;
 }
 
+/* Builds the bloom filter over every (hash_id, digest) record of `segment`.
+ *
+ * On success *out_bloom owns a freshly allocated filter, or is empty when the
+ * segment is NULL or has no records. On failure *out_bloom is always left
+ * empty, so a caller that frees it again cannot double-free regardless of
+ * whether amduat_octets_free() resets its argument. */
+static bool amduat_asl_store_index_fs_build_bloom(
+    const amduat_asl_core_index_segment_t *segment,
+    amduat_octets_t *out_bloom) {
+  size_t i;
+  size_t digest_cursor;
+
+  if (out_bloom == NULL) {
+    return false;
+  }
+  *out_bloom = amduat_octets(NULL, 0u);
+  if (segment == NULL || segment->record_count == 0u) {
+    return true;
+  }
+  if (segment->records == NULL) {
+    return false;
+  }
+  if (segment->digests.len != 0u && segment->digests.data == NULL) {
+    return false;
+  }
+
+  if (!amduat_asl_index_bloom_init(out_bloom)) {
+    return false;
+  }
+
+  /* Digests are consumed back to back, in record order. */
+  digest_cursor = 0u;
+  for (i = 0u; i < segment->record_count; ++i) {
+    const amduat_asl_index_record_t *record = &segment->records[i];
+    uint16_t digest_len = record->digest_len;
+    const uint8_t *digest_data;
+
+    if (digest_len == 0u) {
+      goto fail;
+    }
+    /* digest_cursor never exceeds digests.len, so this cannot underflow. */
+    if (digest_len > segment->digests.len - digest_cursor) {
+      goto fail;
+    }
+
+    digest_data = segment->digests.data + digest_cursor;
+    if (!amduat_asl_index_bloom_add(
+            *out_bloom,
+            (amduat_hash_id_t)record->hash_id,
+            amduat_octets(digest_data, digest_len))) {
+      goto fail;
+    }
+    digest_cursor += digest_len;
+  }
+
+  return true;
+
+fail:
+  amduat_octets_free(out_bloom);
+  *out_bloom = amduat_octets(NULL, 0u);
+  return false;
+}
+
 static amduat_asl_store_error_t amduat_asl_store_index_fs_read_extent_bytes(
     const char *root_path,
     const amduat_asl_extent_record_t *extent,
@@ -1198,6 +1262,13 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_scan_segments(
       return AMDUAT_ASL_STORE_ERR_INTEGRITY;
     }
 
+    if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
+                                               ref.hash_id,
+                                               ref.digest)) {
+      amduat_enc_asl_core_index_free(&segment);
+      continue;
+    }
+
     for (r = 0; r < segment.record_count; ++r) {
       const amduat_asl_index_record_t *record = &segment.records[r];
       if (!amduat_asl_store_index_fs_record_matches(&segment, record, ref)) {
@@ -1641,8 +1712,15 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
   segment.extents = &extent;
   segment.extent_count = 1u;
 
+  segment.bloom = amduat_octets(NULL, 0u);
+  if (!amduat_asl_store_index_fs_build_bloom(&segment, &segment.bloom)) {
+    amduat_octets_free(&segment.bloom);
+    segment.bloom = amduat_octets(NULL, 0u);
+  }
+
   segment_bytes = amduat_octets(NULL, 0u);
   if (!amduat_enc_asl_core_index_encode_v1(&segment, &segment_bytes)) {
+    amduat_octets_free(&segment.bloom);
     amduat_reference_free(&derived_ref);
     amduat_octets_free(&artifact_bytes);
     free(index_path);
@@ -1650,6 +1728,7 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
     free(blocks_path);
     return AMDUAT_ASL_STORE_ERR_IO;
   }
+  amduat_octets_free(&segment.bloom);
 
   if (!amduat_hash_asl1_digest(AMDUAT_HASH_ASL1_ID_SHA256,
                                segment_bytes,
diff --git a/src/near_core/asl/index_bloom.c b/src/near_core/asl/index_bloom.c
new file mode 100644
index 0000000..4ed979c
--- /dev/null
+++ b/src/near_core/asl/index_bloom.c
@@ -0,0 +1,167 @@
+#include "amduat/asl/index_bloom.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Folds len bytes into hash FNV-1a style (xor the byte, then multiply by the
+ * 64-bit FNV prime). NULL or empty input returns hash unchanged. */
+static uint64_t amduat_asl_index_bloom_hash_update(uint64_t hash,
+                                                   const uint8_t *data,
+                                                   size_t len) {
+  size_t i;
+
+  if (data == NULL || len == 0u) {
+    return hash;
+  }
+
+  for (i = 0u; i < len; ++i) {
+    hash ^= (uint64_t)data[i];
+    hash *= 1099511628211ull;
+  }
+  return hash;
+}
+
+/* Derives the two base hashes used for double hashing. Both runs cover a
+ * 4-byte header (hash_id and digest length, low byte first) followed by the
+ * digest bytes and differ only in their seed. The results select the bits
+ * written into encoded segments, so changing any constant here changes the
+ * stored filter. Fails on NULL outputs, a NULL digest with non-zero length,
+ * or a digest longer than UINT16_MAX bytes; *out_h2 is never zero. */
+static bool amduat_asl_index_bloom_hash_pair(amduat_hash_id_t hash_id,
+                                             amduat_octets_t digest,
+                                             uint64_t *out_h1,
+                                             uint64_t *out_h2) {
+  uint64_t hash;
+  uint8_t header[4];
+  uint16_t digest_len;
+
+  if (out_h1 == NULL || out_h2 == NULL) {
+    return false;
+  }
+  if (digest.len != 0u && digest.data == NULL) {
+    return false;
+  }
+  if (digest.len > UINT16_MAX) {
+    return false;
+  }
+
+  digest_len = (uint16_t)digest.len;
+  header[0] = (uint8_t)(hash_id & 0xffu);
+  header[1] = (uint8_t)((hash_id >> 8) & 0xffu);
+  header[2] = (uint8_t)(digest_len & 0xffu);
+  header[3] = (uint8_t)((digest_len >> 8) & 0xffu);
+
+  hash = 1469598103934665603ull ^ 0x9e3779b97f4a7c15ull;
+  hash = amduat_asl_index_bloom_hash_update(hash, header, sizeof(header));
+  hash = amduat_asl_index_bloom_hash_update(hash, digest.data, digest.len);
+  *out_h1 = hash;
+
+  hash = 1469598103934665603ull ^ 0xbf58476d1ce4e5b9ull;
+  hash = amduat_asl_index_bloom_hash_update(hash, header, sizeof(header));
+  hash = amduat_asl_index_bloom_hash_update(hash, digest.data, digest.len);
+  if (hash == 0u) {
+    hash = 0x94d049bb133111ebull;
+  }
+  *out_h2 = hash;
+
+  return true;
+}
+
+/* Sets bit number `bit`; bit 0 is the least significant bit of data[0]. */
+static void amduat_asl_index_bloom_set_bit(uint8_t *data, size_t bit) {
+  data[bit >> 3] |= (uint8_t)(1u << (bit & 7u));
+}
+
+/* Returns whether bit number `bit` is set, using the same numbering. */
+static bool amduat_asl_index_bloom_test_bit(const uint8_t *data, size_t bit) {
+  return (data[bit >> 3] & (uint8_t)(1u << (bit & 7u))) != 0u;
+}
+
+/* Allocates the zero-filled fixed-size filter; see index_bloom.h. */
+bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom) {
+  uint8_t *data;
+
+  if (out_bloom == NULL) {
+    return false;
+  }
+  *out_bloom = amduat_octets(NULL, 0u);
+
+  if (AMDUAT_ASL_INDEX_BLOOM_BYTES == 0u) {
+    return true;
+  }
+
+  data = (uint8_t *)calloc(1u, AMDUAT_ASL_INDEX_BLOOM_BYTES);
+  if (data == NULL) {
+    return false;
+  }
+  *out_bloom = amduat_octets(data, AMDUAT_ASL_INDEX_BLOOM_BYTES);
+  return true;
+}
+
+/* Sets the AMDUAT_ASL_INDEX_BLOOM_HASHES probe bits (h1 + i * h2) modulo the
+ * filter's bit count. A length whose bit count would overflow is rejected. */
+bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
+                                amduat_hash_id_t hash_id,
+                                amduat_octets_t digest) {
+  uint64_t h1;
+  uint64_t h2;
+  size_t i;
+  size_t bit_count;
+  uint8_t *data;
+
+  if (bloom.len == 0u) {
+    return false;
+  }
+  if (bloom.data == NULL) {
+    return false;
+  }
+  if (bloom.len > SIZE_MAX / 8u) {
+    return false;
+  }
+  bit_count = bloom.len * 8u;
+  if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &h1, &h2)) {
+    return false;
+  }
+
+  data = (uint8_t *)bloom.data;
+  for (i = 0u; i < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++i) {
+    uint64_t mix = h1 + (uint64_t)i * h2;
+    size_t bit = (size_t)(mix % (uint64_t)bit_count);
+    amduat_asl_index_bloom_set_bit(data, bit);
+  }
+
+  return true;
+}
+
+/* Tests the probe bits set by amduat_asl_index_bloom_add(). Any condition that
+ * prevents probing returns true, so callers still scan the segment. */
+bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
+                                           amduat_hash_id_t hash_id,
+                                           amduat_octets_t digest) {
+  uint64_t h1;
+  uint64_t h2;
+  size_t i;
+  size_t bit_count;
+
+  if (bloom.len == 0u || bloom.data == NULL) {
+    return true;
+  }
+  if (bloom.len > SIZE_MAX / 8u) {
+    return true;
+  }
+  bit_count = bloom.len * 8u;
+  if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &h1, &h2)) {
+    return true;
+  }
+
+  for (i = 0u; i < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++i) {
+    uint64_t mix = h1 + (uint64_t)i * h2;
+    size_t bit = (size_t)(mix % (uint64_t)bit_count);
+    if (!amduat_asl_index_bloom_test_bit(bloom.data, bit)) {
+      return false;
+    }
+  }
+
+  return true;
+}
diff --git a/tests/asl/test_asl_store_index_fs.c b/tests/asl/test_asl_store_index_fs.c
index 395a43d..23a5b05 100644
--- a/tests/asl/test_asl_store_index_fs.c
+++ b/tests/asl/test_asl_store_index_fs.c
@@ -1,10 +1,13 @@
 #include "amduat/asl/asl_store_index_fs.h"
+#include "amduat/asl/index_bloom.h"
 #include "amduat/asl/store.h"
 #include "amduat/enc/asl1_core.h"
+#include "amduat/enc/asl_core_index.h"
 #include "amduat/hash/asl1.h"
 
 #include
 #include
+#include <inttypes.h>
 #include
 #include
 #include
@@ -98,6 +101,89 @@ static bool remove_tree(const char *path) {
   return rmdir(path) == 0;
 }
 
+/* Reads the whole file at path into a malloc'd buffer (*out_bytes, *out_len).
+ * Returns false on any I/O or allocation failure; the caller frees *out_bytes. */
+static bool read_file(const char *path, uint8_t **out_bytes, size_t *out_len) {
+  FILE *fp;
+  long size;
+  uint8_t *buffer;
+  size_t read_len;
+
+  if (path == NULL || out_bytes == NULL || out_len == NULL) {
+    return false;
+  }
+  *out_bytes = NULL;
+  *out_len = 0u;
+
+  fp = fopen(path, "rb");
+  if (fp == NULL) {
+    return false;
+  }
+  if (fseek(fp, 0, SEEK_END) != 0) {
+    fclose(fp);
+    return false;
+  }
+  size = ftell(fp);
+  if (size < 0) {
+    fclose(fp);
+    return false;
+  }
+  if (fseek(fp, 0, SEEK_SET) != 0) {
+    fclose(fp);
+    return false;
+  }
+
+  buffer = (uint8_t *)malloc(size > 0 ? (size_t)size : 1u);
+  if (buffer == NULL) {
+    fclose(fp);
+    return false;
+  }
+  read_len = fread(buffer, 1u, (size_t)size, fp);
+  fclose(fp);
+  if (read_len != (size_t)size) {
+    free(buffer);
+    return false;
+  }
+
+  *out_bytes = buffer;
+  *out_len = (size_t)size;
+  return true;
+}
+
+/* Formats "<root>/index/segments/segment-<16 hex digits>.asl" into a malloc'd
+ * string stored in *out_path; the caller frees it. Returns false on failure. */
+static bool build_segment_path(const char *root,
+                               uint64_t segment_id,
+                               char **out_path) {
+  int needed;
+  char *buffer;
+
+  if (root == NULL || out_path == NULL) {
+    return false;
+  }
+
+  needed = snprintf(NULL,
+                    0,
+                    "%s/index/segments/segment-%016" PRIx64 ".asl",
+                    root,
+                    segment_id);
+  if (needed <= 0) {
+    return false;
+  }
+
+  buffer = (char *)malloc((size_t)needed + 1u);
+  if (buffer == NULL) {
+    return false;
+  }
+  snprintf(buffer,
+           (size_t)needed + 1u,
+           "%s/index/segments/segment-%016" PRIx64 ".asl",
+           root,
+           segment_id);
+  *out_path = buffer;
+  return true;
+}
+
 static char *make_temp_root(void) {
   char *templ;
   const char template_prefix[] = "/tmp/amduat_test_asl_store_index_fs_XXXXXX";
@@ -186,6 +272,60 @@ static int test_round_trip(void) {
     }
   }
 
+  {
+    char *segment_path = NULL;
+    uint8_t *segment_bytes = NULL;
+    size_t segment_len = 0u;
+    amduat_asl_core_index_segment_t segment;
+    bool bloom_nonzero = false;
+    size_t i;
+
+    memset(&segment, 0, sizeof(segment));
+
+    if (!build_segment_path(root, 1u, &segment_path)) {
+      fprintf(stderr, "segment path build failed\n");
+      goto cleanup;
+    }
+    if (!read_file(segment_path, &segment_bytes, &segment_len)) {
+      fprintf(stderr, "segment read failed\n");
+      free(segment_path);
+      goto cleanup;
+    }
+    free(segment_path);
+    if (!amduat_enc_asl_core_index_decode_v1(
+            amduat_octets(segment_bytes, segment_len), &segment)) {
+      fprintf(stderr, "segment decode failed\n");
+      free(segment_bytes);
+      goto cleanup;
+    }
+    free(segment_bytes);
+
+    if (segment.bloom.len != AMDUAT_ASL_INDEX_BLOOM_BYTES) {
+      fprintf(stderr, "segment bloom size mismatch\n");
+      amduat_enc_asl_core_index_free(&segment);
+      goto cleanup;
+    }
+    if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
+                                               ref.hash_id,
+                                               ref.digest)) {
+      fprintf(stderr, "segment bloom missing digest\n");
+      amduat_enc_asl_core_index_free(&segment);
+      goto cleanup;
+    }
+    for (i = 0u; i < segment.bloom.len; ++i) {
+      if (segment.bloom.data[i] != 0u) {
+        bloom_nonzero = true;
+        break;
+      }
+    }
+    if (!bloom_nonzero) {
+      fprintf(stderr, "segment bloom was empty\n");
+      amduat_enc_asl_core_index_free(&segment);
+      goto cleanup;
+    }
+    amduat_enc_asl_core_index_free(&segment);
+  }
+
   exit_code = 0;
 
 cleanup: