Add bloom filters to ASL index segments

This commit is contained in:
Carl Niklas Rydberg 2026-01-17 16:55:46 +01:00
parent 06a96f25db
commit 0667cee17d
5 changed files with 388 additions and 0 deletions

View file

@ -63,6 +63,7 @@ set(AMDUAT_ASL_SRCS
src/kernel/asl/core.c
src/near_core/asl/artifact_io.c
src/near_core/asl/io.c
src/near_core/asl/index_bloom.c
src/near_core/asl/index_snapshot.c
src/near_core/asl/index_replay.c
src/near_core/asl/parse.c

View file

@ -0,0 +1,33 @@
#ifndef AMDUAT_ASL_INDEX_BLOOM_H
#define AMDUAT_ASL_INDEX_BLOOM_H
#include "amduat/asl/core.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Fixed parameters of the index-segment bloom filter.
 *
 * AMDUAT_ASL_INDEX_BLOOM_BYTES  - size in bytes of the bit array that
 *                                 amduat_asl_index_bloom_init() allocates.
 * AMDUAT_ASL_INDEX_BLOOM_HASHES - number of bit positions set per added
 *                                 entry and tested per lookup.
 */
enum {
AMDUAT_ASL_INDEX_BLOOM_BYTES = 256,
AMDUAT_ASL_INDEX_BLOOM_HASHES = 4
};
/*
 * Allocates a zero-filled filter of AMDUAT_ASL_INDEX_BLOOM_BYTES bytes and
 * stores it in *out_bloom.
 *
 * Returns false when out_bloom is NULL or the allocation fails; in the
 * allocation-failure case *out_bloom is left as the empty value (NULL, 0).
 * On success the buffer is heap-allocated and the caller is responsible
 * for releasing it.
 */
bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom);
/*
 * Records the (hash_id, digest) pair in the filter by setting
 * AMDUAT_ASL_INDEX_BLOOM_HASHES bits of the buffer bloom.data points to.
 * The bloom descriptor is passed by value, but the underlying bytes are
 * modified in place.
 *
 * Returns false (and sets nothing) when the filter is empty or has no
 * data pointer, when digest.data is NULL with a non-zero length, or when
 * digest.len exceeds UINT16_MAX.
 */
bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
amduat_hash_id_t hash_id,
amduat_octets_t digest);
/*
 * Tests whether the (hash_id, digest) pair may have been added to the
 * filter.
 *
 * Returns false only when at least one of the probed bits is clear, i.e.
 * the pair was definitely not added. Returns true when every probed bit is
 * set, and also - conservatively - when the filter is empty or has no data
 * pointer, or when the digest cannot be hashed (NULL data with non-zero
 * length, or length above UINT16_MAX).
 */
bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
amduat_hash_id_t hash_id,
amduat_octets_t digest);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* AMDUAT_ASL_INDEX_BLOOM_H */

View file

@ -1,6 +1,7 @@
#include "amduat/asl/asl_store_index_fs.h"
#include "asl_store_index_fs_layout.h"
#include "amduat/asl/index_bloom.h"
#include "amduat/asl/index_snapshot.h"
#include "amduat/asl/index_replay.h"
#include "amduat/asl/ref_derive.h"
@ -1028,6 +1029,56 @@ static bool amduat_asl_store_index_fs_record_matches(
return memcmp(digest, ref.digest.data, record->digest_len) == 0;
}
/*
 * Builds a bloom filter covering every record of an index segment.
 *
 * The digests of all records are expected to be packed back to back in
 * segment->digests, in record order; each record contributes digest_len
 * bytes. Every (hash_id, digest) pair is added to a freshly allocated
 * filter that is returned through *out_bloom.
 *
 * Returns true with an empty filter (NULL, 0) when segment is NULL or has
 * no records. Returns false when out_bloom is NULL, when the segment is
 * malformed (record_count set but no record array, digest bytes announced
 * but no digest buffer, a zero-length digest, or a digest that would run
 * past the end of the digest buffer), or when allocating / populating the
 * filter fails. On every failure *out_bloom (if non-NULL) is left as the
 * empty value, so a caller that frees it again afterwards is harmless.
 */
static bool amduat_asl_store_index_fs_build_bloom(
    const amduat_asl_core_index_segment_t *segment,
    amduat_octets_t *out_bloom) {
  size_t i;
  size_t digest_cursor = 0u;

  if (out_bloom == NULL) {
    return false;
  }
  *out_bloom = amduat_octets(NULL, 0u);
  if (segment == NULL || segment->record_count == 0u) {
    return true;
  }
  /* A non-zero record count without a record array cannot be walked. */
  if (segment->records == NULL) {
    return false;
  }
  if (segment->digests.len != 0u && segment->digests.data == NULL) {
    return false;
  }
  if (!amduat_asl_index_bloom_init(out_bloom)) {
    return false;
  }

  for (i = 0u; i < segment->record_count; ++i) {
    const amduat_asl_index_record_t *record = &segment->records[i];
    uint16_t digest_len = record->digest_len;
    const uint8_t *digest_data;

    if (digest_len == 0u) {
      goto fail;
    }
    /* digest_cursor never exceeds digests.len, so this cannot underflow. */
    if (digest_len > segment->digests.len - digest_cursor) {
      goto fail;
    }
    digest_data = segment->digests.data + digest_cursor;
    if (!amduat_asl_index_bloom_add(
            *out_bloom,
            (amduat_hash_id_t)record->hash_id,
            amduat_octets(digest_data, digest_len))) {
      goto fail;
    }
    digest_cursor += digest_len;
  }
  return true;

fail:
  amduat_octets_free(out_bloom);
  /*
   * Explicitly reset the descriptor: whether amduat_octets_free() clears it
   * is not visible from here, and callers may free it again after failure.
   */
  *out_bloom = amduat_octets(NULL, 0u);
  return false;
}
static amduat_asl_store_error_t amduat_asl_store_index_fs_read_extent_bytes(
const char *root_path,
const amduat_asl_extent_record_t *extent,
@ -1198,6 +1249,13 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_scan_segments(
return AMDUAT_ASL_STORE_ERR_INTEGRITY;
}
if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
ref.hash_id,
ref.digest)) {
amduat_enc_asl_core_index_free(&segment);
continue;
}
for (r = 0; r < segment.record_count; ++r) {
const amduat_asl_index_record_t *record = &segment.records[r];
if (!amduat_asl_store_index_fs_record_matches(&segment, record, ref)) {
@ -1641,8 +1699,15 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
segment.extents = &extent;
segment.extent_count = 1u;
segment.bloom = amduat_octets(NULL, 0u);
if (!amduat_asl_store_index_fs_build_bloom(&segment, &segment.bloom)) {
amduat_octets_free(&segment.bloom);
segment.bloom = amduat_octets(NULL, 0u);
}
segment_bytes = amduat_octets(NULL, 0u);
if (!amduat_enc_asl_core_index_encode_v1(&segment, &segment_bytes)) {
amduat_octets_free(&segment.bloom);
amduat_reference_free(&derived_ref);
amduat_octets_free(&artifact_bytes);
free(index_path);
@ -1650,6 +1715,7 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
free(blocks_path);
return AMDUAT_ASL_STORE_ERR_IO;
}
amduat_octets_free(&segment.bloom);
if (!amduat_hash_asl1_digest(AMDUAT_HASH_ASL1_ID_SHA256,
segment_bytes,

View file

@ -0,0 +1,152 @@
#include "amduat/asl/index_bloom.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
/*
 * Folds len bytes from data into a running 64-bit hash using the FNV-1a
 * step: xor in the byte, then multiply by the 64-bit FNV prime (with
 * wrap-around). A NULL or empty input returns the hash unchanged.
 */
static uint64_t amduat_asl_index_bloom_hash_update(uint64_t hash,
                                                   const uint8_t *data,
                                                   size_t len) {
  const uint64_t multiplier = 1099511628211ull;
  const uint8_t *cursor;
  const uint8_t *end;

  if (data == NULL || len == 0u) {
    return hash;
  }
  end = data + len;
  for (cursor = data; cursor != end; ++cursor) {
    hash = (hash ^ (uint64_t)*cursor) * multiplier;
  }
  return hash;
}
/*
 * Derives the two 64-bit hashes used to place an entry in the filter.
 *
 * Both hashes run the same byte stream through
 * amduat_asl_index_bloom_hash_update(): a 4-byte header (hash_id and the
 * digest length, each as 16 bits little-endian) followed by the digest
 * bytes. They differ only in the constant xored into the starting value.
 * The second hash is never returned as zero; a zero result is replaced by
 * a fixed non-zero constant.
 *
 * Returns false when either output pointer is NULL, when digest.data is
 * NULL with a non-zero length, or when digest.len does not fit in 16 bits.
 *
 * NOTE(review): the base value 1469598103934665603 is one digit shorter
 * than the standard 64-bit FNV offset basis (14695981039346656037). It
 * looks like a transcription slip, but filters stored in already-encoded
 * segments depend on it, so it must not be changed casually.
 */
static bool amduat_asl_index_bloom_hash_pair(amduat_hash_id_t hash_id,
                                             amduat_octets_t digest,
                                             uint64_t *out_h1,
                                             uint64_t *out_h2) {
  static const uint64_t seed_tweaks[2] = {
    0x9e3779b97f4a7c15ull,
    0xbf58476d1ce4e5b9ull
  };
  const uint64_t seed_base = 1469598103934665603ull;
  uint64_t results[2];
  uint8_t prefix[4];
  uint16_t len16;
  size_t k;

  if (out_h1 == NULL || out_h2 == NULL) {
    return false;
  }
  if (digest.data == NULL && digest.len != 0u) {
    return false;
  }
  if (digest.len > UINT16_MAX) {
    return false;
  }
  len16 = (uint16_t)digest.len;

  prefix[0] = (uint8_t)(hash_id & 0xffu);
  prefix[1] = (uint8_t)((hash_id >> 8) & 0xffu);
  prefix[2] = (uint8_t)(len16 & 0xffu);
  prefix[3] = (uint8_t)((len16 >> 8) & 0xffu);

  for (k = 0u; k < 2u; ++k) {
    uint64_t acc = seed_base ^ seed_tweaks[k];
    acc = amduat_asl_index_bloom_hash_update(acc, prefix, sizeof(prefix));
    acc = amduat_asl_index_bloom_hash_update(acc, digest.data, digest.len);
    results[k] = acc;
  }

  *out_h1 = results[0];
  /* A zero stride would make every probe land on the same bit. */
  *out_h2 = (results[1] == 0u) ? 0x94d049bb133111ebull : results[1];
  return true;
}
/*
 * Sets bit number `bit` in the byte array `data`. Bits are numbered from
 * the least significant bit of data[0] upward. No bounds check is done.
 */
static void amduat_asl_index_bloom_set_bit(uint8_t *data, size_t bit) {
  const size_t byte_index = bit / 8u;
  const uint8_t mask = (uint8_t)(1u << (bit % 8u));

  data[byte_index] = (uint8_t)(data[byte_index] | mask);
}
/*
 * Reports whether bit number `bit` is set in the byte array `data`, using
 * the same numbering as amduat_asl_index_bloom_set_bit() (least
 * significant bit of data[0] is bit 0). No bounds check is done.
 */
static bool amduat_asl_index_bloom_test_bit(const uint8_t *data, size_t bit) {
  const uint8_t octet = data[bit / 8u];
  const unsigned shift = (unsigned)(bit % 8u);

  return ((octet >> shift) & 1u) != 0u;
}
/*
 * Creates an empty (all bits clear) filter of
 * AMDUAT_ASL_INDEX_BLOOM_BYTES bytes in *out_bloom.
 *
 * *out_bloom is first reset to the empty value (NULL, 0), so that is what
 * the caller sees if the allocation fails. Returns false when out_bloom is
 * NULL or memory cannot be obtained; true otherwise. The buffer comes from
 * calloc().
 */
bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom) {
  uint8_t *storage = NULL;

  if (out_bloom == NULL) {
    return false;
  }
  *out_bloom = amduat_octets(NULL, 0u);

  /* A zero-sized configuration legitimately yields an empty filter. */
  if (AMDUAT_ASL_INDEX_BLOOM_BYTES != 0u) {
    storage = (uint8_t *)calloc(AMDUAT_ASL_INDEX_BLOOM_BYTES, sizeof(*storage));
    if (storage == NULL) {
      return false;
    }
    *out_bloom = amduat_octets(storage, AMDUAT_ASL_INDEX_BLOOM_BYTES);
  }
  return true;
}
/*
 * Adds the (hash_id, digest) pair to the filter.
 *
 * Two hashes are derived from the pair and combined by double hashing:
 * probe i is (h1 + i * h2) modulo the number of bits in the filter, for
 * i in [0, AMDUAT_ASL_INDEX_BLOOM_HASHES). Each probed bit is set in the
 * buffer bloom.data points to; the descriptor itself is passed by value.
 *
 * Returns false without touching the filter when it is empty, has no data
 * pointer, or the pair cannot be hashed.
 */
bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
                                amduat_hash_id_t hash_id,
                                amduat_octets_t digest) {
  uint64_t base;
  uint64_t stride;
  uint64_t probe;
  size_t total_bits;
  size_t round;
  uint8_t *bits;

  if (bloom.len == 0u || bloom.data == NULL) {
    return false;
  }
  total_bits = bloom.len * 8u;
  if (total_bits == 0u) {
    return false;
  }
  if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &base, &stride)) {
    return false;
  }

  bits = (uint8_t *)bloom.data;
  /* Accumulating the stride yields base + round * stride (mod 2^64). */
  probe = base;
  for (round = 0u; round < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++round) {
    amduat_asl_index_bloom_set_bit(bits,
                                   (size_t)(probe % (uint64_t)total_bits));
    probe += stride;
  }
  return true;
}
/*
 * Checks whether the (hash_id, digest) pair may be present in the filter.
 *
 * The same AMDUAT_ASL_INDEX_BLOOM_HASHES probe positions used when adding
 * ((h1 + i * h2) modulo the bit count) are inspected; the first clear bit
 * proves the pair was never added and yields false.
 *
 * Every situation in which the filter cannot give an answer - no filter
 * data, or a pair that cannot be hashed - returns true, so that callers
 * fall back to a full search rather than skipping one.
 */
bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
                                           amduat_hash_id_t hash_id,
                                           amduat_octets_t digest) {
  uint64_t base;
  uint64_t stride;
  uint64_t probe;
  size_t total_bits;
  size_t round;

  if (bloom.data == NULL || bloom.len == 0u) {
    return true;
  }
  total_bits = bloom.len * 8u;
  if (total_bits == 0u) {
    return true;
  }
  if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &base, &stride)) {
    return true;
  }

  /* Accumulating the stride yields base + round * stride (mod 2^64). */
  probe = base;
  for (round = 0u; round < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++round) {
    const size_t position = (size_t)(probe % (uint64_t)total_bits);

    if (!amduat_asl_index_bloom_test_bit(bloom.data, position)) {
      return false;
    }
    probe += stride;
  }
  return true;
}

View file

@ -1,10 +1,13 @@
#include "amduat/asl/asl_store_index_fs.h"
#include "amduat/asl/index_bloom.h"
#include "amduat/asl/store.h"
#include "amduat/enc/asl1_core.h"
#include "amduat/enc/asl_core_index.h"
#include "amduat/hash/asl1.h"
#include <dirent.h>
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
@ -98,6 +101,85 @@ static bool remove_tree(const char *path) {
return rmdir(path) == 0;
}
/*
 * Reads the entire file at `path` into a freshly malloc'ed buffer.
 *
 * On success returns true, stores the buffer in *out_bytes and its length
 * in *out_len; the caller must free() the buffer. An empty file succeeds
 * with *out_len == 0 and a non-NULL (one byte) buffer, so the result does
 * not depend on what malloc(0) returns on the platform.
 *
 * On failure (NULL argument, open/seek/tell error, allocation failure or
 * short read) returns false and leaves *out_bytes == NULL, *out_len == 0
 * whenever the output pointers themselves are valid.
 */
static bool read_file(const char *path, uint8_t **out_bytes, size_t *out_len) {
  FILE *fp = NULL;
  uint8_t *buffer = NULL;
  long size;
  size_t wanted;
  bool ok = false;

  if (path == NULL || out_bytes == NULL || out_len == NULL) {
    return false;
  }
  *out_bytes = NULL;
  *out_len = 0u;

  fp = fopen(path, "rb");
  if (fp == NULL) {
    return false;
  }

  /* Determine the file size by seeking to the end and back. */
  if (fseek(fp, 0, SEEK_END) != 0) {
    goto done;
  }
  size = ftell(fp);
  if (size < 0) {
    goto done;
  }
  if (fseek(fp, 0, SEEK_SET) != 0) {
    goto done;
  }
  wanted = (size_t)size;

  /* malloc(0) may legally return NULL; always request at least one byte. */
  buffer = (uint8_t *)malloc(wanted != 0u ? wanted : 1u);
  if (buffer == NULL) {
    goto done;
  }
  if (fread(buffer, 1u, wanted, fp) != wanted) {
    goto done;
  }

  *out_bytes = buffer;
  *out_len = wanted;
  buffer = NULL; /* ownership moved to the caller */
  ok = true;

done:
  free(buffer);
  fclose(fp);
  return ok;
}
/*
 * Formats the path of an index segment file,
 * "<root>/index/segments/segment-<id as 16 lowercase hex digits>.asl",
 * into a newly malloc'ed string returned through *out_path.
 *
 * The required length is measured with a sizing snprintf() call before the
 * buffer is allocated. Returns false (leaving *out_path untouched) when an
 * argument is NULL, the length cannot be determined, or allocation fails.
 * The caller frees the returned string.
 */
static bool build_segment_path(const char *root,
                               uint64_t segment_id,
                               char **out_path) {
  char *path;
  size_t capacity;
  int length;

  if (out_path == NULL || root == NULL) {
    return false;
  }

  length = snprintf(NULL, 0, "%s/index/segments/segment-%016" PRIx64 ".asl",
                    root, segment_id);
  if (length <= 0) {
    return false;
  }

  capacity = (size_t)length + 1u; /* room for the terminating NUL */
  path = (char *)malloc(capacity);
  if (path == NULL) {
    return false;
  }

  snprintf(path, capacity, "%s/index/segments/segment-%016" PRIx64 ".asl",
           root, segment_id);
  *out_path = path;
  return true;
}
static char *make_temp_root(void) {
char *templ;
const char template_prefix[] = "/tmp/amduat_test_asl_store_index_fs_XXXXXX";
@ -186,6 +268,60 @@ static int test_round_trip(void) {
}
}
{
char *segment_path = NULL;
uint8_t *segment_bytes = NULL;
size_t segment_len = 0u;
amduat_asl_core_index_segment_t segment;
bool bloom_nonzero = false;
size_t i;
memset(&segment, 0, sizeof(segment));
if (!build_segment_path(root, 1u, &segment_path)) {
fprintf(stderr, "segment path build failed\n");
goto cleanup;
}
if (!read_file(segment_path, &segment_bytes, &segment_len)) {
fprintf(stderr, "segment read failed\n");
free(segment_path);
goto cleanup;
}
free(segment_path);
if (!amduat_enc_asl_core_index_decode_v1(
amduat_octets(segment_bytes, segment_len), &segment)) {
fprintf(stderr, "segment decode failed\n");
free(segment_bytes);
goto cleanup;
}
free(segment_bytes);
if (segment.bloom.len != AMDUAT_ASL_INDEX_BLOOM_BYTES) {
fprintf(stderr, "segment bloom size mismatch\n");
amduat_enc_asl_core_index_free(&segment);
goto cleanup;
}
if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
ref.hash_id,
ref.digest)) {
fprintf(stderr, "segment bloom missing digest\n");
amduat_enc_asl_core_index_free(&segment);
goto cleanup;
}
for (i = 0u; i < segment.bloom.len; ++i) {
if (segment.bloom.data[i] != 0u) {
bloom_nonzero = true;
break;
}
}
if (!bloom_nonzero) {
fprintf(stderr, "segment bloom was empty\n");
amduat_enc_asl_core_index_free(&segment);
goto cleanup;
}
amduat_enc_asl_core_index_free(&segment);
}
exit_code = 0;
cleanup: