Add bloom filters to ASL index segments
This commit is contained in:
parent
06a96f25db
commit
0667cee17d
|
|
@ -63,6 +63,7 @@ set(AMDUAT_ASL_SRCS
|
|||
src/kernel/asl/core.c
|
||||
src/near_core/asl/artifact_io.c
|
||||
src/near_core/asl/io.c
|
||||
src/near_core/asl/index_bloom.c
|
||||
src/near_core/asl/index_snapshot.c
|
||||
src/near_core/asl/index_replay.c
|
||||
src/near_core/asl/parse.c
|
||||
|
|
|
|||
33
include/amduat/asl/index_bloom.h
Normal file
33
include/amduat/asl/index_bloom.h
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef AMDUAT_ASL_INDEX_BLOOM_H
|
||||
#define AMDUAT_ASL_INDEX_BLOOM_H
|
||||
|
||||
#include "amduat/asl/core.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
 * Fixed geometry of a segment bloom filter.
 *
 * AMDUAT_ASL_INDEX_BLOOM_BYTES  - size in bytes of the bit array allocated by
 *                                 amduat_asl_index_bloom_init().
 * AMDUAT_ASL_INDEX_BLOOM_HASHES - number of bit positions set per added entry
 *                                 and tested per lookup.
 */
enum {
|
||||
AMDUAT_ASL_INDEX_BLOOM_BYTES = 256,
|
||||
AMDUAT_ASL_INDEX_BLOOM_HASHES = 4
|
||||
};
|
||||
|
||||
/*
 * Allocates a zero-filled filter of AMDUAT_ASL_INDEX_BLOOM_BYTES bytes and
 * stores it in *out_bloom.
 *
 * Returns false when out_bloom is NULL or the allocation fails; in the latter
 * case *out_bloom is left empty (NULL data, zero length). The buffer is heap
 * allocated; callers in this codebase release it with amduat_octets_free().
 */
bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom);
|
||||
|
||||
/*
 * Records the (hash_id, digest) pair in the filter by setting
 * AMDUAT_ASL_INDEX_BLOOM_HASHES bits in bloom's buffer. Although bloom is
 * passed by value, the bytes it points at are modified in place.
 *
 * Returns false (and changes nothing) when bloom is empty or has NULL data,
 * when digest has a non-zero length but NULL data, or when digest is longer
 * than UINT16_MAX bytes. Returns true once the bits have been set.
 */
bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
|
||||
amduat_hash_id_t hash_id,
|
||||
amduat_octets_t digest);
|
||||
|
||||
/*
 * Tests whether the (hash_id, digest) pair may have been added to the filter.
 *
 * Returns false only when at least one of the probed bits is clear, meaning
 * the pair was never added. Returns true in every other case, including when
 * bloom is empty or has NULL data, or when the digest cannot be hashed, so a
 * missing or unusable filter never causes a lookup to be skipped.
 */
bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
|
||||
amduat_hash_id_t hash_id,
|
||||
amduat_octets_t digest);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* AMDUAT_ASL_INDEX_BLOOM_H */
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
#include "amduat/asl/asl_store_index_fs.h"
|
||||
|
||||
#include "asl_store_index_fs_layout.h"
|
||||
#include "amduat/asl/index_bloom.h"
|
||||
#include "amduat/asl/index_snapshot.h"
|
||||
#include "amduat/asl/index_replay.h"
|
||||
#include "amduat/asl/ref_derive.h"
|
||||
|
|
@ -1028,6 +1029,56 @@ static bool amduat_asl_store_index_fs_record_matches(
|
|||
return memcmp(digest, ref.digest.data, record->digest_len) == 0;
|
||||
}
|
||||
|
||||
static bool amduat_asl_store_index_fs_build_bloom(
|
||||
const amduat_asl_core_index_segment_t *segment,
|
||||
amduat_octets_t *out_bloom) {
|
||||
size_t i;
|
||||
size_t digest_cursor;
|
||||
|
||||
if (out_bloom == NULL) {
|
||||
return false;
|
||||
}
|
||||
*out_bloom = amduat_octets(NULL, 0u);
|
||||
if (segment == NULL || segment->record_count == 0u) {
|
||||
return true;
|
||||
}
|
||||
if (segment->digests.len != 0u && segment->digests.data == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!amduat_asl_index_bloom_init(out_bloom)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
digest_cursor = 0u;
|
||||
for (i = 0u; i < segment->record_count; ++i) {
|
||||
const amduat_asl_index_record_t *record = &segment->records[i];
|
||||
uint16_t digest_len = record->digest_len;
|
||||
const uint8_t *digest_data;
|
||||
|
||||
if (digest_len == 0u) {
|
||||
amduat_octets_free(out_bloom);
|
||||
return false;
|
||||
}
|
||||
if (digest_len > segment->digests.len - digest_cursor) {
|
||||
amduat_octets_free(out_bloom);
|
||||
return false;
|
||||
}
|
||||
|
||||
digest_data = segment->digests.data + digest_cursor;
|
||||
if (!amduat_asl_index_bloom_add(
|
||||
*out_bloom,
|
||||
(amduat_hash_id_t)record->hash_id,
|
||||
amduat_octets(digest_data, digest_len))) {
|
||||
amduat_octets_free(out_bloom);
|
||||
return false;
|
||||
}
|
||||
digest_cursor += digest_len;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static amduat_asl_store_error_t amduat_asl_store_index_fs_read_extent_bytes(
|
||||
const char *root_path,
|
||||
const amduat_asl_extent_record_t *extent,
|
||||
|
|
@ -1198,6 +1249,13 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_scan_segments(
|
|||
return AMDUAT_ASL_STORE_ERR_INTEGRITY;
|
||||
}
|
||||
|
||||
if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
|
||||
ref.hash_id,
|
||||
ref.digest)) {
|
||||
amduat_enc_asl_core_index_free(&segment);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (r = 0; r < segment.record_count; ++r) {
|
||||
const amduat_asl_index_record_t *record = &segment.records[r];
|
||||
if (!amduat_asl_store_index_fs_record_matches(&segment, record, ref)) {
|
||||
|
|
@ -1641,8 +1699,15 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
|
|||
segment.extents = &extent;
|
||||
segment.extent_count = 1u;
|
||||
|
||||
segment.bloom = amduat_octets(NULL, 0u);
|
||||
if (!amduat_asl_store_index_fs_build_bloom(&segment, &segment.bloom)) {
|
||||
amduat_octets_free(&segment.bloom);
|
||||
segment.bloom = amduat_octets(NULL, 0u);
|
||||
}
|
||||
|
||||
segment_bytes = amduat_octets(NULL, 0u);
|
||||
if (!amduat_enc_asl_core_index_encode_v1(&segment, &segment_bytes)) {
|
||||
amduat_octets_free(&segment.bloom);
|
||||
amduat_reference_free(&derived_ref);
|
||||
amduat_octets_free(&artifact_bytes);
|
||||
free(index_path);
|
||||
|
|
@ -1650,6 +1715,7 @@ static amduat_asl_store_error_t amduat_asl_store_index_fs_put_indexed_impl(
|
|||
free(blocks_path);
|
||||
return AMDUAT_ASL_STORE_ERR_IO;
|
||||
}
|
||||
amduat_octets_free(&segment.bloom);
|
||||
|
||||
if (!amduat_hash_asl1_digest(AMDUAT_HASH_ASL1_ID_SHA256,
|
||||
segment_bytes,
|
||||
|
|
|
|||
152
src/near_core/asl/index_bloom.c
Normal file
152
src/near_core/asl/index_bloom.c
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
#include "amduat/asl/index_bloom.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/*
 * Folds `len` bytes of `data` into a running 64-bit hash. Each byte is XORed
 * into the hash, which is then multiplied by 1099511628211 (the FNV-1a 64-bit
 * prime), wrapping modulo 2^64.
 *
 * Returns `hash` unchanged when `data` is NULL or `len` is zero.
 */
static uint64_t amduat_asl_index_bloom_hash_update(uint64_t hash,
                                                   const uint8_t *data,
                                                   size_t len) {
  const uint64_t prime = 1099511628211ull;
  const uint8_t *cursor;
  const uint8_t *end;

  if (data == NULL || len == 0u) {
    return hash;
  }

  end = data + len;
  for (cursor = data; cursor != end; ++cursor) {
    hash = (hash ^ (uint64_t)*cursor) * prime;
  }
  return hash;
}
|
||||
|
||||
static bool amduat_asl_index_bloom_hash_pair(amduat_hash_id_t hash_id,
|
||||
amduat_octets_t digest,
|
||||
uint64_t *out_h1,
|
||||
uint64_t *out_h2) {
|
||||
uint64_t hash;
|
||||
uint8_t header[4];
|
||||
uint16_t digest_len;
|
||||
|
||||
if (out_h1 == NULL || out_h2 == NULL) {
|
||||
return false;
|
||||
}
|
||||
if (digest.len != 0u && digest.data == NULL) {
|
||||
return false;
|
||||
}
|
||||
if (digest.len > UINT16_MAX) {
|
||||
return false;
|
||||
}
|
||||
|
||||
digest_len = (uint16_t)digest.len;
|
||||
header[0] = (uint8_t)(hash_id & 0xffu);
|
||||
header[1] = (uint8_t)((hash_id >> 8) & 0xffu);
|
||||
header[2] = (uint8_t)(digest_len & 0xffu);
|
||||
header[3] = (uint8_t)((digest_len >> 8) & 0xffu);
|
||||
|
||||
hash = 1469598103934665603ull ^ 0x9e3779b97f4a7c15ull;
|
||||
hash = amduat_asl_index_bloom_hash_update(hash, header, sizeof(header));
|
||||
hash = amduat_asl_index_bloom_hash_update(hash, digest.data, digest.len);
|
||||
*out_h1 = hash;
|
||||
|
||||
hash = 1469598103934665603ull ^ 0xbf58476d1ce4e5b9ull;
|
||||
hash = amduat_asl_index_bloom_hash_update(hash, header, sizeof(header));
|
||||
hash = amduat_asl_index_bloom_hash_update(hash, digest.data, digest.len);
|
||||
if (hash == 0u) {
|
||||
hash = 0x94d049bb133111ebull;
|
||||
}
|
||||
*out_h2 = hash;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Sets bit number `bit` in the byte array `data`. Bit 0 is the least
 * significant bit of data[0]; bit 8 is the least significant bit of data[1].
 * The caller must ensure bit / 8 is within the array.
 */
static void amduat_asl_index_bloom_set_bit(uint8_t *data, size_t bit) {
  const size_t byte_index = bit / 8u;
  const uint8_t mask = (uint8_t)(1u << (bit % 8u));

  data[byte_index] = (uint8_t)(data[byte_index] | mask);
}
|
||||
|
||||
/*
 * Reports whether bit number `bit` is set in the byte array `data`, using the
 * same numbering as amduat_asl_index_bloom_set_bit(): bit 0 is the least
 * significant bit of data[0]. The caller must ensure bit / 8 is within the
 * array.
 */
static bool amduat_asl_index_bloom_test_bit(const uint8_t *data, size_t bit) {
  const uint8_t mask = (uint8_t)(1u << (bit % 8u));
  const uint8_t byte = data[bit / 8u];

  return (byte & mask) != 0u;
}
|
||||
|
||||
/*
 * Creates an empty bloom filter: a zero-filled heap buffer of
 * AMDUAT_ASL_INDEX_BLOOM_BYTES bytes, returned through *out_bloom.
 *
 * Returns false when out_bloom is NULL or the allocation fails; on allocation
 * failure *out_bloom is left as empty octets (NULL, 0). If the configured
 * size were zero, the call succeeds with *out_bloom empty.
 */
bool amduat_asl_index_bloom_init(amduat_octets_t *out_bloom) {
  uint8_t *storage;

  if (out_bloom == NULL) {
    return false;
  }
  *out_bloom = amduat_octets(NULL, 0u);

  if (AMDUAT_ASL_INDEX_BLOOM_BYTES != 0u) {
    storage = (uint8_t *)calloc(1u, AMDUAT_ASL_INDEX_BLOOM_BYTES);
    if (storage == NULL) {
      return false;
    }
    *out_bloom = amduat_octets(storage, AMDUAT_ASL_INDEX_BLOOM_BYTES);
  }

  return true;
}
|
||||
|
||||
bool amduat_asl_index_bloom_add(amduat_octets_t bloom,
|
||||
amduat_hash_id_t hash_id,
|
||||
amduat_octets_t digest) {
|
||||
uint64_t h1;
|
||||
uint64_t h2;
|
||||
size_t i;
|
||||
size_t bit_count;
|
||||
uint8_t *data;
|
||||
|
||||
if (bloom.len == 0u) {
|
||||
return false;
|
||||
}
|
||||
if (bloom.data == NULL) {
|
||||
return false;
|
||||
}
|
||||
bit_count = bloom.len * 8u;
|
||||
if (bit_count == 0u) {
|
||||
return false;
|
||||
}
|
||||
if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &h1, &h2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
data = (uint8_t *)bloom.data;
|
||||
for (i = 0u; i < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++i) {
|
||||
uint64_t mix = h1 + (uint64_t)i * h2;
|
||||
size_t bit = (size_t)(mix % (uint64_t)bit_count);
|
||||
amduat_asl_index_bloom_set_bit(data, bit);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool amduat_asl_index_bloom_maybe_contains(amduat_octets_t bloom,
|
||||
amduat_hash_id_t hash_id,
|
||||
amduat_octets_t digest) {
|
||||
uint64_t h1;
|
||||
uint64_t h2;
|
||||
size_t i;
|
||||
size_t bit_count;
|
||||
|
||||
if (bloom.len == 0u || bloom.data == NULL) {
|
||||
return true;
|
||||
}
|
||||
bit_count = bloom.len * 8u;
|
||||
if (bit_count == 0u) {
|
||||
return true;
|
||||
}
|
||||
if (!amduat_asl_index_bloom_hash_pair(hash_id, digest, &h1, &h2)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (i = 0u; i < AMDUAT_ASL_INDEX_BLOOM_HASHES; ++i) {
|
||||
uint64_t mix = h1 + (uint64_t)i * h2;
|
||||
size_t bit = (size_t)(mix % (uint64_t)bit_count);
|
||||
if (!amduat_asl_index_bloom_test_bit(bloom.data, bit)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1,10 +1,13 @@
|
|||
#include "amduat/asl/asl_store_index_fs.h"
|
||||
#include "amduat/asl/index_bloom.h"
|
||||
#include "amduat/asl/store.h"
|
||||
#include "amduat/enc/asl1_core.h"
|
||||
#include "amduat/enc/asl_core_index.h"
|
||||
#include "amduat/hash/asl1.h"
|
||||
|
||||
#include <dirent.h>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
|
@ -98,6 +101,85 @@ static bool remove_tree(const char *path) {
|
|||
return rmdir(path) == 0;
|
||||
}
|
||||
|
||||
/*
 * Reads the entire file at `path` into a newly allocated buffer.
 *
 * On success returns true, stores the buffer in *out_bytes and its length in
 * *out_len; the caller releases the buffer with free(). An empty file
 * succeeds with *out_len == 0 and a non-NULL buffer.
 *
 * On failure returns false. If any argument is NULL nothing is written;
 * otherwise *out_bytes is NULL and *out_len is 0.
 */
static bool read_file(const char *path, uint8_t **out_bytes, size_t *out_len) {
  FILE *fp;
  long size;
  size_t len;
  uint8_t *buffer = NULL;
  bool ok = false;

  if (path == NULL || out_bytes == NULL || out_len == NULL) {
    return false;
  }
  *out_bytes = NULL;
  *out_len = 0u;

  fp = fopen(path, "rb");
  if (fp == NULL) {
    return false;
  }

  /* Determine the file size by seeking to the end, then rewind. */
  if (fseek(fp, 0, SEEK_END) != 0) {
    goto done;
  }
  size = ftell(fp);
  if (size < 0) {
    goto done;
  }
  if (fseek(fp, 0, SEEK_SET) != 0) {
    goto done;
  }
  len = (size_t)size;

  /* malloc(0) is allowed to return NULL; always request at least one byte so
   * an empty file is not mistaken for an allocation failure. */
  buffer = (uint8_t *)malloc(len != 0u ? len : 1u);
  if (buffer == NULL) {
    goto done;
  }
  if (fread(buffer, 1u, len, fp) != len) {
    goto done;
  }
  ok = true;

done:
  fclose(fp);
  if (!ok) {
    free(buffer);
    return false;
  }

  *out_bytes = buffer;
  *out_len = len;
  return true;
}
|
||||
|
||||
/*
 * Formats the on-disk path of an index segment:
 *   <root>/index/segments/segment-<segment_id as 16 lowercase hex digits>.asl
 *
 * On success returns true and stores a malloc'd, NUL-terminated string in
 * *out_path; the caller frees it. Returns false, leaving *out_path untouched,
 * when `root` or `out_path` is NULL, when the length cannot be computed, or
 * when allocation fails.
 */
static bool build_segment_path(const char *root,
                               uint64_t segment_id,
                               char **out_path) {
  static const char path_format[] =
      "%s/index/segments/segment-%016" PRIx64 ".asl";
  int text_len;
  size_t capacity;
  char *path;

  if (root == NULL || out_path == NULL) {
    return false;
  }

  /* First pass measures the formatted length without writing anything. */
  text_len = snprintf(NULL, 0, path_format, root, segment_id);
  if (text_len <= 0) {
    return false;
  }

  capacity = (size_t)text_len + 1u;
  path = (char *)malloc(capacity);
  if (path == NULL) {
    return false;
  }

  snprintf(path, capacity, path_format, root, segment_id);
  *out_path = path;
  return true;
}
|
||||
|
||||
static char *make_temp_root(void) {
|
||||
char *templ;
|
||||
const char template_prefix[] = "/tmp/amduat_test_asl_store_index_fs_XXXXXX";
|
||||
|
|
@ -186,6 +268,60 @@ static int test_round_trip(void) {
|
|||
}
|
||||
}
|
||||
|
||||
{
|
||||
char *segment_path = NULL;
|
||||
uint8_t *segment_bytes = NULL;
|
||||
size_t segment_len = 0u;
|
||||
amduat_asl_core_index_segment_t segment;
|
||||
bool bloom_nonzero = false;
|
||||
size_t i;
|
||||
|
||||
memset(&segment, 0, sizeof(segment));
|
||||
|
||||
if (!build_segment_path(root, 1u, &segment_path)) {
|
||||
fprintf(stderr, "segment path build failed\n");
|
||||
goto cleanup;
|
||||
}
|
||||
if (!read_file(segment_path, &segment_bytes, &segment_len)) {
|
||||
fprintf(stderr, "segment read failed\n");
|
||||
free(segment_path);
|
||||
goto cleanup;
|
||||
}
|
||||
free(segment_path);
|
||||
if (!amduat_enc_asl_core_index_decode_v1(
|
||||
amduat_octets(segment_bytes, segment_len), &segment)) {
|
||||
fprintf(stderr, "segment decode failed\n");
|
||||
free(segment_bytes);
|
||||
goto cleanup;
|
||||
}
|
||||
free(segment_bytes);
|
||||
|
||||
if (segment.bloom.len != AMDUAT_ASL_INDEX_BLOOM_BYTES) {
|
||||
fprintf(stderr, "segment bloom size mismatch\n");
|
||||
amduat_enc_asl_core_index_free(&segment);
|
||||
goto cleanup;
|
||||
}
|
||||
if (!amduat_asl_index_bloom_maybe_contains(segment.bloom,
|
||||
ref.hash_id,
|
||||
ref.digest)) {
|
||||
fprintf(stderr, "segment bloom missing digest\n");
|
||||
amduat_enc_asl_core_index_free(&segment);
|
||||
goto cleanup;
|
||||
}
|
||||
for (i = 0u; i < segment.bloom.len; ++i) {
|
||||
if (segment.bloom.data[i] != 0u) {
|
||||
bloom_nonzero = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!bloom_nonzero) {
|
||||
fprintf(stderr, "segment bloom was empty\n");
|
||||
amduat_enc_asl_core_index_free(&segment);
|
||||
goto cleanup;
|
||||
}
|
||||
amduat_enc_asl_core_index_free(&segment);
|
||||
}
|
||||
|
||||
exit_code = 0;
|
||||
|
||||
cleanup:
|
||||
|
|
|
|||
Loading…
Reference in a new issue