diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 0a92ca60ef..3976d73eee 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -27,6 +27,17 @@ #define MAGIC "BORG_IDX" #define MAGIC_LEN 8 +#define DEBUG 0 + +#define debug_print(fmt, ...) \ + do { \ + if (DEBUG) { \ + fprintf(stderr, fmt, __VA_ARGS__); \ + fflush(NULL); \ + } \ + } while (0) + + typedef struct { char magic[MAGIC_LEN]; int32_t num_entries; @@ -44,6 +55,7 @@ typedef struct { off_t bucket_size; int lower_limit; int upper_limit; + void *tmp_entry; } HashIndex; /* prime (or w/ big prime factors) hash table sizes @@ -68,7 +80,9 @@ static int hash_sizes[] = { }; #define HASH_MIN_LOAD .25 -#define HASH_MAX_LOAD .75 /* don't go higher than 0.75, otherwise performance severely suffers! */ +#define HASH_MAX_LOAD .99 /* use testsuite.benchmark.test_chunk_indexer_* to find + an appropriate value; also don't forget to update this + value in archive.py */ #define MAX(x, y) ((x) > (y) ? (x): (y)) #define NELEMS(x) (sizeof(x) / sizeof((x)[0])) @@ -76,7 +90,7 @@ static int hash_sizes[] = { #define EMPTY _htole32(0xffffffff) #define DELETED _htole32(0xfffffffe) -#define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size)) +#define BUCKET_ADDR(index, idx) (index->buckets + ((idx) * index->bucket_size)) #define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0) @@ -84,7 +98,7 @@ static int hash_sizes[] = { #define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY) #define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED) -#define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY) +#define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, (idx)) + index->key_size)) = EMPTY) #define EPRINTF_MSG(msg, ...) fprintf(stderr, "hashindex: " msg "\n", ##__VA_ARGS__) #define EPRINTF_MSG_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg "\n", path, ##__VA_ARGS__) @@ -108,35 +122,58 @@ hashindex_index(HashIndex *index, const void *key) return _le32toh(*((uint32_t *)key)) % index->num_buckets; } +inline int +distance(int current_idx, int ideal_idx, int num_buckets) +{ + /* If the current index is smaller than the ideal index we've wrapped + around the end of the bucket array and need to compensate for that. */ + return current_idx - ideal_idx + ( (current_idx < ideal_idx) ? num_buckets : 0 ); +} + static int -hashindex_lookup(HashIndex *index, const void *key) +hashindex_lookup(HashIndex *index, const void *key, int *skip_hint) { - int didx = -1; int start = hashindex_index(index, key); int idx = start; - for(;;) { - if(BUCKET_IS_EMPTY(index, idx)) - { - return -1; - } - if(BUCKET_IS_DELETED(index, idx)) { - if(didx == -1) { - didx = idx; - } + int offset; + int rv = -1; + int period = 0; + for(offset=0; ;offset++) { + if(BUCKET_IS_EMPTY(index, idx)) { + rv = -1; + /* debug_print("\n hashindex_lookup:empty %d\n", offset); */ + break; } - else if(BUCKET_MATCHES_KEY(index, idx, key)) { - if (didx != -1) { - memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size); - BUCKET_MARK_DELETED(index, idx); - idx = didx; - } + if(BUCKET_MATCHES_KEY(index, idx, key)) { return idx; } - idx = (idx + 1) % index->num_buckets; + if(period++ == 63){ + period = 0; + if (offset > distance(idx, hashindex_index(index, BUCKET_ADDR(index, idx)), index->num_buckets)) { + rv = -1; + break; + } + } + + idx ++; + if (idx >= index->num_buckets) { + idx = 0; + } if(idx == start) { - return -1; + rv = -1; + break; } } + if (skip_hint != NULL) { + /* compensate for the period, hashindex_set will need to re-examine the last + 16 buckets for a suitable bucket to insert it's value */ + offset = offset - 64; + if (offset < 0) { + offset = 0; + } + (*skip_hint) = offset; + } + return rv; } static int @@ -145,6 +182,7 @@ hashindex_resize(HashIndex *index, int capacity) HashIndex *new; void *key = NULL; int32_t key_size = index->key_size; + debug_print("\nresize to %d!\n", capacity); if(!(new = hashindex_init(capacity, key_size, index->value_size))) { return 0; @@ -161,6 +199,7 @@ hashindex_resize(HashIndex *index, int capacity) index->num_buckets = new->num_buckets; index->lower_limit = new->lower_limit; index->upper_limit = new->upper_limit; + free(new->tmp_entry); free(new); return 1; } @@ -269,6 +308,13 @@ hashindex_read(const char *path) index = NULL; goto fail; } + if(!(index->tmp_entry = calloc(1, header.key_size + header.value_size))) { + EPRINTF_PATH(path, "malloc temp entry failed"); + free(index->buckets); + free(index); + index = NULL; + goto fail; + } bytes_read = fread(index->buckets, 1, buckets_length, fd); if(bytes_read != buckets_length) { if(ferror(fd)) { @@ -279,6 +325,7 @@ hashindex_read(const char *path) EPRINTF_MSG_PATH(path, "fread buckets failed (expected %ju, got %ju)", (uintmax_t) buckets_length, (uintmax_t) bytes_read); } + free(index->tmp_entry); free(index->buckets); free(index); index = NULL; @@ -314,6 +361,13 @@ hashindex_init(int capacity, int key_size, int value_size) free(index); return NULL; } + if(!(index->tmp_entry = calloc(1, key_size + value_size))) { + EPRINTF("malloc temp entry failed"); + free(index->buckets); + free(index); + return NULL; + } + index->num_entries = 0; index->key_size = key_size; index->value_size = value_size; @@ -321,6 +375,8 @@ hashindex_init(int capacity, int key_size, int value_size) index->bucket_size = index->key_size + index->value_size; index->lower_limit = get_lower_limit(index->num_buckets); index->upper_limit = get_upper_limit(index->num_buckets); + debug_print("\ninit %d < %d\n", index->lower_limit, index->upper_limit); + for(i = 0; i < capacity; i++) { BUCKET_MARK_EMPTY(index, i); } @@ -331,6 +387,7 @@ static void hashindex_free(HashIndex *index) { free(index->buckets); + free(index->tmp_entry); free(index); } @@ -366,52 +423,171 @@ hashindex_write(HashIndex *index, const char *path) return ret; } + static const void * hashindex_get(HashIndex *index, const void *key) { - int idx = hashindex_lookup(index, key); + int idx = hashindex_lookup(index, key, NULL); if(idx < 0) { + hashindex_lookup(index, key, NULL); return NULL; } return BUCKET_ADDR(index, idx) + index->key_size; } + +static inline int +rshift_chunk_size(HashIndex *index, int bucket_index) { + int start = bucket_index; + while(bucket_index < index->num_buckets) { + if (BUCKET_IS_EMPTY(index, bucket_index)) { + return (bucket_index - start) * index->bucket_size; + } + bucket_index++; + } + return -1; +} + +static inline int +lshift_chunk_size(HashIndex *index, int bucket_index) { + int start = bucket_index; + while(bucket_index < index->num_buckets) { + if (BUCKET_IS_EMPTY(index, bucket_index) || + (distance(bucket_index, + hashindex_index(index, BUCKET_ADDR(index, bucket_index)), + index->num_buckets) == 0)) { + return (bucket_index - start) * index->bucket_size; + } + bucket_index++; + } + return -1; +} + static int hashindex_set(HashIndex *index, const void *key, const void *value) { - int idx = hashindex_lookup(index, key); - uint8_t *ptr; - if(idx < 0) + int offset = 0; + int chunk_size; + int idx = hashindex_lookup(index, key, &offset); + if(idx >= 0) { + debug_print("%s", "\nhit\n"); + /* we already have the key in the index we just need to update its value */ + memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size); + } + else + { + /* we don't have the key in the index we need to find an appropriate address */ + debug_print("%s", "\n\nmiss\n"); if(index->num_entries > index->upper_limit) { + /* we need to grow the hashindex */ if(!hashindex_resize(index, grow_size(index->num_buckets))) { return 0; } + offset = 0; } - idx = hashindex_index(index, key); - while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) { - idx = (idx + 1) % index->num_buckets; + idx = hashindex_index(index, key) + offset; + if (idx >= index->num_buckets){ + idx = idx - index->num_buckets; + } + while(!BUCKET_IS_EMPTY(index, idx) && + (offset <= distance(idx, + hashindex_index(index, BUCKET_ADDR(index, idx)), + index->num_buckets))) { + offset ++; + idx++; + if (idx >= index->num_buckets) { + idx = 0; + } + } + if (!BUCKET_IS_EMPTY(index, idx)) { + // we have a collision + chunk_size = rshift_chunk_size(index, idx); + if (chunk_size > 0) { + // shift by one bucket + memmove(BUCKET_ADDR(index, idx+1), BUCKET_ADDR(index, idx), chunk_size); + // and insert the key + memcpy(BUCKET_ADDR(index, idx), key, index->key_size); + memcpy(BUCKET_ADDR(index, idx)+index->key_size, value, index->value_size); + } else { + if (chunk_size != -1){ + debug_print("\n! chunk_size: %d\n\n", chunk_size); + } + // we've reached the end of the bucket space, but found no empty bucket + // make temporary copy of the last entry + memcpy(index->tmp_entry, BUCKET_ADDR(index, index->num_buckets-1), index->bucket_size); + if (idx < index->num_buckets - 1) { + // shift all remaining buckets by one, unless we're at the very last bucket + memmove(BUCKET_ADDR(index, idx+1), + BUCKET_ADDR(index, idx), + (index->num_buckets - idx -1) * index->bucket_size); + } + // insert the value + memcpy(BUCKET_ADDR(index, idx), key, index->key_size); + memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size); + idx = 0; + chunk_size = rshift_chunk_size(index, idx); + if (chunk_size > 0) { + // shift chunk at start by one + memmove(BUCKET_ADDR(index, idx+1), BUCKET_ADDR(index, idx), chunk_size); + } else if (chunk_size == -1) { + debug_print("\n! chunk_size: %d\n\n", chunk_size); + } + // insert key from the last address at index 0 + memcpy(BUCKET_ADDR(index, idx), index->tmp_entry, index->bucket_size); + } + } else { + memcpy(BUCKET_ADDR(index, idx), key, index->key_size); + memcpy(BUCKET_ADDR(index, idx)+index->key_size, value, index->value_size); } - ptr = BUCKET_ADDR(index, idx); - memcpy(ptr, key, index->key_size); - memcpy(ptr + index->key_size, value, index->value_size); index->num_entries += 1; } - else - { - memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size); - } return 1; } static int hashindex_delete(HashIndex *index, const void *key) { - int idx = hashindex_lookup(index, key); + int idx = hashindex_lookup(index, key, NULL); + int c_size = -1; if (idx < 0) { - return 1; + return 1; // not in index, nothing to do + } + if (idx+1 < index->num_buckets) { + c_size = lshift_chunk_size(index, idx+1); // includes current idx in chunk + } + if(c_size != -1) { + // the simple case, just shift a chunk + if (c_size != 0) { + memmove(BUCKET_ADDR(index, idx), BUCKET_ADDR(index, (idx+1)), c_size); + } + // and mark the last position of the chunk empty + idx += c_size/index->bucket_size; + BUCKET_MARK_EMPTY(index, idx); + } else { + // the complicated case, we shift all the way to the end of the bucket array + memmove(BUCKET_ADDR(index, idx), BUCKET_ADDR(index, idx+1), + (index->num_buckets - idx - 1) * index->bucket_size); + // then check if we need to take the first bucket and move it to the last position + if (BUCKET_IS_EMPTY(index, 0)) { + // no need, it's empty anyway + BUCKET_MARK_EMPTY(index, (index->num_buckets-1)); + } + else { + // move first bucket to last address + memmove(BUCKET_ADDR(index, index->num_buckets-1), BUCKET_ADDR(index, 0), + index->bucket_size); + // then determine if we need to shift an entire chunk after the first bucket + c_size = lshift_chunk_size(index, 1); + if(c_size == 0) { + // nothing to shift, mark first bucket empty and we're done + BUCKET_MARK_EMPTY(index, 0); + } else { + memmove(BUCKET_ADDR(index, 0), BUCKET_ADDR(index, 1), c_size); + BUCKET_MARK_EMPTY(index, (c_size/index->bucket_size)); + } + } } - BUCKET_MARK_DELETED(index, idx); index->num_entries -= 1; if(index->num_entries < index->lower_limit) { if(!hashindex_resize(index, shrink_size(index->num_buckets))) { @@ -431,7 +607,7 @@ hashindex_next_key(HashIndex *index, const void *key) if (idx == index->num_buckets) { return NULL; } - while(BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx)) { + while(BUCKET_IS_EMPTY(index, idx)) { idx ++; if (idx == index->num_buckets) { return NULL; @@ -451,3 +627,108 @@ hashindex_size(HashIndex *index) { return sizeof(HashHeader) + index->num_buckets * index->bucket_size; } + +static void +benchmark_getitem(HashIndex *index, char *keys, int key_count) +{ + char *key = keys; + char *last_addr = key + (32 * key_count); + /* if (DEBUG){ */ + /* lookups = 0; collisions = 0; swaps = 0; updates = 0; shortcuts = 0; inserts = 0; */ + /* } */ + while (key < last_addr) { + hashindex_get(index, key); + key += 32; + } + /* if (DEBUG){ */ + /* printf("\n\n\nlookups %f; collisions: %lu; swaps %lu; updates %lu; " */ + /* "shorts %lu; inserts %lu; buckets %d\n\n\n", */ + /* (double)(lookups) / key_count, collisions, swaps, updates, shortcuts, */ + /* inserts, index->num_buckets); */ + /* } */ +} + +static void +benchmark_setitem(HashIndex *index, char *keys, int key_count) +{ + char *key = keys; + char *last_addr = key + (32 * key_count); + uint32_t data[3] = {0, 0, 0}; + /* if (DEBUG){ */ + /* lookups = 0; collisions = 0; swaps = 0; updates = 0; shortcuts = 0; inserts = 0; */ + /* } */ + while (key < last_addr) { + hashindex_set(index, key, data); + key += 32; + } + /* if (DEBUG) { */ + /* printf("\n\n\nlookups %f; collisions: %lu; swaps %lu; updates %lu; shorts %lu; " */ + /* "inserts %lu; buckets %d\n\n\n", */ + /* (double)(lookups) / key_count, collisions, swaps, updates, shortcuts, */ + /* inserts, index->num_buckets); */ + /* } */ + +} + + +static void +benchmark_delete(HashIndex *index, char *keys, int key_count) +{ + char *key = keys; + char *last_addr = key + (32 * key_count); + /* if (DEBUG){ */ + /* lookups = 0; collisions = 0; swaps = 0; updates = 0; shortcuts = 0; inserts = 0; */ + /* } */ + while (key < last_addr) { + hashindex_delete(index, key); + key += 32; + } + /* if (DEBUG) { */ + /* printf("\n\n\nlookups %f; collisions: %lu; swaps %lu; updates %lu; shorts %lu; " */ + /* "inserts %lu; buckets %d\n\n\n", */ + /* (double)(lookups) / key_count, collisions, swaps, updates, shortcuts, */ + /* inserts, index->num_buckets); */ + /* } */ + +} + + +static void +benchmark_churn(HashIndex *index, char *keys, int key_count) +{ + char *key = keys; + char *last_addr = key + (32 * key_count); + uint32_t data[3] = {0, 0, 0}; + size_t key_size = index->key_size; + uint8_t deleted_key[key_size]; + unsigned int period = 0; + /* if (DEBUG){ */ + /* lookups = 0; collisions = 0; swaps = 0; updates = 0; shortcuts = 0; inserts = 0; */ + /* } */ + while (key < last_addr) { + switch (period) { + case 0: + memcpy(deleted_key, key, key_size); + hashindex_delete(index, key); + break; + case 1 ... 6: + hashindex_set(index, key, data); + break; + case 7 ... 9: + hashindex_get(index, key); + break; + case 10: + period = 0; + hashindex_set(index, deleted_key, data); + continue; + } + period ++; + key += 32; + } + /* if (DEBUG) { */ + /* printf("\n\n\nlookups %f; collisions: %lu; swaps %lu; updates %lu; shorts %lu; " */ + /* "inserts %lu; buckets %d\n\n\n", */ + /* (double)(lookups) / key_count, collisions, swaps, updates, shortcuts, */ + /* inserts, index->num_buckets); */ + /* } */ +} diff --git a/src/borg/archive.py b/src/borg/archive.py index 9546cb0afa..cfa5aebb73 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1010,7 +1010,7 @@ def init_chunks(self): """ # Explicitly set the initial hash table capacity to avoid performance issues # due to hash table "resonance" - capacity = int(len(self.repository) * 1.35 + 1) # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c) + capacity = int(len(self.repository) * (1/0.99) + 1) # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c) self.chunks = ChunkIndex(capacity) marker = None while True: diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index 74c52c9c1a..03c70c699d 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -27,6 +27,10 @@ cdef extern from "_hashindex.c": int hashindex_set(HashIndex *index, void *key, void *value) uint32_t _htole32(uint32_t v) uint32_t _le32toh(uint32_t v) + void benchmark_getitem(HashIndex *index, char *keys, int key_count) + void benchmark_setitem(HashIndex *index, char *keys, int key_count) + void benchmark_delete(HashIndex *index, char *keys, int key_count) + void benchmark_churn(HashIndex *index, char *keys, int key_count) cdef _NoDefault = object() @@ -360,3 +364,23 @@ cdef class ChunkKeyIterator: cdef uint32_t refcount = _le32toh(value[0]) assert refcount <= MAX_VALUE, "invalid reference count" return (self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]), _le32toh(value[2])) + + +from cpython.bytes cimport PyBytes_AS_STRING + + +def bench_getitem(ChunkIndex chunk_index, bytes keys): + cdef key_count = len(keys) // chunk_index.key_size + benchmark_getitem(chunk_index.index, PyBytes_AS_STRING(keys), key_count) + +def bench_setitem(ChunkIndex chunk_index, bytes keys): + cdef key_count = len(keys) // chunk_index.key_size + benchmark_setitem(chunk_index.index, PyBytes_AS_STRING(keys), key_count) + +def bench_delete(ChunkIndex chunk_index, bytes keys): + cdef key_count = len(keys) // chunk_index.key_size + benchmark_delete(chunk_index.index, PyBytes_AS_STRING(keys), key_count) + +def bench_churn(ChunkIndex chunk_index, bytes keys): + cdef key_count = len(keys) // chunk_index.key_size + benchmark_churn(chunk_index.index, PyBytes_AS_STRING(keys), key_count) diff --git a/src/borg/testsuite/benchmark.py b/src/borg/testsuite/benchmark.py index 9751bc1a45..3a55aec51e 100644 --- a/src/borg/testsuite/benchmark.py +++ b/src/borg/testsuite/benchmark.py @@ -7,10 +7,19 @@ """ import os +from hashlib import sha256 import pytest from .archiver import changedir, cmd +from .hashindex import ChunkIndex +from .hashindex import H +import borg.hashindex + +bench_getitem = borg.hashindex.bench_getitem +bench_setitem = borg.hashindex.bench_setitem +bench_delete = borg.hashindex.bench_delete +bench_churn = borg.hashindex.bench_churn @pytest.yield_fixture @@ -97,3 +106,148 @@ def test_check(benchmark, cmd, archive): def test_help(benchmark, cmd): result, out = benchmark(cmd, 'help') assert result == 0 + + +rounds = 10 + + +@pytest.fixture( + params=[.30, .50, .75, .85, .93, .95] +) +def fill(request): + return request.param + + +def test_chunk_indexer_c_getitem(benchmark, fill): + max_key = int(445649 * fill - 10) + index = ChunkIndex(445649) + keys = [sha256(H(k)).digest() + for k in range(max_key)] + bucket_val = (0, 0, 0) + for key in keys: + index[key] = bucket_val + keys = b"".join(keys) + + def do_gets(keys=keys): + bench_getitem(index, keys) + benchmark.pedantic(do_gets, rounds=rounds) + + +def test_chunk_indexer_c_getitem_with_misses(benchmark, fill): + max_key = int(445649 * fill - 10) + index = ChunkIndex(445649) + keys = [sha256(H(k)).digest() + for k in range(max_key)] + bucket_val = (0, 0, 0) + for key in keys: + index[key] = bucket_val + missing_keys = b"".join([ + sha256(H(k)).digest() + for k in range(max_key, (max_key+int(len(keys)/3)))]) + keys = b"".join(keys) + missing_keys + + def do_gets(keys=keys): + bench_getitem(index, keys) + benchmark.pedantic(do_gets, rounds=rounds) + + +def test_chunk_indexer_c_setitem_update(benchmark, fill): + max_key = int(445649 * fill - 10) + index = ChunkIndex(445649) + keys = b"".join((sha256(H(k)).digest() + for k in range(max_key))) + bucket_val = (0, 0, 0) + for i in range(0, 32*max_key, 32): + key = keys[i:i+32] + index[key] = bucket_val + + def do_sets(): + bench_setitem(index, keys) + benchmark.pedantic(do_sets, rounds=rounds) + + +def test_chunk_indexer_c_setitem(benchmark, fill): + max_key = int(445649 * fill - 10) + keys = b"".join((sha256(H(k)).digest() + for k in range(max_key))) + def setup(): + # return *args, **kwargs for the benchmarked function + index = ChunkIndex(445649) + return (index, ), dict() + + def do_sets(index): + bench_setitem(index, keys) + benchmark.pedantic(do_sets, rounds=rounds, setup=setup) + + +def test_chunk_indexer_c_delete(benchmark, fill): + max_key = int(445649 * fill - 10) + keys = b"".join((sha256(H(k)).digest() + for k in range(max_key))) + delete_keys = b"".join((sha256(H(k)).digest() + for k in range(0, max_key, 3))) + def setup(): + # return *args, **kwargs for the benchmarked function + index = ChunkIndex(445649) + bucket_val = (5, 5, 5) + for i in range(0, 32*max_key, 32): + key = keys[i:i+32] + index[key] = bucket_val + return (index, ), dict() + + def do_delete(index): + bench_delete(index, delete_keys) + benchmark.pedantic(do_delete, rounds=rounds, setup=setup) + + +def test_chunk_indexer_c_setitem_after_deletion(benchmark, fill): + """ + Update a bunch of values after 1/5 of the keys in an index have been deleted + This will demonstrate the impact of tombstones on the index. + """ + max_key = int(445649 * fill - 10) + keys = b"".join((sha256(H(k)).digest() + for k in range(max_key) + if k%5)) + delete_keys = b"".join((sha256(H(k)).digest() + for k in range(0, max_key, 5))) + def setup(): + # return *args, **kwargs for the benchmarked function + index = ChunkIndex(445649) + bucket_val = (5, 5, 5) + for i in range(0, len(delete_keys), 32): + key = delete_keys[i:i+32] + index[key] = bucket_val + for i in range(0, len(keys), 32): + key = keys[i:i+32] + index[key] = bucket_val + for i in range(0, len(delete_keys), 32): + key = delete_keys[i:i+32] + del index[key] + return (index, ), dict() + + def do_sets(index): + bench_setitem(index, keys) + benchmark.pedantic(do_sets, rounds=rounds, setup=setup) + + +def test_chunk_indexer_c_churn(benchmark, fill): + """ + Creates churn by repeatedly deleting, updating, getting and re-inserting keys + Will loop over all keys, delete 1 key, update next 5, read next 3 then reinsert the deleted key + """ + max_key = int(445649 * fill - 10) + keys = b"".join((sha256(H(k)).digest() + for k in range(max_key))) + def setup(): + # return *args, **kwargs for the benchmarked function + index = ChunkIndex(445649) + bucket_val = (5, 5, 5) + for i in range(0, len(keys), 32): + key = keys[i:i+32] + index[key] = bucket_val + return [index, ], dict() + + def do_sets(index): + bench_churn(index, keys) + benchmark.pedantic(do_sets, rounds=rounds, setup=setup) diff --git a/src/borg/testsuite/hashindex.py b/src/borg/testsuite/hashindex.py index 5ddb851710..2039536fa7 100644 --- a/src/borg/testsuite/hashindex.py +++ b/src/borg/testsuite/hashindex.py @@ -3,6 +3,7 @@ import os import tempfile import zlib +import sys from ..hashindex import NSIndex, ChunkIndex from .. import hashindex @@ -122,6 +123,48 @@ def test_chunkindex_summarize(self): assert unique_chunks == 3 +class HashIndexExtraTestCase(BaseTestCase): + """These tests are separate because they should not become part of the selftest + """ + def test_chunk_indexer(self): + # max_key is chosen so that when the deleted_keys get added we're close to max fill rate + # but never trigger a resize + max_key = int(1031 * (0.99*(2./3.))) + index = ChunkIndex(max_key) + deleted_keys = [ + hashlib.sha256(H(k)).digest() + for k in range(-1, -int(max_key/3), -1)] + print(max_key + len(deleted_keys)) + keys = [hashlib.sha256(H(k)).digest() for k in range(max_key)] + for i, key in enumerate(keys): + index[key] = (i, i, i) + for i, key in enumerate(deleted_keys): + index[key] = (i, i, i) + for key in deleted_keys: + del index[key] + + missing, undeleted, wrong_value = 0, 0, 0 + for i, key in enumerate(keys): + val = index.get(key) + if val != (i, i, i): + if val is None: + missing += 1 + else: + wrong_value += 1 + for i, key in enumerate(deleted_keys): + if index.get(key) is not None: + undeleted += 1 + index[key] = (i, i, i) + for i, key in enumerate(deleted_keys): + val = index.get(key) + if val != (i, i, i): + if val is None: + missing += 1 + else: + wrong_value += 1 + assert (missing, undeleted, wrong_value) == (0, 0, 0) + + class HashIndexSizeTestCase(BaseTestCase): def test_size_on_disk(self): idx = ChunkIndex()