Skip to content

Commit

Permalink
HashTable: fix issues on 32bit platforms
Browse files Browse the repository at this point in the history
- RESERVED is uint32_t, so we can't assign it to a 32bit int
- in general, instead of using C int, let's rather use size_t
  for array indexes and sizes. size_t is unsigned, so no problem
  with that. it's 32bit on 32bit platforms, 64bit on 64bit platforms.
- exception: indexes into the kv array are always only uint32_t
  to save memory.

Note that due to the load factor of the hashtable, it makes sense to
have hashtables with a bit more than 2**32 buckets even if the kv array
is limited to fewer than 2**32 entries.
If there were close to 2**32 stored kv items, the hashtable would
have approximately 2**33 buckets (at a load factor of 0.5).
  • Loading branch information
ThomasWaldmann committed Nov 9, 2024
1 parent 37c708a commit 8e73059
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 21 deletions.
14 changes: 7 additions & 7 deletions src/borghash/HashTable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@ from libc.stdint cimport uint8_t, uint32_t

cdef class HashTable:
cdef int ksize, vsize
cdef readonly int capacity, used
cdef int initial_capacity, tombstones
cdef readonly size_t capacity, used
cdef size_t initial_capacity, tombstones
cdef float max_load_factor, min_load_factor, shrink_factor, grow_factor
cdef uint32_t* table
cdef int kv_capacity, kv_used
cdef uint32_t kv_capacity, kv_used
cdef float kv_grow_factor
cdef uint8_t* keys
cdef uint8_t* values
cdef int stats_get, stats_set, stats_del, stats_iter, stats_lookup, stats_linear
cdef int stats_resize_table, stats_resize_kv

cdef int _get_index(self, uint8_t* key)
cdef int _lookup_index(self, uint8_t* key_ptr, int* index_ptr)
cdef void _resize_table(self, int new_capacity)
cdef void _resize_kv(self, int new_capacity)
cdef size_t _get_index(self, uint8_t* key)
cdef int _lookup_index(self, uint8_t* key_ptr, size_t* index_ptr)
cdef void _resize_table(self, size_t new_capacity)
cdef void _resize_kv(self, size_t new_capacity)
28 changes: 14 additions & 14 deletions src/borghash/HashTable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,18 @@ cdef class HashTable:
def __len__(self) -> int:
return self.used

cdef int _get_index(self, uint8_t* key):
cdef size_t _get_index(self, uint8_t* key):
"""key must be perfectly random distributed bytes, so we don't need a hash function here."""
cdef uint32_t key32 = (key[0] << 24) | (key[1] << 16) | (key[2] << 8) | key[3]
return key32 % self.capacity

cdef int _lookup_index(self, uint8_t* key_ptr, int* index_ptr):
cdef int _lookup_index(self, uint8_t* key_ptr, size_t* index_ptr):
"""
search for a specific key.
if found, return 1 and set *index_ptr to the index of the bucket in self.table.
if not found, return 0 and set *index_ptr to the index of a free bucket in self.table.
"""
cdef int index = self._get_index(key_ptr)
cdef size_t index = self._get_index(key_ptr)
cdef uint32_t kv_index
self.stats_lookup += 1
while (kv_index := self.table[index]) != FREE_BUCKET:
Expand All @@ -138,7 +138,7 @@ cdef class HashTable:
cdef uint8_t* key_ptr = <uint8_t*> key
cdef uint8_t* value_ptr = <uint8_t*> value
cdef uint32_t kv_index
cdef int index
cdef size_t index
self.stats_set += 1
if self._lookup_index(key_ptr, &index):
kv_index = self.table[index]
Expand Down Expand Up @@ -173,7 +173,7 @@ cdef class HashTable:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint32_t kv_index
cdef int index
cdef size_t index
self.stats_get += 1
if self._lookup_index(<uint8_t*> key, &index):
kv_index = self.table[index]
Expand All @@ -185,7 +185,7 @@ cdef class HashTable:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint8_t* key_ptr = <uint8_t*> key
cdef int index
cdef size_t index
cdef uint32_t kv_index

self.stats_del += 1
Expand Down Expand Up @@ -227,7 +227,7 @@ cdef class HashTable:
return value

def items(self) -> Iterator[tuple[bytes, bytes]]:
cdef int i
cdef size_t i
cdef uint32_t kv_index
self.stats_iter += 1
for i in range(self.capacity):
Expand All @@ -237,8 +237,8 @@ cdef class HashTable:
value = self.values[kv_index * self.vsize:(kv_index + 1) * self.vsize]
yield key, value

cdef void _resize_table(self, int new_capacity):
cdef int i, index
cdef void _resize_table(self, size_t new_capacity):
cdef size_t i, index
cdef uint32_t kv_index
cdef uint32_t* new_table = <uint32_t*> malloc(new_capacity * sizeof(uint32_t))
for i in range(new_capacity):
Expand All @@ -259,13 +259,13 @@ cdef class HashTable:
self.table = new_table
self.tombstones = 0

cdef void _resize_kv(self, int new_capacity):
cdef void _resize_kv(self, size_t new_capacity):
# We must never use kv indexes >= RESERVED, thus we'll never need more capacity either.
cdef int capacity = min(new_capacity, RESERVED - 1)
cdef size_t capacity = min(new_capacity, <size_t> RESERVED - 1)
self.stats_resize_kv += 1
self.keys = <uint8_t*> realloc(self.keys, capacity * self.ksize * sizeof(uint8_t))
self.values = <uint8_t*> realloc(self.values, capacity * self.vsize * sizeof(uint8_t))
self.kv_capacity = capacity
self.kv_capacity = <uint32_t> capacity

def k_to_idx(self, key: bytes) -> int:
"""
Expand All @@ -274,7 +274,7 @@ cdef class HashTable:
"""
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef int index
cdef size_t index
if self._lookup_index(<uint8_t*> key, &index):
return self.table[index] # == uint32_t kv_index
else:
Expand All @@ -297,7 +297,7 @@ cdef class HashTable:
raise ValueError("Key size does not match the defined size")
if len(value) != self.vsize:
raise ValueError("Value size does not match the defined size")
cdef int index
cdef size_t index
cdef uint32_t kv_index
if self._lookup_index(<uint8_t*> key, &index):
kv_index = self.table[index]
Expand Down

0 comments on commit 8e73059

Please sign in to comment.