Source code for hbutils.encoding.int_hash

"""
Non-cryptographic integer hash utilities.

This module provides a collection of fast, non-cryptographic hash functions
commonly used for hash tables, checksums, and lightweight data fingerprinting.
The algorithms prioritize speed and distribution quality over cryptographic
security, and they are exposed through a unified interface.

The module contains the following main components:

* :func:`register_int_hash` - Register custom hash functions by name
* :func:`int_hash` - Unified entry point for computing integer hashes

Supported algorithm names include:

* ``FNV-1a-32`` - 32-bit Fowler-Noll-Vo hash
* ``FNV-1a-64`` - 64-bit Fowler-Noll-Vo hash
* ``DJB2`` - Daniel J. Bernstein's hash algorithm
* ``SDBM`` - Hash function from the SDBM database library
* ``MurmurHash3-32`` - 32-bit MurmurHash3 (simplified)
* ``CRC32-Variant`` - Modified CRC32-based hash
* ``xxHash32-Simple`` - 32-bit xxHash (simplified)
* ``xs`` - Minimal polynomial hash with basic mixing

.. note::
   These algorithms are **not** cryptographically secure and should not be
   used for security-sensitive applications.

Example::

    >>> from hbutils.encoding import int_hash
    >>> int_hash("hello", method='FNV-1a-32')
    >>> int_hash(b"world", method='DJB2')

"""

import struct
from typing import Union, Callable, Dict, Optional

__all__ = [
    'register_int_hash',
    'int_hash',
]


def _norm_input(data: Union[str, bytes, bytearray]) -> bytes:
    """
    Normalize input data to bytes format.

    :param data: Input data in string, bytes, or bytearray format.
    :type data: Union[str, bytes, bytearray]
    :return: Normalized data as bytes.
    :rtype: bytes

    Example::

        >>> _norm_input("hello")
        b'hello'
        >>> _norm_input(bytearray(b"world"))
        b'world'

    """
    if isinstance(data, str):
        return data.encode('utf-8')
    elif isinstance(data, bytearray):
        return bytes(data)
    return data


_IntHashTyping = Callable[[Union[str, bytes, bytearray]], int]
_INT_HASH_FUNCS: Dict[str, _IntHashTyping] = {}


def _register(name: str, func: _IntHashTyping) -> None:
    """
    Register a hash function with a given name.

    :param name: The name to register the hash function under.
    :type name: str
    :param func: The hash function to register.
    :type func: _IntHashTyping
    :return: ``None``.
    :rtype: None
    """
    _INT_HASH_FUNCS[name] = func


[docs] def register_int_hash( name: str, func: Optional[_IntHashTyping] = None ) -> Optional[Callable[[_IntHashTyping], _IntHashTyping]]: """ Register an integer hash function decorator. This function can be used as a decorator to register hash functions, or called directly with both ``name`` and ``func`` parameters. :param name: The name to register the hash function under. :type name: str :param func: The hash function to register (optional). :type func: Optional[_IntHashTyping] :return: A decorator that registers the function if ``func`` is ``None``, otherwise ``None``. :rtype: Optional[Callable[[_IntHashTyping], _IntHashTyping]] Example:: >>> @register_int_hash('my_hash') ... def my_hash_func(data): ... return hash(data) >>> register_int_hash('another_hash', lambda x: hash(x)) """ if func is None: def _decorator(f: _IntHashTyping) -> _IntHashTyping: _register(name, f) return f return _decorator else: _register(name, func) return None
@register_int_hash('FNV-1a-32') def _int_hash_fnv1a_32(data: Union[str, bytes, bytearray]) -> int: """ Compute FNV-1a 32-bit hash. FNV-1a (Fowler-Noll-Vo) is a fast, non-cryptographic hash function with good distribution properties. It uses XOR and multiplication operations. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_fnv1a_32("hello") >>> _int_hash_fnv1a_32(b"world") """ data = _norm_input(data) hash_val = 0x811c9dc5 # FNV offset basis fnv_prime = 0x01000193 # FNV prime for byte in data: hash_val ^= byte hash_val = (hash_val * fnv_prime) & 0xFFFFFFFF return hash_val @register_int_hash('FNV-1a-64') def _int_hash_fnv1a_64(data: Union[str, bytes, bytearray]) -> int: """ Compute FNV-1a 64-bit hash. 64-bit version of FNV-1a hash algorithm, providing larger hash space and better collision resistance than the 32-bit version. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :return: 64-bit hash value. :rtype: int Example:: >>> _int_hash_fnv1a_64("hello") >>> _int_hash_fnv1a_64(b"world") """ data = _norm_input(data) hash_val = 0xcbf29ce484222325 # FNV offset basis fnv_prime = 0x100000001b3 # FNV prime for byte in data: hash_val ^= byte hash_val = (hash_val * fnv_prime) & 0xFFFFFFFFFFFFFFFF return hash_val @register_int_hash('DJB2') def _int_hash_djb2(data: Union[str, bytes, bytearray]) -> int: """ Compute DJB2 hash. DJB2 is a simple and effective hash function created by Daniel J. Bernstein. It uses bit shifting and addition operations for fast computation. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_djb2("hello") >>> _int_hash_djb2(b"world") """ data = _norm_input(data) hash_val = 5381 for byte in data: hash_val = ((hash_val << 5) + hash_val + byte) & 0xFFFFFFFF return hash_val @register_int_hash('SDBM') def _int_hash_sdbm(data: Union[str, bytes, bytearray]) -> int: """ Compute SDBM hash. SDBM hash algorithm from the SDBM database library. It uses bit shifting and subtraction operations to generate hash values. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_sdbm("hello") >>> _int_hash_sdbm(b"world") """ data = _norm_input(data) hash_val = 0 for byte in data: hash_val = (byte + (hash_val << 6) + (hash_val << 16) - hash_val) & 0xFFFFFFFF return hash_val @register_int_hash('MurmurHash3-32') def _int_hash_murmur3_32(data: Union[str, bytes, bytearray], seed: int = 0) -> int: """ Compute MurmurHash3 32-bit hash (simplified implementation). MurmurHash3 is a fast, non-cryptographic hash function with excellent distribution and avalanche properties. This is a simplified implementation of the 32-bit version. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :param seed: Seed value for hash initialization (default: 0). :type seed: int :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_murmur3_32("hello") >>> _int_hash_murmur3_32("hello", seed=42) """ data = _norm_input(data) length = len(data) c1 = 0xcc9e2d51 c2 = 0x1b873593 r1 = 15 r2 = 13 m = 5 n = 0xe6546b64 hash_val = seed # Process 4-byte blocks for i in range(0, length - (length % 4), 4): k = struct.unpack('<I', data[i:i + 4])[0] k = (k * c1) & 0xFFFFFFFF k = ((k << r1) | (k >> (32 - r1))) & 0xFFFFFFFF k = (k * c2) & 0xFFFFFFFF hash_val ^= k hash_val = ((hash_val << r2) | (hash_val >> (32 - r2))) & 0xFFFFFFFF hash_val = ((hash_val * m) + n) & 0xFFFFFFFF # Process remaining bytes remaining = length % 4 if remaining: k = 0 for i in range(remaining): k |= data[length - remaining + i] << (i * 8) k = (k * c1) & 0xFFFFFFFF k = ((k << r1) | (k >> (32 - r1))) & 0xFFFFFFFF k = (k * c2) & 0xFFFFFFFF hash_val ^= k # Final mixing hash_val ^= length hash_val ^= hash_val >> 16 hash_val = (hash_val * 0x85ebca6b) & 0xFFFFFFFF hash_val ^= hash_val >> 13 hash_val = (hash_val * 0xc2b2ae35) & 0xFFFFFFFF hash_val ^= hash_val >> 16 return hash_val @register_int_hash('CRC32-Variant') def _int_hash_crc32_variant(data: Union[str, bytes, bytearray]) -> int: """ Compute CRC32 variant hash. A variant of the CRC32 (Cyclic Redundancy Check) algorithm adapted for hash table use. Uses a simplified CRC32 polynomial for computation. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_crc32_variant("hello") >>> _int_hash_crc32_variant(b"world") """ data = _norm_input(data) # Simplified CRC32 polynomial polynomial = 0xEDB88320 hash_val = 0xFFFFFFFF for byte in data: hash_val ^= byte for _ in range(8): if hash_val & 1: hash_val = (hash_val >> 1) ^ polynomial else: hash_val >>= 1 return hash_val ^ 0xFFFFFFFF @register_int_hash('xxHash32-Simple') def _int_hash_xxhash32_simple(data: Union[str, bytes, bytearray], seed: int = 0) -> int: """ Compute xxHash32 hash (simplified implementation). xxHash is an extremely fast non-cryptographic hash algorithm, working at speeds close to RAM limits. This is a simplified implementation of the 32-bit version. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :param seed: Seed value for hash initialization (default: 0). :type seed: int :return: 32-bit hash value. :rtype: int Example:: >>> _int_hash_xxhash32_simple("hello") >>> _int_hash_xxhash32_simple("hello", seed=42) """ data = _norm_input(data) length = len(data) PRIME32_1 = 0x9E3779B1 PRIME32_2 = 0x85EBCA77 PRIME32_3 = 0xC2B2AE3D PRIME32_4 = 0x27D4EB2F PRIME32_5 = 0x165667B1 if length >= 16: v1 = (seed + PRIME32_1 + PRIME32_2) & 0xFFFFFFFF v2 = (seed + PRIME32_2) & 0xFFFFFFFF v3 = seed v4 = (seed - PRIME32_1) & 0xFFFFFFFF # Process 16-byte blocks for i in range(0, length - 15, 16): for j in range(4): val = struct.unpack('<I', data[i + j * 4:i + j * 4 + 4])[0] if j == 0: v1 = ((v1 + val * PRIME32_2) & 0xFFFFFFFF) v1 = (((v1 << 13) | (v1 >> 19)) * PRIME32_1) & 0xFFFFFFFF elif j == 1: v2 = ((v2 + val * PRIME32_2) & 0xFFFFFFFF) v2 = (((v2 << 13) | (v2 >> 19)) * PRIME32_1) & 0xFFFFFFFF elif j == 2: v3 = ((v3 + val * PRIME32_2) & 0xFFFFFFFF) v3 = (((v3 << 13) | (v3 >> 19)) * PRIME32_1) & 0xFFFFFFFF else: v4 = ((v4 + val * PRIME32_2) & 0xFFFFFFFF) v4 = (((v4 << 13) | (v4 >> 19)) * PRIME32_1) & 0xFFFFFFFF hash_val = (((v1 << 1) | (v1 >> 31)) + ((v2 << 7) | (v2 >> 25)) + ((v3 << 12) | (v3 >> 20)) + ((v4 << 18) | (v4 >> 14))) & 0xFFFFFFFF remaining_start = length - (length % 16) else: hash_val = (seed + PRIME32_5) & 0xFFFFFFFF remaining_start = 0 hash_val = (hash_val + length) & 0xFFFFFFFF # Process remaining bytes for i in range(remaining_start, length): hash_val = (hash_val + data[i] * PRIME32_5) & 0xFFFFFFFF hash_val = (((hash_val << 11) | (hash_val >> 21)) * PRIME32_1) & 0xFFFFFFFF # Final mixing hash_val ^= hash_val >> 15 hash_val = (hash_val * PRIME32_2) & 0xFFFFFFFF hash_val ^= hash_val >> 13 hash_val = (hash_val * PRIME32_3) & 0xFFFFFFFF hash_val ^= hash_val >> 16 return hash_val @register_int_hash('xs') def _int_hash_xs(data: Union[str, bytes, bytearray]) -> int: """ Compute a minimal but functional hash using simple polynomial hashing. This is a lightweight hash function that uses a simple polynomial hash (``hash = hash * 31 + byte``) with a single bit mixing operation to improve the avalanche effect. It satisfies basic hash properties while maintaining simplicity and speed. :param data: The input data to hash. Can be string, bytes, or bytearray. :type data: Union[str, bytes, bytearray] :return: A 32-bit unsigned integer hash value. :rtype: int Example:: >>> _int_hash_xs('hello') # Hash a string >>> _int_hash_xs(b'hello') # Hash bytes """ # Convert all input types to bytes if isinstance(data, str): data = data.encode('utf-8') elif isinstance(data, bytearray): data = bytes(data) # Simple polynomial hash: hash = (hash * 31 + byte) hash_val = 0 for byte in data: hash_val = (hash_val * 31 + byte) & 0xffffffff # Single bit mixing to improve avalanche effect hash_val ^= hash_val >> 16 return hash_val & 0xffffffff
[docs] def int_hash(data: Union[str, bytes, bytearray], method: str = 'FNV-1a-32') -> int: """ Compute integer hash using the specified method. This is the main entry point for computing hashes. It dispatches to the appropriate hash function based on the ``method`` parameter. :param data: Input data to hash. :type data: Union[str, bytes, bytearray] :param method: Hash algorithm to use (default: ``'FNV-1a-32'``). Available methods: ``'FNV-1a-32'``, ``'FNV-1a-64'``, ``'DJB2'``, ``'SDBM'``, ``'MurmurHash3-32'``, ``'CRC32-Variant'``, ``'xxHash32-Simple'``, ``'xs'``. :type method: str :return: Hash value computed by the specified method. :rtype: int :raises KeyError: If the specified method is not registered. Example:: >>> int_hash("hello") >>> int_hash("hello", method='FNV-1a-32') >>> int_hash(b"world", method='DJB2') """ return _INT_HASH_FUNCS[method](data)