diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fbf5dcf..2e7263e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.0 +current_version = 2.0.0 commit = True tag = True diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c3b5e05..5c471ca 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,52 @@ Changelog ========= +------------------- +v2.0.0 (2025-11-14) +------------------- + +* Major feature additions + * Encode/decode transformer for byte ↔ string conversion + * Entropy risk and total calculation functions + * New Puid methods: ``encode()``, ``decode()``, ``risk()``, ``total()`` + * ETE (Entropy Transform Efficiency) metric +* Code quality improvements + * Full Python naming conventions (snake_case vs camelCase) + * Type hints throughout core modules + * Add ``__slots__`` to Puid and ValidChars classes + * Dictionary-based encoder selection (O(1) lookup) + * Comprehensive test suite (108 tests) +* Breaking changes + * Function renames for Pythonic style: + * ``entropy_risk`` → ``risk_for_entropy`` + * ``entropy_total`` → ``total_for_entropy`` + * ``acceptValueFor`` → ``accept_value_for`` + * ``CharMetrics.avgBits`` → ``avg_bits`` + +------------------- +v1.2.1 (2025-11-14) +------------------- + +* Performance improvements + * Use dictionary dispatch for encoder selection (O(1) vs O(n)) + * Cache encoder functions to avoid recreation + * Add ``__slots__`` to frequently instantiated classes +* Code quality improvements + * Add comprehensive type hints + * Extract magic numbers to named constants + * Use namedtuple for multi-value returns + * Remove duplicate encoder creation + +------------------- +v1.2.0 (2023-08-08) +------------------- + +* Optimize bit shift +* Add pre-defined char sets + * Base16 (RFC4648).
Note: Same as HexUpper + * Crockford32 + * WordSafe32 (Another avoid words strategy) + ------------------- v1.1.0 (2022-08-04) ------------------- @@ -14,7 +60,6 @@ v1.1.0 (2022-08-04) * Update README * Create test helpers - ------------------- v1.0.0 (2022-07-29) ------------------- diff --git a/README.md b/README.md index 33d1c0e..2784ab9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Simple, flexible and efficient generation of probably unique identifiers (`puid`, aka random strings) of intuitively specified entropy using pre-defined or custom characters (including Unicode). ```python -from puid import Chars, Puid +from puid import Puid, Chars rand_id = Puid(chars=Chars.ALPHA, total=1e5, risk=1e12) rand_id.generate() @@ -43,7 +43,7 @@ Random string generation can be thought of as a _transformation_ of some random What characters are used in the ID? - > `puid` provides 16 pre-defined character sets, as well as allows custom characters, including Unicode + > `puid` provides 19 pre-defined character sets, as well as allows custom characters, including Unicode 3. 
**ID randomness** @@ -73,7 +73,9 @@ rand_id.generate() from puid import Puid from random import getrandbits -prng_bytes = lambda n: bytearray(getrandbits(8) for _ in range(n)) +def prng_bytes(n): + return bytearray(getrandbits(8) for _ in range(n)) + prng_id = Puid(entropy_source=prng_bytes) prng_id.generate() 'JcQTr8u7MATncImOjO0qOS' @@ -95,7 +97,7 @@ dingosky_id.generate() 'sdosigokdsdygooggogdggisndkogonksnkodnokosg' unicode_id = Puid(chars='dîñgø$kyDÎÑGØßK¥') -unicode_id.() +unicode_id.generate() 'îGÎØÎÑî¥gK¥Ñ¥kîDîyøøØñÑØyd¥¥ØGØÑ$KߨgøÑ' ``` @@ -123,6 +125,36 @@ token.generate() '5D241826F2A644E1B725DB1DD7E4BF742D9D0DC6D6A36F419046A02835A16B83' ``` +**Encode/Decode** + +Transform between bytes and strings using the generator's character set: + +```python +from puid import Puid, Chars + +p = Puid(chars=Chars.SAFE64) +bytes_data = bytearray([0x09, 0x25, 0x84, 0x3c]) + +encoded = p.encode(bytes_data) +# => 'CSwc' + +decoded = p.decode(encoded) +# => bytearray(b'\t%\x84<') +``` + +**Risk and Total Calculation** + +Calculate the risk or total given the generator's entropy: + +```python +from puid import Puid + +p = Puid(bits=96) + +risk = p.risk(1e6) # risk of repeat for 1M IDs +total = p.total(1e15) # total possible IDs for 1e15 risk +``` + [TOC](#TOC) ### Installation @@ -163,16 +195,22 @@ conda install -c dingosky puid-py - `chars`: `Chars.SAFE64` - `entropy_source`: `secret.token_bytes` -#### PuidInfo +#### Generator Methods and Properties -The **Puid**'s `__repr__` function provides information regarding the generator configuration: +The **Puid** instance provides the following: -- `bits`: ID entropy +- `generate()`: Generate a random ID +- `encode(bytes)`: Encode bytes to string using the generator's characters +- `decode(text)`: Decode a string to bytes using the generator's characters +- `risk(total)`: Calculate risk of repeat for a given total number of IDs +- `total(risk)`: Calculate total possible IDs for a given risk of repeat +- `ete`: Entropy transform 
efficiency (0 < ETE ≤ 1.0) +- `ere`: Entropy representation efficiency (0 < ERE ≤ 1.0) +- `bits`: ID entropy bits - `bits_per_char`: Entropy bits per ID character -- `chars`: Source characters -- `entropy_source`: String `module.function` -- `ere`: Entropy representation efficiency - `len`: ID string length +- `chars`: Source characters +- `entropy_source`: String `module.function` of entropy source Example: @@ -183,53 +221,56 @@ rand_id = Puid(total=1e5, risk=1e14, chars=Chars.BASE32) rand_id.generate() '7XKJJKNZBF7GCMEX' -print(rand_id) -Puid: bits = 80.0, bits_per_char = 5.0, chars = BASE32 -> '234567ABCDEFGHIJKLMNOPQRSTUVWXYZ', len = 16, ere = 0.625, entropy_source = secrets.token_bytes +print(f"ETE: {rand_id.ete}, ERE: {rand_id.ere}") +# ETE: 1.0, ERE: 0.625 + +risk = rand_id.risk(1e5) +total = rand_id.total(1e14) ``` ### Chars There are 19 pre-defined character sets: -| Name | Characters | -| :---------------- | :-------------------------------------------------------------------------------------------- | -| :alpha | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | -| :alpha_lower | abcdefghijklmnopqrstuvwxyz | -| :alpha_upper | ABCDEFGHIJKLMNOPQRSTUVWXYZ | -| :alphanum | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 | -| :alphanum_lower | abcdefghijklmnopqrstuvwxyz0123456789 | -| :alphanum_upper | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 | -| :base16 | 0123456789ABCDEF | -| :base32 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 | -| :base32_hex | 0123456789abcdefghijklmnopqrstuv | -| :base32_hex_upper | 0123456789ABCDEFGHIJKLMNOPQRSTUV | -| :crockford32 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ | -| :decimal | 0123456789 | -| :hex | 0123456789abcdef | -| :hex_upper | 0123456789ABCDEF | -| :safe_ascii | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^\_abcdefghijklmnopqrstuvwxyz{\|}~ | -| :safe32 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT | -| :safe64 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ | -| :symbol | 
!#$%&()\*+,-./:;<=>?@[]^\_{\|}~ | -| :wordSafe32 | 23456789CFGHJMPQRVWXcfghjmpqrvwx | +| Name | Characters | +| :--------------- | :-------------------------------------------------------------------------------------------- | +| ALPHA | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | +| ALPHA_LOWER | abcdefghijklmnopqrstuvwxyz | +| ALPHA_UPPER | ABCDEFGHIJKLMNOPQRSTUVWXYZ | +| ALPHANUM | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 | +| ALPHANUM_LOWER | abcdefghijklmnopqrstuvwxyz0123456789 | +| ALPHANUM_UPPER | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 | +| BASE16 | 0123456789ABCDEF | +| BASE32 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 | +| BASE32_HEX | 0123456789abcdefghijklmnopqrstuv | +| BASE32_HEX_UPPER | 0123456789ABCDEFGHIJKLMNOPQRSTUV | +| CROCKFORD32 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ | +| DECIMAL | 0123456789 | +| HEX | 0123456789abcdef | +| HEX_UPPER | 0123456789ABCDEF | +| SAFE_ASCII | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^\_abcdefghijklmnopqrstuvwxyz{\|}~ | +| SAFE32 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT | +| SAFE64 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ | +| SYMBOL | !#$%&()\*+,-./:;<=>?@[]^\_{\|}~ | +| WORD_SAFE32 | 23456789CFGHJMPQRVWXcfghjmpqrvwx | Any string of up to 256 unique characters can be used for **`puid`** generation. 
#### Description of non-obvious character sets -| Name | Description | -| :---------------- | :--------------------------------------------------------- | -| :base16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 | -| :base32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 | -| :base32_hex | Lowercase of :base32_hex_upper | -| :base32_hex_upper | https://datatracker.ietf.org/doc/html/rfc4648#section-7 | -| :crockford32 | https://www.crockford.com/base32.html | -| :safe_ascii | Printable ascii that does not require escape in String | -| :safe32 | Alpha and numbers picked to reduce chance of English words | -| :safe64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 | -| :wordSafe32 | Alpha and numbers picked to reduce chance of English words | +| Name | Description | +| :--------------- | :--------------------------------------------------------- | +| BASE16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 | +| BASE32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 | +| BASE32_HEX | Lowercase of BASE32_HEX_UPPER | +| BASE32_HEX_UPPER | https://datatracker.ietf.org/doc/html/rfc4648#section-7 | +| CROCKFORD32 | https://www.crockford.com/base32.html | +| SAFE_ASCII | Printable ascii that does not require escape in String | +| SAFE32 | Alpha and numbers picked to reduce chance of English words | +| SAFE64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 | +| WORD_SAFE32 | Alpha and numbers picked to reduce chance of English words | -Note: :safe32 and :wordSafe32 are two different strategies for the same goal. +Note: SAFE32 and WORD_SAFE32 are two different strategies for the same goal. [TOC](#TOC) @@ -252,7 +293,9 @@ A somewhat simplistic statement for entropy from information theory is: _entropy Rather, a random string represents _captured_ entropy, entropy that was produced by _some other_ process. For example, you cannot look at the hex string **`'18f6303a'`** and definitively say it has 32 bits of entropy.
To see why, suppose you run the following code snippet and get **`'18f6303a'`**: ```python -rand_id = lambda: '18f6303a' if random.random() < 0.5 else '1' +from random import random + +rand_id = lambda: '18f6303a' if random() < 0.5 else '1' rand_id() '18f6303a' ``` @@ -382,7 +425,7 @@ Now, suppose you are tasked to maintain this code: ```python from puid import Chars, Puid -rand_id = Puid(total=500000, risk=1e12, chars=Chars.ALPHANUM_LOWER) +rand_id = Puid(total=500_000, risk=1e12, chars=Chars.ALPHANUM_LOWER) ``` Hmmm. Looks like there are 500,000 IDs expected and the repeat risk is 1 in a trillion. No guessing. The code is explicit. Oh, and by the way, the IDs are 15 characters long. But who cares? It's the ID randomness that matters, not the length. @@ -433,7 +476,7 @@ Hmmm. Looks like there are 500,000 IDs expected and the repeat risk is 1 in a tr ```python from puid import Chars, Puid -Puid(chars=Chars.SAFE32, total=10e6, risk=1e15) +rand_id = Puid(chars=Chars.SAFE32, total=10e6, risk=1e15) rand_id.generate() 'RHR3DtnP9B3J748NdR87' ``` diff --git a/conda/meta.yaml b/conda/meta.yaml index d48ba6c..e9f9292 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = "puid-py" %} -{% set version = "1.2.0" %} +{% set version = "2.0.0" %} package: name: "{{ name|lower }}" diff --git a/pyproject.toml b/pyproject.toml index fc9d53d..1a99bc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "setuptools>=30.3.0", + "setuptools>=68.0.0", "wheel", ] diff --git a/setup.cfg b/setup.cfg index d131b0a..e9708b0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,6 @@ tests_require = pytest test = pytest [tool:isort] -force_single_line = True line_length = 120 known_first_party = puid default_section = THIRDPARTY diff --git a/setup.py b/setup.py index 2092c1f..4b8a6bd 100755 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def read(*names, **kwargs): setup( name='puid-py', - version='1.2.0', + version='2.0.0', 
license='MIT', description='Simple, flexible and efficient generation of probably unique identifiers (`puid`, ' 'aka random strings) of intuitively specified entropy using pre-defined or custom characters, ' diff --git a/src/puid/__init__.py b/src/puid/__init__.py index 4e68dd5..ecdb93f 100644 --- a/src/puid/__init__.py +++ b/src/puid/__init__.py @@ -1,2 +1,4 @@ +__version__ = '2.0.0' + from puid.chars import Chars from puid.puid import Puid diff --git a/src/puid/bits.py b/src/puid/bits.py index 1d7ed44..1fe5812 100644 --- a/src/puid/bits.py +++ b/src/puid/bits.py @@ -1,5 +1,8 @@ +from collections import namedtuple from math import ceil, floor, log2 +AcceptResult = namedtuple('AcceptResult', ['accept', 'shift']) + # Create array of minimum bits required to determine if a value is less than n_chars # Array elements are of the form (n, bits): For values less than n, bits bits are required # @@ -13,7 +16,15 @@ # -def bit_shifts(n_chars): +def isPow2(n: int) -> bool: + """Check if a number is a power of 2.""" + if n <= 0: + return False + return (n & (n - 1)) == 0 + + +def bitShifts(n_chars: int) -> list: + """Create array of minimum bits required to determine if a value is less than n_chars.""" n_bits_per_char = ceil(log2(n_chars)) base_value = n_chars - 1 if n_chars % 2 == 0 else n_chars @@ -34,6 +45,11 @@ def shift(bit): return [base_shift] + [shift(bit) for bit in range(2, n_bits_per_char) if is_bit_zero(bit)] +def bit_shifts(n_chars): + """Deprecated: use bitShifts instead.""" + return bitShifts(n_chars) + + def fill_entropy(entropy_offset, entropy_bytes, entropy_fn): n_bytes = len(entropy_bytes) n_bits = 8 * n_bytes @@ -81,12 +97,6 @@ def muncher(n_chars, puid_len, entropy_fn): entropy_offset = n_entropy_bits entropy_bytes = bytearray(buffer_len) - def pow2(bit): - return round(pow(2, bit)) - - def is_pow2(n): - return pow2(round(log2(n))) == n - counter = list(range(puid_len)) def sliced_value(): @@ -95,7 +105,7 @@ def sliced_value(): entropy_offset = 
fill_entropy(entropy_offset, entropy_bytes, entropy_fn) return value_at(entropy_offset, n_bits_per_char, entropy_bytes) - if is_pow2(n_chars): + if isPow2(n_chars): # When chars count is a power of 2, sliced bits always yield a valid value def bits_muncher(): def slice_value(): @@ -109,20 +119,20 @@ def slice_value(): return bits_muncher - shifts = bit_shifts(n_chars) + shifts = bitShifts(n_chars) def accept_value(value): # Value is valid if it is less than the number of characters if value < n_chars: - return (True, n_bits_per_char) + return AcceptResult(True, n_bits_per_char) # For invalid value, shift the minimal bits necessary to determine validity # If only one item, no need to search if len(shifts) == 1: - return (False, shifts[0][1]) + return AcceptResult(False, shifts[0][1]) bit_shift = [bs for bs in shifts if value <= bs[0]] - return (False, bit_shift[0][1]) + return AcceptResult(False, bit_shift[0][1]) def slice_value(): nonlocal entropy_offset diff --git a/src/puid/chars.py b/src/puid/chars.py index d7eab79..c4273b4 100644 --- a/src/puid/chars.py +++ b/src/puid/chars.py @@ -1,7 +1,14 @@ from enum import Enum +from typing import Dict, NamedTuple +from math import ceil, log2, pow from puid.chars_error import InvalidChars, NonUniqueChars, TooFewChars, TooManyChars +MIN_CHARS = 2 +MAX_CHARS = 256 +VALID_CHAR_MIN_CODE = 160 +INVALID_CHAR_THRESHOLD = ord('~') + def valid_chars(chars): """ @@ -21,8 +28,8 @@ def valid_chars(chars): if not isinstance(chars, str): raise InvalidChars('Characters must be a str') - min_len = 2 - max_len = 256 + min_len = MIN_CHARS + max_len = MAX_CHARS if len(chars) < min_len: raise TooFewChars(f'Must have at least {min_len} characters') @@ -43,7 +50,7 @@ def valid_chars(chars): def _valid_char(char): code_point = ord(char) - if 160 < code_point: + if VALID_CHAR_MIN_CODE < code_point: return True if char == '!': @@ -56,7 +63,7 @@ def _valid_char(char): return False if char == '`': return False - if ord('~') < code_point: + if 
INVALID_CHAR_THRESHOLD < code_point: return False return True @@ -96,6 +103,8 @@ def __len__(self): class ValidChars: """Base class for PredefinedChars and CustomChars""" + __slots__ = ('name', 'value') + def __repr__(self): return "{0} -> '{1}'".format(self.name, self.value) @@ -156,6 +165,85 @@ def __init__(self, chars): self.value = chars +class CharMetrics(NamedTuple): + """Metrics for a character set including entropy transform efficiency.""" + avg_bits: float + ere: float + ete: float + + +def _bits_consumed_on_reject( + charset_size: int, total_values: int, shifts: list +) -> int: + """Calculate total bits consumed when rejecting values.""" + sum_bits = 0 + for value in range(charset_size, total_values): + bit_shift = None + for bs in shifts: + if value <= bs[0]: + bit_shift = bs + break + if bit_shift is None: + raise ValueError('Invalid bit_shifts: missing range') + sum_bits += bit_shift[1] + return sum_bits + + +def _avg_bytes_per_char(chars: str) -> float: + """Calculate average byte size per character for a string.""" + total_bytes = len(chars.encode('utf-8')) + return total_bytes / len(chars) + + +def metrics(chars: str) -> CharMetrics: + """ + Calculate entropy metrics for a character set. 
+ + Returns a CharMetrics object with: + - avg_bits: Average bits consumed per character during generation + - ere: Entropy representation efficiency (0 < ERE ≤ 1.0) + - ete: Entropy transform efficiency (0 < ETE ≤ 1.0) + + Example: + >>> metrics(Chars.SAFE64.value).ete + 1.0 + """ + from puid.bits import isPow2, bitShifts + + charset_size = len(chars) + bits_per_char = ceil(log2(charset_size)) + theoretical_bits = log2(charset_size) + shifts = bitShifts(charset_size) + + avg_rep_bits_per_char = _avg_bytes_per_char(chars) * 8 + ere = theoretical_bits / avg_rep_bits_per_char + + if isPow2(charset_size): + return CharMetrics( + avg_bits=float(bits_per_char), + ere=round(ere, 4), + ete=1.0 + ) + + total_values = pow(2, bits_per_char) + prob_accept = charset_size / total_values + prob_reject = 1 - prob_accept + + reject_count = total_values - charset_size + reject_bits = _bits_consumed_on_reject(charset_size, int(total_values), shifts) + + avg_bits_on_reject = reject_bits / reject_count + avg_bits = bits_per_char + (prob_reject / prob_accept) * avg_bits_on_reject + + ete = theoretical_bits / avg_bits + + return CharMetrics( + avg_bits=avg_bits, + ere=round(ere, 4), + ete=round(ete, 4) + ) + + if __name__ == '__main__': # pragma: no cover import doctest diff --git a/src/puid/encoder.py b/src/puid/encoder.py index c602efd..9b03bfd 100644 --- a/src/puid/encoder.py +++ b/src/puid/encoder.py @@ -1,4 +1,6 @@ -from puid.chars import Chars +from typing import Callable, Dict + +from puid.chars import Chars, ValidChars from puid.encoders.alpha import alpha from puid.encoders.alpha import alpha_lower from puid.encoders.alpha import alpha_upper @@ -21,62 +23,38 @@ from puid.encoders.word_safe32 import word_safe32 -def encoder(chars: Chars): - if chars.name == Chars.ALPHA.name: - return alpha() - - if chars.name == Chars.ALPHA_LOWER.name: - return alpha_lower() - - if chars.name == Chars.ALPHA_UPPER.name: - return alpha_upper() - - if chars.name == Chars.ALPHANUM.name: - return 
alphanum() - - if chars.name == Chars.ALPHANUM_LOWER.name: - return alphanum_lower() - - if chars.name == Chars.ALPHANUM_UPPER.name: - return alphanum_upper() - - if chars.name == Chars.BASE16.name: - return base16() - - if chars.name == Chars.BASE32.name: - return base32() - - if chars.name == Chars.BASE32_HEX.name: - return base32_hex() - - if chars.name == Chars.BASE32_HEX_UPPER.name: - return base32_hex_upper() - - if chars.name == Chars.CROCKFORD32.name: - return crockford32() - - if chars.name == Chars.DECIMAL.name: - return decimal() - - if chars.name == Chars.HEX.name: - return hex_lower() - - if chars.name == Chars.HEX_UPPER.name: - return hex_upper() - - if chars.name == Chars.SAFE32.name: - return safe32() - - if chars.name == Chars.SAFE64.name: - return safe64() - - if chars.name == Chars.SAFE_ASCII.name: - return safe_ascii() - - if chars.name == Chars.SYMBOL.name: - return symbol() - - if chars.name == Chars.WORD_SAFE32.name: - return word_safe32() - +_ENCODER_CACHE: Dict[str, Callable[[int], int]] = {} + + +def _init_encoder_map() -> Dict[str, Callable[[int], int]]: + return { + Chars.ALPHA.name: alpha(), + Chars.ALPHA_LOWER.name: alpha_lower(), + Chars.ALPHA_UPPER.name: alpha_upper(), + Chars.ALPHANUM.name: alphanum(), + Chars.ALPHANUM_LOWER.name: alphanum_lower(), + Chars.ALPHANUM_UPPER.name: alphanum_upper(), + Chars.BASE16.name: base16(), + Chars.BASE32.name: base32(), + Chars.BASE32_HEX.name: base32_hex(), + Chars.BASE32_HEX_UPPER.name: base32_hex_upper(), + Chars.CROCKFORD32.name: crockford32(), + Chars.DECIMAL.name: decimal(), + Chars.HEX.name: hex_lower(), + Chars.HEX_UPPER.name: hex_upper(), + Chars.SAFE32.name: safe32(), + Chars.SAFE64.name: safe64(), + Chars.SAFE_ASCII.name: safe_ascii(), + Chars.SYMBOL.name: symbol(), + Chars.WORD_SAFE32.name: word_safe32(), + } + + +def encoder(chars: ValidChars) -> Callable[[int], int]: + if not _ENCODER_CACHE: + _ENCODER_CACHE.update(_init_encoder_map()) + + encoder_fn = _ENCODER_CACHE.get(chars.name) 
+ if encoder_fn is not None: + return encoder_fn return custom(chars) diff --git a/src/puid/encoders/transformer.py b/src/puid/encoders/transformer.py new file mode 100644 index 0000000..2bf0fe9 --- /dev/null +++ b/src/puid/encoders/transformer.py @@ -0,0 +1,110 @@ +from math import ceil, log2 +from typing import Callable, Tuple + +from puid.bits import bitShifts, isPow2, value_at + + +def accept_value_for(chars: str) -> Callable[[int], Tuple[bool, int]]: + """Create an accept/reject function for a character set.""" + n_chars = len(chars) + n_bits_per_char = ceil(log2(n_chars)) + + if isPow2(n_chars): + def always_accept(value: int) -> Tuple[bool, int]: + return (True, n_bits_per_char) + return always_accept + + shifts = bitShifts(n_chars) + + def accept_value_func(value: int) -> Tuple[bool, int]: + if value < n_chars: + return (True, n_bits_per_char) + + if len(shifts) == 1: + return (False, shifts[0][1]) + + bit_shift = [bs for bs in shifts if value <= bs[0]] + return (False, bit_shift[0][1]) + + return accept_value_func + + +def encode(chars: str, byte_data: bytearray) -> str: + """ + Encode bytes into a string using the provided character set. + + Example: + >>> from puid import Chars + >>> bytes_data = bytearray([0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb, + ... 
0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20]) + >>> encode(Chars.SAFE64, bytes_data) + 'CSWEPL3AiethdYFlCbSaVC' + """ + n_bits_per_char = ceil(log2(len(chars))) + n_entropy_bits = 8 * len(byte_data) + + if n_entropy_bits == 0: + return '' + + # Support both ValidChars and raw string + charset = chars.value if hasattr(chars, 'value') else chars + char_codes = [ord(c) for c in charset] + + accept_func = accept_value_for(charset) + + offset = 0 + codes = [] + + while offset + n_bits_per_char <= n_entropy_bits: + v = value_at(offset, n_bits_per_char, byte_data) + accept, shift = accept_func(v) + offset += shift + if accept: + codes.append(char_codes[v]) + + return ''.join(chr(code) for code in codes) + + +def decode(chars: str, text: str) -> bytearray: + """ + Decode a string of characters back into bytes using the provided character set. + Pads the final partial byte with zeros if the bit-length is not a multiple of 8. + + Example: + >>> from puid import Chars + >>> text = 'CSWEPL3AiethdYFlCbSaVC' + >>> bytes_data = decode(Chars.SAFE64, text) + >>> len(bytes_data) + 17 + """ + charset = chars.value if hasattr(chars, 'value') else chars + n_bits_per_char = ceil(log2(len(charset))) + + if not text: + return bytearray() + + char_map = {char: idx for idx, char in enumerate(charset)} + + acc = 0 + acc_bits = 0 + out = [] + + for char in text: + if char not in char_map: + raise ValueError(f'Invalid character for charset: {char}') + + val = char_map[char] + acc = (acc << n_bits_per_char) | val + acc_bits += n_bits_per_char + + while acc_bits >= 8: + shift = acc_bits - 8 + byte_val = (acc >> shift) & 0xff + out.append(byte_val) + acc_bits -= 8 + acc = acc & ((1 << acc_bits) - 1) + + if acc_bits > 0: + out.append((acc << (8 - acc_bits)) & 0xff) + + return bytearray(out) diff --git a/src/puid/entropy.py b/src/puid/entropy.py index 943e338..013e1dd 100644 --- a/src/puid/entropy.py +++ b/src/puid/entropy.py @@ -1,11 +1,12 @@ -from math import ceil, log2, trunc +from math 
import ceil, log2, sqrt, trunc +from typing import Union from puid.chars import ValidChars from puid.chars_error import InvalidChars from puid.puid_error import TotalRiskError -def bits_for_total_risk(total: 0, risk: 0): +def bits_for_total_risk(total: Union[int, float], risk: Union[int, float]) -> float: """ Entropy bits necessary to produce a `total` `puid`s with given `risk` of repeat @@ -33,13 +34,10 @@ def non_neg_int_or_float(value): if risk in [0, 1]: return 0 - if total < 1000: - return log2(total) + log2(total - 1) + log2(risk) - 1 - else: - return 2 * log2(total) + log2(risk) - 1 + return log2(total) + log2(total - 1) + log2(risk) - 1 -def bits_per_char(chars): +def bits_per_char(chars: ValidChars) -> float: """ Entropy bits per character for either a predefined Chars enum or a string of characters @@ -59,7 +57,7 @@ def bits_per_char(chars): raise InvalidChars('chars must be an instance of ValidChars') -def bits_for_len(chars, len): +def bits_for_len(chars: ValidChars, len: int) -> int: """ Bits necessary for a `puid` of length `len` using characters `chars` @@ -74,7 +72,7 @@ def bits_for_len(chars, len): return trunc(len * bits_per_char(chars)) -def len_for_bits(chars, bits): +def len_for_bits(chars: ValidChars, bits: Union[int, float]) -> int: """ Length necessary for a `puid` of `bits` using characters `chars` @@ -89,6 +87,52 @@ def len_for_bits(chars, bits): return ceil(bits / bits_per_char(chars)) +def risk_for_entropy(bits: Union[int, float], total: Union[int, float]) -> Union[int, float]: + """ + Risk given entropy `bits` after `total` IDs. + + This approximation is conservative and will underestimate the true risk. 
+ + :param bits: Entropy bits + :param total: Total number of IDs + :return: Risk of repeat (conservative approximation) + + >>> risk_for_entropy(96, 1.0e7) + 1584563250285288.0 + """ + if total in [0, 1]: + return 0 + + if bits <= 0: + return 0 + + n = log2(total) + log2(total - 1) + return 2 ** (bits - n + 1) + + +def total_for_entropy(bits: Union[int, float], risk: Union[int, float]) -> Union[int, float]: + """ + Total possible IDs given entropy `bits` and repeat `risk`. + + This exact inversion with flooring is conservative and will underestimate the true total. + + :param bits: Entropy bits + :param risk: Risk of repeat + :return: Total possible IDs (conservative approximation) + + >>> total_for_entropy(64, 1e9) + 192077.0 + """ + if bits <= 0: + return 0 + + if risk in [0, 1]: + return 0 + + c = 2 ** (bits + 1) / risk + return float(trunc((1 + sqrt(1 + 4 * c)) / 2)) + + if __name__ == '__main__': # pragma: no cover import doctest diff --git a/src/puid/puid.py b/src/puid/puid.py index 605110b..2ed1771 100644 --- a/src/puid/puid.py +++ b/src/puid/puid.py @@ -1,17 +1,39 @@ from math import ceil, log2 from secrets import token_bytes +from typing import Callable, Optional, Union from puid import Chars from puid.bits import muncher -from puid.chars import CustomChars, PredefinedChars +from puid.chars import CustomChars, PredefinedChars, ValidChars, metrics from puid.chars_error import InvalidChars from puid.encoder import encoder -from puid.entropy import bits_for_total_risk +from puid.encoders.transformer import encode as transform_encode +from puid.encoders.transformer import decode as transform_decode +from puid.entropy import bits_for_total_risk, risk_for_entropy, total_for_entropy from puid.puid_error import BitsError, TotalRiskError class Puid: - def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source=None): + __slots__ = ( + 'chars', + 'len', + 'bits', + 'entropy_source', + 'bits_muncher', + 'bits_per_char', + '_encoded', + 'ere', + 'ete', + ) + + 
def __init__( + self, + total: Optional[Union[int, float]] = None, + risk: Optional[Union[int, float]] = None, + bits: Optional[int] = None, + chars: Optional[Union[Chars, str]] = None, + entropy_source: Optional[Callable[[int], bytearray]] = None, + ) -> None: base_bits = None if bits is None and total is None and risk is None: @@ -32,7 +54,7 @@ def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source= base_bits = bits_for_total_risk(total, risk) if chars is None: - self.chars = PredefinedChars(Chars.SAFE64) + self.chars: ValidChars = PredefinedChars(Chars.SAFE64) elif isinstance(chars, Chars): self.chars = PredefinedChars(chars) elif isinstance(chars, str): @@ -42,31 +64,49 @@ def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source= n_chars = len(self.chars) n_bits_per_char = log2(n_chars) - self.len = round(ceil(base_bits / n_bits_per_char)) - self.bits = self.len * n_bits_per_char + self.len: int = round(ceil(base_bits / n_bits_per_char)) + self.bits: float = self.len * n_bits_per_char entropy_fn = entropy_source or token_bytes - self.entropy_source = f'{entropy_fn.__module__}.{entropy_fn.__name__}' + self.entropy_source: str = f'{entropy_fn.__module__}.{entropy_fn.__name__}' - self._chars_encoder = encoder(self.chars) - self.bits_muncher = muncher(n_chars, self.len, entropy_fn) + self.bits_muncher: Callable[[], list[int]] = muncher(n_chars, self.len, entropy_fn) - self.bits_per_char = n_bits_per_char + self.bits_per_char: float = n_bits_per_char chars_encoder = encoder(self.chars) - def encoded(values): + def encoded(values: list[int]) -> list[str]: return [chr(chars_encoder(value)) for value in values] - self._encoded = encoded + self._encoded: Callable[[list[int]], list[str]] = encoded - self.ere = (n_bits_per_char * n_chars) / (8 * len(self.chars.value.encode('utf-8'))) + self.ere: float = (n_bits_per_char * n_chars) / (8 * len(self.chars.value.encode('utf-8'))) + + char_metrics = metrics(self.chars.value) + 
self.ete: float = char_metrics.ete - def __repr__(self): + def __repr__(self) -> str: bits = round(self.bits, 2) bpc = round(self.bits_per_char, 2) return f'Puid: bits = {bits}, bits_per_char = {bpc}, chars = {self.chars}, len = {self.len}, ' 'ere = {self.ere}, entropy_source = {self.entropy_source}' - def generate(self): + def generate(self) -> str: values = self.bits_muncher() return "".join(self._encoded(values)) + + def encode(self, byte_data: bytearray) -> str: + """Encode bytes into a string using the generator's configured characters.""" + return transform_encode(self.chars.value, byte_data) + + def decode(self, text: str) -> bytearray: + """Decode a string back into bytes using the generator's configured characters.""" + return transform_decode(self.chars.value, text) + + def risk(self, total: Union[int, float]) -> Union[int, float]: + """Calculate the risk of repeat given a total number of IDs.""" + return risk_for_entropy(self.bits, total) + + def total(self, risk: Union[int, float]) -> Union[int, float]: + """Calculate the total possible IDs given a risk of repeat.""" + return total_for_entropy(self.bits, risk) diff --git a/tests/new_features_test.py b/tests/new_features_test.py new file mode 100644 index 0000000..19abcc4 --- /dev/null +++ b/tests/new_features_test.py @@ -0,0 +1,322 @@ +"""Tests for v2.1.0 and v2.2.0 features: encode, decode, risk, total, ete metrics.""" +import pytest + +from puid import Chars, Puid +from puid.encoders.transformer import encode, decode +from puid.entropy import risk_for_entropy, total_for_entropy + + +class TestEncodeTransformer: + """Tests for encode() transformer function.""" + + def test_encode_alpha_lower(self): + """Test encoding with AlphaLower charset.""" + bytes_data = bytearray([141, 138, 2, 168, 7, 11, 13, 0]) + result = encode(Chars.ALPHA_LOWER.value, bytes_data) + assert result == 'rwfafkahbmgq' + + def test_encode_hex_upper(self): + """Test encoding with HexUpper charset.""" + bytes_data = bytearray([0xc7, 
class TestDecodeTransformer:
    """Exercise the decode() transformer: text in, bytes out."""

    def test_decode_hex_upper(self):
        """HexUpper text decodes to the bytes it spells out."""
        decoded = decode(Chars.HEX_UPPER.value, 'C7C9002A1632')
        assert decoded == bytearray.fromhex('c7c9002a1632')

    def test_decode_safe64(self):
        """Safe64 text decodes to the expected 17-byte sequence."""
        want = bytearray.fromhex(
            '09 25 84 3c bd c0 89 eb '
            '61 75 81 65 09 b4 9a 54 20'
        )
        got = decode(Chars.SAFE64.value, 'CSWEPL3AiethdYFlCbSaVC')
        assert got == want

    def test_decode_empty_string(self):
        """An empty string decodes to an empty bytearray."""
        nothing = ''
        assert decode(Chars.ALPHA.value, nothing) == bytearray()

    def test_decode_invalid_character_raises(self):
        """A character outside the charset makes decode raise ValueError."""
        not_hex = 'G'
        with pytest.raises(ValueError, match='Invalid character'):
            decode(Chars.HEX.value, not_hex)

    def test_decode_pads_final_byte(self):
        """A single Base32 character yields one byte, the partial byte padded with zeros."""
        single_zero_byte = bytearray(b'\x00')
        assert decode(Chars.BASE32.value, 'A') == single_zero_byte
TestEncodeDecodeRoundtrip: + """Tests for encode/decode roundtrips.""" + + def test_roundtrip_safe32(self): + """Test encode/decode roundtrip with Safe32.""" + bytes_data = bytearray([0xd2, 0xe3, 0xe9, 0xda, 0x19, 0x03, 0xb7, 0x30]) + encoded = encode(Chars.SAFE32.value, bytes_data) + decoded = decode(Chars.SAFE32.value, encoded) + assert decoded == bytes_data + + def test_roundtrip_alpha_lower(self): + """Test encode/decode roundtrip with AlphaLower.""" + bytes_data = bytearray([141, 138, 2, 168, 7, 11, 13, 0]) + encoded = encode(Chars.ALPHA_LOWER.value, bytes_data) + decoded = decode(Chars.ALPHA_LOWER.value, encoded) + assert decoded == bytes_data + + def test_roundtrip_hex(self): + """Test encode/decode roundtrip with Hex.""" + bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0xbd]) + encoded = encode(Chars.HEX.value, bytes_data) + decoded = decode(Chars.HEX.value, encoded) + assert decoded == bytes_data + + +class TestPuidEncodeDecode: + """Tests for Puid.encode() and Puid.decode() methods.""" + + def test_puid_encode_decode_safe64(self): + """Test Puid encode/decode methods with Safe64.""" + p = Puid(chars=Chars.SAFE64) + bytes_data = bytearray([ + 0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb, + 0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20 + ]) + encoded = p.encode(bytes_data) + assert encoded == 'CSWEPL3AiethdYFlCbSaVC' + + decoded = p.decode(encoded) + assert decoded == bytes_data + + def test_puid_encode_custom_chars(self): + """Test Puid encode with custom characters.""" + p = Puid(chars='dingosky') + bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0xbd, 0x72]) + encoded = p.encode(bytes_data) + decoded = p.decode(encoded) + assert decoded == bytes_data + + def test_puid_encode_empty_bytes(self): + """Test Puid encode with empty bytes.""" + p = Puid() + result = p.encode(bytearray()) + assert result == '' + + def test_puid_decode_empty_string(self): + """Test Puid decode with empty string.""" + p = Puid() + result = p.decode('') + assert result == 
class TestRiskForEntropy:
    """Checks for the risk_for_entropy() function."""

    def test_risk_for_entropy_basic(self):
        """96 bits over 1e7 IDs gives a positive risk above 1e15."""
        repeat_risk = risk_for_entropy(96, 1.0e7)
        assert repeat_risk > 0
        # Expected value is roughly 1.58e15 (conservative estimate)
        assert repeat_risk > 1e15

    def test_risk_zero_total(self):
        """A total of 0 yields a risk of 0."""
        no_ids = 0
        assert risk_for_entropy(96, no_ids) == 0

    def test_risk_one_total(self):
        """A total of 1 yields a risk of 0."""
        single_id = 1
        assert risk_for_entropy(96, single_id) == 0

    def test_risk_zero_bits(self):
        """Zero entropy bits yields a risk of 0."""
        no_bits = 0
        assert risk_for_entropy(no_bits, 1e7) == 0

    def test_risk_negative_bits(self):
        """Negative entropy bits yields a risk of 0."""
        negative_bits = -10
        assert risk_for_entropy(negative_bits, 1e7) == 0
class TestEteMetric:
    """Checks on the Entropy Transform Efficiency (ete) value reported by Puid."""

    def test_ete_power_of_two_charset(self):
        """Predefined power-of-2 charsets report ete == 1.0."""
        # Safe64 has 64 characters (2^6), Hex has 16 (2^4), Base32 has 32 (2^5)
        for charset in (Chars.SAFE64, Chars.HEX, Chars.BASE32):
            generator = Puid(chars=charset)
            assert generator.ete == 1.0

    def test_ete_non_power_of_two_charset(self):
        """Charsets whose size is not a power of 2 report 0 < ete < 1.0."""
        # AlphaNum has 62 characters, Decimal has 10 -- neither is a power of 2
        for charset in (Chars.ALPHANUM, Chars.DECIMAL):
            generator = Puid(chars=charset)
            assert 0 < generator.ete < 1.0

    def test_ete_custom_power_of_two(self):
        """Custom charsets of 8 (2^3) and 4 (2^2) characters report ete == 1.0."""
        for custom in ('dingosky', 'ATCG'):
            generator = Puid(chars=custom)
            assert generator.ete == 1.0

    def test_ete_alpha_charset(self):
        """Alpha (52 chars, not a power of 2) is below 1.0 yet above 0.84."""
        generator = Puid(chars=Chars.ALPHA)
        assert 0 < generator.ete < 1.0
        # Efficiency should still be reasonable
        assert generator.ete > 0.84
def test_unicode_chars(util):
    """
    Generate IDs from a custom charset that mixes ASCII and non-ASCII characters.

    The entropy source comes from ``util.fixed_bytes`` (presumably a deterministic byte
    source built from the hex string -- confirm against the test helpers), so three
    consecutive ``generate()`` calls are compared against fixed expected IDs.
    """
    unicode_bytes = util.fixed_bytes('ec f9 db 7a 33 3d 21 97 a0 c2 bf 92 80 dd 2f 57 12 c1 1a ef')
    unicode_id = Puid(bits=24, chars='dîngøsky:₩', entropy_source=unicode_bytes)

    assert unicode_id.generate() == '₩gî₩₩nî₩'
    assert unicode_id.generate() == 'ydkîsnsd'
    assert unicode_id.generate() == 'îøsîndøk'