diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index fbf5dcf..2e7263e 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.0.0
+current_version = 2.0.0
commit = True
tag = True
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c3b5e05..5c471ca 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,52 @@
Changelog
=========
+-------------------
+v2.0.0 (2025-11-14)
+-------------------
+
+* Major feature additions
+ * Encode/decode transformer for byte ↔ string conversion
+ * Entropy risk and total calculation functions
+ * New Puid methods: ``encode()``, ``decode()``, ``risk()``, ``total()``
+ * ETE (Entropy Transform Efficiency) metric
+* Code quality improvements
+ * Full Python naming conventions (snake_case vs camelCase)
+ * Type hints throughout core modules
+ * Add ``__slots__`` to Puid and ValidChars classes
+ * Dictionary-based encoder selection (O(1) lookup)
+ * Comprehensive test suite (108 tests)
+* Breaking changes
+ * Function renames for Pythonic style:
+ * ``entropy_risk`` → ``risk_for_entropy``
+ * ``entropy_total`` → ``total_for_entropy``
+ * ``acceptValueFor`` → ``accept_value_for``
+ * ``CharMetrics.avgBits`` → ``avg_bits``
+
+-------------------
+v1.2.1 (2025-11-14)
+-------------------
+
+* Performance improvements
+ * Use dictionary dispatch for encoder selection (O(1) vs O(n))
+ * Cache encoder functions to avoid recreation
+ * Add ``__slots__`` to frequently instantiated classes
+* Code quality improvements
+ * Add comprehensive type hints
+ * Extract magic numbers to named constants
+ * Use namedtuple for multi-value returns
+ * Remove duplicate encoder creation
+
+-------------------
+v1.2.0 (2023-08-08)
+-------------------
+
+* Optimize bit shift
+* Add pre-defined char sets
+ * Base16 (RFC4648). Note: Same as HexUpper
+ * Crockford32
+ * WordSafe32 (Another avoid words strategy)
+
-------------------
v1.1.0 (2022-08-04)
-------------------
@@ -14,7 +60,6 @@ v1.1.0 (2022-08-04)
* Update README
* Create test helpers
-
-------------------
v1.0.0 (2022-07-29)
-------------------
diff --git a/README.md b/README.md
index 33d1c0e..2784ab9 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
Simple, flexible and efficient generation of probably unique identifiers (`puid`, aka random strings) of intuitively specified entropy using pre-defined or custom characters (including Unicode).
```python
-from puid import Chars, Puid
+from puid import Puid, Chars
rand_id = Puid(chars=Chars.ALPHA, total=1e5, risk=1e12)
rand_id.generate()
@@ -43,7 +43,7 @@ Random string generation can be thought of as a _transformation_ of some random
What characters are used in the ID?
- > `puid` provides 16 pre-defined character sets, as well as allows custom characters, including Unicode
+ > `puid` provides 19 pre-defined character sets, as well as allows custom characters, including Unicode
3. **ID randomness**
@@ -73,7 +73,9 @@ rand_id.generate()
from puid import Puid
from random import getrandbits
-prng_bytes = lambda n: bytearray(getrandbits(8) for _ in range(n))
+def prng_bytes(n):
+ return bytearray(getrandbits(8) for _ in range(n))
+
prng_id = Puid(entropy_source=prng_bytes)
prng_id.generate()
'JcQTr8u7MATncImOjO0qOS'
@@ -95,7 +97,7 @@ dingosky_id.generate()
'sdosigokdsdygooggogdggisndkogonksnkodnokosg'
unicode_id = Puid(chars='dîñgø$kyDÎÑGØßK¥')
-unicode_id.()
+unicode_id.generate()
'îGÎØÎÑî¥gK¥Ñ¥kîDîyøøØñÑØyd¥¥ØGØÑ$KߨgøÑ'
```
@@ -123,6 +125,36 @@ token.generate()
'5D241826F2A644E1B725DB1DD7E4BF742D9D0DC6D6A36F419046A02835A16B83'
```
+**Encode/Decode**
+
+Transform between bytes and strings using the generator's character set:
+
+```python
+from puid import Puid, Chars
+
+p = Puid(chars=Chars.SAFE64)
+bytes_data = bytearray([0x09, 0x25, 0x84, 0x3c])
+
+encoded = p.encode(bytes_data)
+# => 'CSWEP'
+
+decoded = p.decode(encoded)
+# => bytearray(b'\t%\x84<')
+```
+
+**Risk and Total Calculation**
+
+Calculate the risk or total given the generator's entropy:
+
+```python
+from puid import Puid
+
+p = Puid(bits=96)
+
+risk = p.risk(1e6) # risk of repeat for 1M IDs
+total = p.total(1e15) # total possible IDs for 1e15 risk
+```
+
[TOC](#TOC)
### Installation
@@ -163,16 +195,22 @@ conda install -c dingosky puid-py
- `chars`: `Chars.SAFE64`
- `entropy_source`: `secrets.token_bytes`
-#### PuidInfo
+#### Generator Methods and Properties
-The **Puid**'s `__repr__` function provides information regarding the generator configuration:
+The **Puid** instance provides the following:
-- `bits`: ID entropy
+- `generate()`: Generate a random ID
+- `encode(bytes)`: Encode bytes to string using the generator's characters
+- `decode(text)`: Decode a string to bytes using the generator's characters
+- `risk(total)`: Calculate risk of repeat for a given total number of IDs
+- `total(risk)`: Calculate total possible IDs for a given risk of repeat
+- `ete`: Entropy transform efficiency (0 < ETE ≤ 1.0)
+- `ere`: Entropy representation efficiency (0 < ERE ≤ 1.0)
+- `bits`: ID entropy bits
- `bits_per_char`: Entropy bits per ID character
-- `chars`: Source characters
-- `entropy_source`: String `module.function`
-- `ere`: Entropy representation efficiency
- `len`: ID string length
+- `chars`: Source characters
+- `entropy_source`: String `module.function` of entropy source
Example:
@@ -183,53 +221,56 @@ rand_id = Puid(total=1e5, risk=1e14, chars=Chars.BASE32)
rand_id.generate()
'7XKJJKNZBF7GCMEX'
-print(rand_id)
-Puid: bits = 80.0, bits_per_char = 5.0, chars = BASE32 -> '234567ABCDEFGHIJKLMNOPQRSTUVWXYZ', len = 16, ere = 0.625, entropy_source = secrets.token_bytes
+print(f"ETE: {rand_id.ete}, ERE: {rand_id.ere}")
+# ETE: 1.0, ERE: 0.625
+
+risk = rand_id.risk(1e5)
+total = rand_id.total(1e14)
```
### Chars
There are 19 pre-defined character sets:
-| Name | Characters |
-| :---------------- | :-------------------------------------------------------------------------------------------- |
-| :alpha | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz |
-| :alpha_lower | abcdefghijklmnopqrstuvwxyz |
-| :alpha_upper | ABCDEFGHIJKLMNOPQRSTUVWXYZ |
-| :alphanum | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 |
-| :alphanum_lower | abcdefghijklmnopqrstuvwxyz0123456789 |
-| :alphanum_upper | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 |
-| :base16 | 0123456789ABCDEF |
-| :base32 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 |
-| :base32_hex | 0123456789abcdefghijklmnopqrstuv |
-| :base32_hex_upper | 0123456789ABCDEFGHIJKLMNOPQRSTUV |
-| :crockford32 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ |
-| :decimal | 0123456789 |
-| :hex | 0123456789abcdef |
-| :hex_upper | 0123456789ABCDEF |
-| :safe_ascii | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^\_abcdefghijklmnopqrstuvwxyz{\|}~ |
-| :safe32 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT |
-| :safe64 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ |
-| :symbol | !#$%&()\*+,-./:;<=>?@[]^\_{\|}~ |
-| :wordSafe32 | 23456789CFGHJMPQRVWXcfghjmpqrvwx |
+| Name | Characters |
+| :--------------- | :-------------------------------------------------------------------------------------------- |
+| ALPHA | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz |
+| ALPHA_LOWER | abcdefghijklmnopqrstuvwxyz |
+| ALPHA_UPPER | ABCDEFGHIJKLMNOPQRSTUVWXYZ |
+| ALPHANUM | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 |
+| ALPHANUM_LOWER | abcdefghijklmnopqrstuvwxyz0123456789 |
+| ALPHANUM_UPPER | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 |
+| BASE16 | 0123456789ABCDEF |
+| BASE32 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 |
+| BASE32_HEX | 0123456789abcdefghijklmnopqrstuv |
+| BASE32_HEX_UPPER | 0123456789ABCDEFGHIJKLMNOPQRSTUV |
+| CROCKFORD32 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ |
+| DECIMAL | 0123456789 |
+| HEX | 0123456789abcdef |
+| HEX_UPPER | 0123456789ABCDEF |
+| SAFE_ASCII | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^\_abcdefghijklmnopqrstuvwxyz{\|}~ |
+| SAFE32 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT |
+| SAFE64 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ |
+| SYMBOL | !#$%&()\*+,-./:;<=>?@[]^\_{\|}~ |
+| WORD_SAFE32 | 23456789CFGHJMPQRVWXcfghjmpqrvwx |
Any string of up to 256 unique characters can be used for **`puid`** generation.
#### Description of non-obvious character sets
-| Name | Description |
-| :---------------- | :--------------------------------------------------------- |
-| :base16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 |
-| :base32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 |
-| :base32_hex | Lowercase of :base32_hex_upper |
-| :base32_hex_upper | https://datatracker.ietf.org/doc/html/rfc4648#section-7 |
-| :crockford32 | https://www.crockford.com/base32.html |
-| :safe_ascii | Printable ascii that does not require escape in String |
-| :safe32 | Alpha and numbers picked to reduce chance of English words |
-| :safe64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 |
-| :wordSafe32 | Alpha and numbers picked to reduce chance of English words |
+| Name | Description |
+| :--------------- | :--------------------------------------------------------- |
+| BASE16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 |
+| BASE32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 |
+| BASE32_HEX | Lowercase of BASE32_HEX_UPPER |
+| BASE32_HEX_UPPER | https://datatracker.ietf.org/doc/html/rfc4648#section-7 |
+| CROCKFORD32 | https://www.crockford.com/base32.html |
+| SAFE_ASCII | Printable ascii that does not require escape in String |
+| SAFE32 | Alpha and numbers picked to reduce chance of English words |
+| SAFE64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 |
+| WORD_SAFE32 | Alpha and numbers picked to reduce chance of English words |
-Note: :safe32 and :wordSafe32 are two different strategies for the same goal.
+Note: SAFE32 and WORD_SAFE32 are two different strategies for the same goal.
[TOC](#TOC)
@@ -252,7 +293,9 @@ A somewhat simplistic statement for entropy from information theory is: _entropy
Rather, a random string represents _captured_ entropy, entropy that was produced by _some other_ process. For example, you cannot look at the hex string **`'18f6303a'`** and definitively say it has 32 bits of entropy. To see why, suppose you run the following code snippet and get **`'18f6303a'`**:
```python
-rand_id = lambda: '18f6303a' if random.random() < 0.5 else '1'
+from random import random
+
+rand_id = lambda: '18f6303a' if random() < 0.5 else '1'
rand_id()
'18f6303a'
```
@@ -382,7 +425,7 @@ Now, suppose you are tasked to maintain this code:
```python
from puid import Chars, Puid
-rand_id = Puid(total=500000, risk=1e12, chars=Chars.ALPHANUM_LOWER)
+rand_id = Puid(total=500_000, risk=1e12, chars=Chars.ALPHANUM_LOWER)
```
Hmmm. Looks like there are 500,000 IDs expected and the repeat risk is 1 in a trillion. No guessing. The code is explicit. Oh, and by the way, the IDs are 15 characters long. But who cares? It's the ID randomness that matters, not the length.
@@ -433,7 +476,7 @@ Hmmm. Looks like there are 500,000 IDs expected and the repeat risk is 1 in a tr
```python
from puid import Chars, Puid
-Puid(chars=Chars.SAFE32, total=10e6, risk=1e15)
+rand_id = Puid(chars=Chars.SAFE32, total=10e6, risk=1e15)
rand_id.generate()
'RHR3DtnP9B3J748NdR87'
```
diff --git a/conda/meta.yaml b/conda/meta.yaml
index d48ba6c..e9f9292 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = "puid-py" %}
-{% set version = "1.2.0" %}
+{% set version = "2.0.0" %}
package:
name: "{{ name|lower }}"
diff --git a/pyproject.toml b/pyproject.toml
index fc9d53d..1a99bc6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[build-system]
requires = [
- "setuptools>=30.3.0",
+ "setuptools>=68.0.0",
"wheel",
]
diff --git a/setup.cfg b/setup.cfg
index d131b0a..e9708b0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,7 +29,6 @@ tests_require = pytest
test = pytest
[tool:isort]
-force_single_line = True
line_length = 120
known_first_party = puid
default_section = THIRDPARTY
diff --git a/setup.py b/setup.py
index 2092c1f..4b8a6bd 100755
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def read(*names, **kwargs):
setup(
name='puid-py',
- version='1.2.0',
+ version='2.0.0',
license='MIT',
description='Simple, flexible and efficient generation of probably unique identifiers (`puid`, '
'aka random strings) of intuitively specified entropy using pre-defined or custom characters, '
diff --git a/src/puid/__init__.py b/src/puid/__init__.py
index 4e68dd5..ecdb93f 100644
--- a/src/puid/__init__.py
+++ b/src/puid/__init__.py
@@ -1,2 +1,4 @@
+__version__ = '2.0.0'
+
from puid.chars import Chars
from puid.puid import Puid
diff --git a/src/puid/bits.py b/src/puid/bits.py
index 1d7ed44..1fe5812 100644
--- a/src/puid/bits.py
+++ b/src/puid/bits.py
@@ -1,5 +1,8 @@
+from collections import namedtuple
from math import ceil, floor, log2
+AcceptResult = namedtuple('AcceptResult', ['accept', 'shift'])
+
# Create array of minimum bits required to determine if a value is less than n_chars
# Array elements are of the form (n, bits): For values less than n, bits bits are required
#
@@ -13,7 +16,15 @@
#
-def bit_shifts(n_chars):
def isPow2(n: int) -> bool:
    """Return True when ``n`` is a positive integer with exactly one bit set.

    Zero and negative values are never reported as powers of two.
    """
    # n & (n - 1) clears the lowest set bit, so the result is zero only
    # when that bit was the sole one set.
    return n > 0 and (n & (n - 1)) == 0
+
+
+def bitShifts(n_chars: int) -> list:
+ """Create array of minimum bits required to determine if a value is less than n_chars."""
n_bits_per_char = ceil(log2(n_chars))
base_value = n_chars - 1 if n_chars % 2 == 0 else n_chars
@@ -34,6 +45,11 @@ def shift(bit):
return [base_shift] + [shift(bit) for bit in range(2, n_bits_per_char) if is_bit_zero(bit)]
def bit_shifts(n_chars):
    """Deprecated alias retained for existing callers; delegates to ``bitShifts``."""
    shifts = bitShifts(n_chars)
    return shifts
+
+
def fill_entropy(entropy_offset, entropy_bytes, entropy_fn):
n_bytes = len(entropy_bytes)
n_bits = 8 * n_bytes
@@ -81,12 +97,6 @@ def muncher(n_chars, puid_len, entropy_fn):
entropy_offset = n_entropy_bits
entropy_bytes = bytearray(buffer_len)
- def pow2(bit):
- return round(pow(2, bit))
-
- def is_pow2(n):
- return pow2(round(log2(n))) == n
-
counter = list(range(puid_len))
def sliced_value():
@@ -95,7 +105,7 @@ def sliced_value():
entropy_offset = fill_entropy(entropy_offset, entropy_bytes, entropy_fn)
return value_at(entropy_offset, n_bits_per_char, entropy_bytes)
- if is_pow2(n_chars):
+ if isPow2(n_chars):
# When chars count is a power of 2, sliced bits always yield a valid value
def bits_muncher():
def slice_value():
@@ -109,20 +119,20 @@ def slice_value():
return bits_muncher
- shifts = bit_shifts(n_chars)
+ shifts = bitShifts(n_chars)
def accept_value(value):
# Value is valid if it is less than the number of characters
if value < n_chars:
- return (True, n_bits_per_char)
+ return AcceptResult(True, n_bits_per_char)
# For invalid value, shift the minimal bits necessary to determine validity
# If only one item, no need to search
if len(shifts) == 1:
- return (False, shifts[0][1])
+ return AcceptResult(False, shifts[0][1])
bit_shift = [bs for bs in shifts if value <= bs[0]]
- return (False, bit_shift[0][1])
+ return AcceptResult(False, bit_shift[0][1])
def slice_value():
nonlocal entropy_offset
diff --git a/src/puid/chars.py b/src/puid/chars.py
index d7eab79..c4273b4 100644
--- a/src/puid/chars.py
+++ b/src/puid/chars.py
@@ -1,7 +1,14 @@
from enum import Enum
+from typing import Dict, NamedTuple
+from math import ceil, log2, pow
from puid.chars_error import InvalidChars, NonUniqueChars, TooFewChars, TooManyChars
+MIN_CHARS = 2
+MAX_CHARS = 256
+VALID_CHAR_MIN_CODE = 160
+INVALID_CHAR_THRESHOLD = ord('~')
+
def valid_chars(chars):
"""
@@ -21,8 +28,8 @@ def valid_chars(chars):
if not isinstance(chars, str):
raise InvalidChars('Characters must be a str')
- min_len = 2
- max_len = 256
+ min_len = MIN_CHARS
+ max_len = MAX_CHARS
if len(chars) < min_len:
raise TooFewChars(f'Must have at least {min_len} characters')
@@ -43,7 +50,7 @@ def valid_chars(chars):
def _valid_char(char):
code_point = ord(char)
- if 160 < code_point:
+ if VALID_CHAR_MIN_CODE < code_point:
return True
if char == '!':
@@ -56,7 +63,7 @@ def _valid_char(char):
return False
if char == '`':
return False
- if ord('~') < code_point:
+ if INVALID_CHAR_THRESHOLD < code_point:
return False
return True
@@ -96,6 +103,8 @@ def __len__(self):
class ValidChars:
"""Base class for PredefinedChars and CustomChars"""
+ __slots__ = ('name', 'value')
+
def __repr__(self):
return "{0} -> '{1}'".format(self.name, self.value)
@@ -156,6 +165,85 @@ def __init__(self, chars):
self.value = chars
class CharMetrics(NamedTuple):
    """Efficiency figures reported by ``metrics`` for one character set.

    Fields, in positional order:
      avg_bits -- average bits consumed per character during generation
      ere      -- entropy representation efficiency
      ete      -- entropy transform efficiency
    """

    avg_bits: float
    ere: float
    ete: float
+
+
+def _bits_consumed_on_reject(
+ charset_size: int, total_values: int, shifts: list
+) -> int:
+ """Calculate total bits consumed when rejecting values."""
+ sum_bits = 0
+ for value in range(charset_size, total_values):
+ bit_shift = None
+ for bs in shifts:
+ if value <= bs[0]:
+ bit_shift = bs
+ break
+ if bit_shift is None:
+ raise ValueError('Invalid bit_shifts: missing range')
+ sum_bits += bit_shift[1]
+ return sum_bits
+
+
+def _avg_bytes_per_char(chars: str) -> float:
+ """Calculate average byte size per character for a string."""
+ total_bytes = len(chars.encode('utf-8'))
+ return total_bytes / len(chars)
+
+
def metrics(chars: str) -> CharMetrics:
    """
    Compute efficiency metrics for the character set ``chars``.

    The returned CharMetrics holds:
      - avg_bits: Average bits consumed per character during generation
      - ere: Entropy representation efficiency (0 < ERE ≤ 1.0), rounded to 4 places
      - ete: Entropy transform efficiency (0 < ETE ≤ 1.0), rounded to 4 places

    Example:
        >>> metrics(Chars.SAFE64.value).ete
        1.0
    """
    from puid.bits import isPow2, bitShifts

    n_chars = len(chars)
    slice_bits = ceil(log2(n_chars))
    ideal_bits = log2(n_chars)
    shifts = bitShifts(n_chars)

    # ERE compares the ideal bits per character with the bits actually used
    # to store a character in UTF-8.
    stored_bits_per_char = _avg_bytes_per_char(chars) * 8
    ere = round(ideal_bits / stored_bits_per_char, 4)

    if isPow2(n_chars):
        # No slice is ever rejected, so each character costs exactly slice_bits.
        return CharMetrics(avg_bits=float(slice_bits), ere=ere, ete=1.0)

    slice_values = pow(2, slice_bits)
    p_accept = n_chars / slice_values
    p_reject = 1 - p_accept

    n_rejected = slice_values - n_chars
    rejected_bits = _bits_consumed_on_reject(n_chars, int(slice_values), shifts)

    # Each accepted character is preceded on average by p_reject / p_accept
    # rejections, each costing the mean rejected bit count.
    mean_rejected_bits = rejected_bits / n_rejected
    avg_bits = slice_bits + (p_reject / p_accept) * mean_rejected_bits

    return CharMetrics(
        avg_bits=avg_bits,
        ere=ere,
        ete=round(ideal_bits / avg_bits, 4),
    )
+
+
if __name__ == '__main__': # pragma: no cover
import doctest
diff --git a/src/puid/encoder.py b/src/puid/encoder.py
index c602efd..9b03bfd 100644
--- a/src/puid/encoder.py
+++ b/src/puid/encoder.py
@@ -1,4 +1,6 @@
-from puid.chars import Chars
+from typing import Callable, Dict
+
+from puid.chars import Chars, ValidChars
from puid.encoders.alpha import alpha
from puid.encoders.alpha import alpha_lower
from puid.encoders.alpha import alpha_upper
@@ -21,62 +23,38 @@
from puid.encoders.word_safe32 import word_safe32
-def encoder(chars: Chars):
- if chars.name == Chars.ALPHA.name:
- return alpha()
-
- if chars.name == Chars.ALPHA_LOWER.name:
- return alpha_lower()
-
- if chars.name == Chars.ALPHA_UPPER.name:
- return alpha_upper()
-
- if chars.name == Chars.ALPHANUM.name:
- return alphanum()
-
- if chars.name == Chars.ALPHANUM_LOWER.name:
- return alphanum_lower()
-
- if chars.name == Chars.ALPHANUM_UPPER.name:
- return alphanum_upper()
-
- if chars.name == Chars.BASE16.name:
- return base16()
-
- if chars.name == Chars.BASE32.name:
- return base32()
-
- if chars.name == Chars.BASE32_HEX.name:
- return base32_hex()
-
- if chars.name == Chars.BASE32_HEX_UPPER.name:
- return base32_hex_upper()
-
- if chars.name == Chars.CROCKFORD32.name:
- return crockford32()
-
- if chars.name == Chars.DECIMAL.name:
- return decimal()
-
- if chars.name == Chars.HEX.name:
- return hex_lower()
-
- if chars.name == Chars.HEX_UPPER.name:
- return hex_upper()
-
- if chars.name == Chars.SAFE32.name:
- return safe32()
-
- if chars.name == Chars.SAFE64.name:
- return safe64()
-
- if chars.name == Chars.SAFE_ASCII.name:
- return safe_ascii()
-
- if chars.name == Chars.SYMBOL.name:
- return symbol()
-
- if chars.name == Chars.WORD_SAFE32.name:
- return word_safe32()
-
+_ENCODER_CACHE: Dict[str, Callable[[int], int]] = {}
+
+
def _init_encoder_map() -> Dict[str, Callable[[int], int]]:
    """Build the name -> encoder table for every predefined character set.

    Each encoder factory is called once here, so the returned mapping holds
    the factory results keyed by the ``Chars`` member name.
    """
    factories = (
        (Chars.ALPHA, alpha),
        (Chars.ALPHA_LOWER, alpha_lower),
        (Chars.ALPHA_UPPER, alpha_upper),
        (Chars.ALPHANUM, alphanum),
        (Chars.ALPHANUM_LOWER, alphanum_lower),
        (Chars.ALPHANUM_UPPER, alphanum_upper),
        (Chars.BASE16, base16),
        (Chars.BASE32, base32),
        (Chars.BASE32_HEX, base32_hex),
        (Chars.BASE32_HEX_UPPER, base32_hex_upper),
        (Chars.CROCKFORD32, crockford32),
        (Chars.DECIMAL, decimal),
        (Chars.HEX, hex_lower),
        (Chars.HEX_UPPER, hex_upper),
        (Chars.SAFE32, safe32),
        (Chars.SAFE64, safe64),
        (Chars.SAFE_ASCII, safe_ascii),
        (Chars.SYMBOL, symbol),
        (Chars.WORD_SAFE32, word_safe32),
    )
    return {member.name: make_encoder() for member, make_encoder in factories}
+
+
def encoder(chars: ValidChars) -> Callable[[int], int]:
    """Return the encoder to use for ``chars``.

    The table of predefined encoders is built on first use and kept in
    ``_ENCODER_CACHE``. A name missing from that table falls back to
    ``custom(chars)``.
    """
    if len(_ENCODER_CACHE) == 0:
        _ENCODER_CACHE.update(_init_encoder_map())

    cached = _ENCODER_CACHE.get(chars.name)
    return custom(chars) if cached is None else cached
diff --git a/src/puid/encoders/transformer.py b/src/puid/encoders/transformer.py
new file mode 100644
index 0000000..2bf0fe9
--- /dev/null
+++ b/src/puid/encoders/transformer.py
@@ -0,0 +1,110 @@
+from math import ceil, log2
+from typing import Callable, Tuple
+
+from puid.bits import bitShifts, isPow2, value_at
+
+
def accept_value_for(chars: str) -> Callable[[int], Tuple[bool, int]]:
    """Create an accept/reject function for a character set.

    The returned callable takes a sliced value and returns ``(accept, shift)``:
    ``accept`` tells whether the value indexes a character of ``chars`` and
    ``shift`` is the number of bits to advance past.

    :param chars: String of characters; only its length is used here
    :return: Function mapping a value to an ``(accept, shift)`` tuple
    """
    n_chars = len(chars)
    # ``log2`` is already imported at module level; no dynamic import needed.
    n_bits_per_char = ceil(log2(n_chars))

    if isPow2(n_chars):
        # Every n_bits_per_char-wide slice is a valid index for a power-of-2 count.
        def always_accept(value: int) -> Tuple[bool, int]:
            return (True, n_bits_per_char)
        return always_accept

    shifts = bitShifts(n_chars)
    # Whether there is a single shift entry does not vary per call, so decide once.
    only_shift = shifts[0][1] if len(shifts) == 1 else None

    def accept_value_func(value: int) -> Tuple[bool, int]:
        # Value is valid if it is less than the number of characters
        if value < n_chars:
            return (True, n_bits_per_char)

        # If only one item, no need to search
        if only_shift is not None:
            return (False, only_shift)

        # Otherwise use the first entry whose limit covers the rejected value
        bit_shift = [bs for bs in shifts if value <= bs[0]]
        return (False, bit_shift[0][1])

    return accept_value_func
+
+
def encode(chars: str, byte_data: bytearray) -> str:
    """
    Encode bytes into a string using the provided character set.

    Bits are sliced from ``byte_data`` one character-width at a time. Slices
    that do not index a character are rejected and skipped by the shift that
    ``accept_value_for`` reports. Trailing bits too few to fill a slice are
    not encoded.

    :param chars: Character set, either a raw string or an object with a ``value`` attribute
    :param byte_data: Bytes to encode
    :return: Encoded string; empty when ``byte_data`` is empty

    Example:
        >>> from puid import Chars
        >>> bytes_data = bytearray([0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb,
        ...                         0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20])
        >>> encode(Chars.SAFE64, bytes_data)
        'CSWEPL3AiethdYFlCbSaVC'
    """
    # Support both ValidChars and raw string. Resolve the characters before
    # measuring them so the bit width always comes from the actual string,
    # matching what decode() does.
    charset = chars.value if hasattr(chars, 'value') else chars
    n_bits_per_char = ceil(log2(len(charset)))
    n_entropy_bits = 8 * len(byte_data)

    if n_entropy_bits == 0:
        return ''

    accept_func = accept_value_for(charset)

    offset = 0
    encoded = []

    while offset + n_bits_per_char <= n_entropy_bits:
        value = value_at(offset, n_bits_per_char, byte_data)
        accept, shift = accept_func(value)
        offset += shift
        if accept:
            encoded.append(charset[value])

    return ''.join(encoded)
+
+
def decode(chars: str, text: str) -> bytearray:
    """
    Turn a string of characters from ``chars`` back into bytes.

    Each character contributes ``ceil(log2(len(charset)))`` bits, most
    significant first. When the total bit count is not a multiple of 8, the
    last byte is padded on the right with zero bits.

    :param chars: Character set, either a raw string or an object with a ``value`` attribute
    :param text: String to decode
    :return: Decoded bytes; empty when ``text`` is empty
    :raises ValueError: if ``text`` contains a character not in the set

    Example:
        >>> from puid import Chars
        >>> text = 'CSWEPL3AiethdYFlCbSaVC'
        >>> bytes_data = decode(Chars.SAFE64, text)
        >>> len(bytes_data)
        17
    """
    charset = getattr(chars, 'value', chars)
    width = ceil(log2(len(charset)))

    if not text:
        return bytearray()

    index_of = {symbol: position for position, symbol in enumerate(charset)}

    decoded = bytearray()
    pending = 0        # bits collected but not yet emitted
    pending_bits = 0   # how many low bits of ``pending`` are meaningful

    for char in text:
        position = index_of.get(char)
        if position is None:
            raise ValueError(f'Invalid character for charset: {char}')

        pending = (pending << width) | position
        pending_bits += width

        # Emit whole bytes from the top of the pending bits
        while pending_bits >= 8:
            pending_bits -= 8
            decoded.append((pending >> pending_bits) & 0xff)
            pending &= (1 << pending_bits) - 1

    if pending_bits > 0:
        # Left-align the leftover bits in a final zero-padded byte
        decoded.append((pending << (8 - pending_bits)) & 0xff)

    return decoded
diff --git a/src/puid/entropy.py b/src/puid/entropy.py
index 943e338..013e1dd 100644
--- a/src/puid/entropy.py
+++ b/src/puid/entropy.py
@@ -1,11 +1,12 @@
-from math import ceil, log2, trunc
+from math import ceil, log2, sqrt, trunc
+from typing import Union
from puid.chars import ValidChars
from puid.chars_error import InvalidChars
from puid.puid_error import TotalRiskError
-def bits_for_total_risk(total: 0, risk: 0):
+def bits_for_total_risk(total: Union[int, float], risk: Union[int, float]) -> float:
"""
Entropy bits necessary to produce a `total` `puid`s with given `risk` of repeat
@@ -33,13 +34,10 @@ def non_neg_int_or_float(value):
if risk in [0, 1]:
return 0
- if total < 1000:
- return log2(total) + log2(total - 1) + log2(risk) - 1
- else:
- return 2 * log2(total) + log2(risk) - 1
+ return log2(total) + log2(total - 1) + log2(risk) - 1
-def bits_per_char(chars):
+def bits_per_char(chars: ValidChars) -> float:
"""
Entropy bits per character for either a predefined Chars enum or a string of characters
@@ -59,7 +57,7 @@ def bits_per_char(chars):
raise InvalidChars('chars must be an instance of ValidChars')
-def bits_for_len(chars, len):
+def bits_for_len(chars: ValidChars, len: int) -> int:
"""
Bits necessary for a `puid` of length `len` using characters `chars`
@@ -74,7 +72,7 @@ def bits_for_len(chars, len):
return trunc(len * bits_per_char(chars))
-def len_for_bits(chars, bits):
+def len_for_bits(chars: ValidChars, bits: Union[int, float]) -> int:
"""
Length necessary for a `puid` of `bits` using characters `chars`
@@ -89,6 +87,52 @@ def len_for_bits(chars, bits):
return ceil(bits / bits_per_char(chars))
def risk_for_entropy(bits: Union[int, float], total: Union[int, float]) -> Union[int, float]:
    """
    Risk of repeat for `total` IDs generated with entropy `bits`.

    Solves ``bits = log2(total) + log2(total - 1) + log2(risk) - 1`` for
    ``risk``. Returns 0 when `total` is 0 or 1, or when `bits` is not positive.

    :param bits: Entropy bits
    :param total: Total number of IDs
    :return: Risk of repeat

    >>> risk_for_entropy(96, 1.0e7)
    1584563250285288.0
    """
    if total in [0, 1] or bits <= 0:
        return 0

    total_bits = log2(total) + log2(total - 1)
    return 2 ** (bits - total_bits + 1)
+
+
def total_for_entropy(bits: Union[int, float], risk: Union[int, float]) -> Union[int, float]:
    """
    Total possible IDs given entropy `bits` and repeat `risk`.

    Solves ``total * (total - 1) = 2 ** (bits + 1) / risk`` for ``total`` and
    floors the positive root, so the result never overstates the total.
    Returns 0 when `bits` is not positive or `risk` is 0 or 1.

    :param bits: Entropy bits
    :param risk: Risk of repeat
    :return: Total possible IDs, floored to a whole number (as a float)

    >>> total_for_entropy(64, 1e9)
    192077.0
    """
    if bits <= 0:
        return 0

    if risk in [0, 1]:
        return 0

    c = 2 ** (bits + 1) / risk
    # Floor division applies the flooring the contract promises while keeping
    # the float return type shown in the doctest.
    return (1 + sqrt(1 + 4 * c)) // 2
+
+
if __name__ == '__main__': # pragma: no cover
import doctest
diff --git a/src/puid/puid.py b/src/puid/puid.py
index 605110b..2ed1771 100644
--- a/src/puid/puid.py
+++ b/src/puid/puid.py
@@ -1,17 +1,39 @@
from math import ceil, log2
from secrets import token_bytes
+from typing import Callable, Optional, Union
from puid import Chars
from puid.bits import muncher
-from puid.chars import CustomChars, PredefinedChars
+from puid.chars import CustomChars, PredefinedChars, ValidChars, metrics
from puid.chars_error import InvalidChars
from puid.encoder import encoder
-from puid.entropy import bits_for_total_risk
+from puid.encoders.transformer import encode as transform_encode
+from puid.encoders.transformer import decode as transform_decode
+from puid.entropy import bits_for_total_risk, risk_for_entropy, total_for_entropy
from puid.puid_error import BitsError, TotalRiskError
class Puid:
- def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source=None):
+ __slots__ = (
+ 'chars',
+ 'len',
+ 'bits',
+ 'entropy_source',
+ 'bits_muncher',
+ 'bits_per_char',
+ '_encoded',
+ 'ere',
+ 'ete',
+ )
+
+ def __init__(
+ self,
+ total: Optional[Union[int, float]] = None,
+ risk: Optional[Union[int, float]] = None,
+ bits: Optional[int] = None,
+ chars: Optional[Union[Chars, str]] = None,
+ entropy_source: Optional[Callable[[int], bytearray]] = None,
+ ) -> None:
base_bits = None
if bits is None and total is None and risk is None:
@@ -32,7 +54,7 @@ def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source=
base_bits = bits_for_total_risk(total, risk)
if chars is None:
- self.chars = PredefinedChars(Chars.SAFE64)
+ self.chars: ValidChars = PredefinedChars(Chars.SAFE64)
elif isinstance(chars, Chars):
self.chars = PredefinedChars(chars)
elif isinstance(chars, str):
@@ -42,31 +64,49 @@ def __init__(self, total=None, risk=None, bits=None, chars=None, entropy_source=
n_chars = len(self.chars)
n_bits_per_char = log2(n_chars)
- self.len = round(ceil(base_bits / n_bits_per_char))
- self.bits = self.len * n_bits_per_char
+ self.len: int = round(ceil(base_bits / n_bits_per_char))
+ self.bits: float = self.len * n_bits_per_char
entropy_fn = entropy_source or token_bytes
- self.entropy_source = f'{entropy_fn.__module__}.{entropy_fn.__name__}'
+ self.entropy_source: str = f'{entropy_fn.__module__}.{entropy_fn.__name__}'
- self._chars_encoder = encoder(self.chars)
- self.bits_muncher = muncher(n_chars, self.len, entropy_fn)
+ self.bits_muncher: Callable[[], list[int]] = muncher(n_chars, self.len, entropy_fn)
- self.bits_per_char = n_bits_per_char
+ self.bits_per_char: float = n_bits_per_char
chars_encoder = encoder(self.chars)
- def encoded(values):
+ def encoded(values: list[int]) -> list[str]:
return [chr(chars_encoder(value)) for value in values]
- self._encoded = encoded
+ self._encoded: Callable[[list[int]], list[str]] = encoded
- self.ere = (n_bits_per_char * n_chars) / (8 * len(self.chars.value.encode('utf-8')))
+ self.ere: float = (n_bits_per_char * n_chars) / (8 * len(self.chars.value.encode('utf-8')))
+
+ char_metrics = metrics(self.chars.value)
+ self.ete: float = char_metrics.ete
- def __repr__(self):
+ def __repr__(self) -> str:
bits = round(self.bits, 2)
bpc = round(self.bits_per_char, 2)
return f'Puid: bits = {bits}, bits_per_char = {bpc}, chars = {self.chars}, len = {self.len}, '
'ere = {self.ere}, entropy_source = {self.entropy_source}'
- def generate(self):
+ def generate(self) -> str:
values = self.bits_muncher()
return "".join(self._encoded(values))
+
+ def encode(self, byte_data: bytearray) -> str:
+ """Encode bytes into a string using the generator's configured characters."""
+ return transform_encode(self.chars.value, byte_data)
+
+ def decode(self, text: str) -> bytearray:
+ """Decode a string back into bytes using the generator's configured characters."""
+ return transform_decode(self.chars.value, text)
+
+ def risk(self, total: Union[int, float]) -> Union[int, float]:
+ """Calculate the risk of repeat given a total number of IDs."""
+ return risk_for_entropy(self.bits, total)
+
+ def total(self, risk: Union[int, float]) -> Union[int, float]:
+ """Calculate the total possible IDs given a risk of repeat."""
+ return total_for_entropy(self.bits, risk)
diff --git a/tests/new_features_test.py b/tests/new_features_test.py
new file mode 100644
index 0000000..19abcc4
--- /dev/null
+++ b/tests/new_features_test.py
@@ -0,0 +1,322 @@
+"""Tests for v2.0.0 features: encode, decode, risk, total, ete metrics."""
+import pytest
+
+from puid import Chars, Puid
+from puid.encoders.transformer import encode, decode
+from puid.entropy import risk_for_entropy, total_for_entropy
+
+
+class TestEncodeTransformer:
+ """Tests for encode() transformer function."""
+
+ def test_encode_alpha_lower(self):
+ """Test encoding with AlphaLower charset."""
+ bytes_data = bytearray([141, 138, 2, 168, 7, 11, 13, 0])
+ result = encode(Chars.ALPHA_LOWER.value, bytes_data)
+ assert result == 'rwfafkahbmgq'
+
+ def test_encode_hex_upper(self):
+ """Test encoding with HexUpper charset."""
+ bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0x16, 0x32])
+ result = encode(Chars.HEX_UPPER.value, bytes_data)
+ assert result == 'C7C9002A1632'
+
+ def test_encode_safe64(self):
+ """Test encoding with Safe64 charset."""
+ bytes_data = bytearray([
+ 0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb,
+ 0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20
+ ])
+ result = encode(Chars.SAFE64.value, bytes_data)
+ assert result == 'CSWEPL3AiethdYFlCbSaVC'
+
+ def test_encode_empty_bytes(self):
+ """Test encoding empty bytes returns empty string."""
+ result = encode(Chars.ALPHA.value, bytearray())
+ assert result == ''
+
+ def test_encode_custom_charset(self):
+ """Test encoding with custom charset."""
+ bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0xbd, 0x72])
+ result = encode('dingosky', bytes_data)
+ assert len(result) > 0
+
+
+class TestDecodeTransformer:
+ """Tests for decode() transformer function."""
+
+ def test_decode_hex_upper(self):
+ """Test decoding with HexUpper charset."""
+ text = 'C7C9002A1632'
+ result = decode(Chars.HEX_UPPER.value, text)
+ assert result == bytearray([0xc7, 0xc9, 0x00, 0x2a, 0x16, 0x32])
+
+ def test_decode_safe64(self):
+ """Test decoding with Safe64 charset."""
+ text = 'CSWEPL3AiethdYFlCbSaVC'
+ expected = bytearray([
+ 0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb,
+ 0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20
+ ])
+ result = decode(Chars.SAFE64.value, text)
+ assert result == expected
+
+ def test_decode_empty_string(self):
+ """Test decoding empty string returns empty bytes."""
+ result = decode(Chars.ALPHA.value, '')
+ assert result == bytearray()
+
+ def test_decode_invalid_character_raises(self):
+ """Test decoding invalid character raises ValueError."""
+ with pytest.raises(ValueError, match='Invalid character'):
+ decode(Chars.HEX.value, 'G')
+
+ def test_decode_pads_final_byte(self):
+ """Test decode pads final partial byte with zeros."""
+ result = decode(Chars.BASE32.value, 'A')
+ assert result == bytearray([0])
+
+
+class TestEncodeDecodeRoundtrip:
+ """Tests for encode/decode roundtrips."""
+
+ def test_roundtrip_safe32(self):
+ """Test encode/decode roundtrip with Safe32."""
+ bytes_data = bytearray([0xd2, 0xe3, 0xe9, 0xda, 0x19, 0x03, 0xb7, 0x30])
+ encoded = encode(Chars.SAFE32.value, bytes_data)
+ decoded = decode(Chars.SAFE32.value, encoded)
+ assert decoded == bytes_data
+
+ def test_roundtrip_alpha_lower(self):
+ """Test encode/decode roundtrip with AlphaLower."""
+ bytes_data = bytearray([141, 138, 2, 168, 7, 11, 13, 0])
+ encoded = encode(Chars.ALPHA_LOWER.value, bytes_data)
+ decoded = decode(Chars.ALPHA_LOWER.value, encoded)
+ assert decoded == bytes_data
+
+ def test_roundtrip_hex(self):
+ """Test encode/decode roundtrip with Hex."""
+ bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0xbd])
+ encoded = encode(Chars.HEX.value, bytes_data)
+ decoded = decode(Chars.HEX.value, encoded)
+ assert decoded == bytes_data
+
+
+class TestPuidEncodeDecode:
+ """Tests for Puid.encode() and Puid.decode() methods."""
+
+ def test_puid_encode_decode_safe64(self):
+ """Test Puid encode/decode methods with Safe64."""
+ p = Puid(chars=Chars.SAFE64)
+ bytes_data = bytearray([
+ 0x09, 0x25, 0x84, 0x3c, 0xbd, 0xc0, 0x89, 0xeb,
+ 0x61, 0x75, 0x81, 0x65, 0x09, 0xb4, 0x9a, 0x54, 0x20
+ ])
+ encoded = p.encode(bytes_data)
+ assert encoded == 'CSWEPL3AiethdYFlCbSaVC'
+
+ decoded = p.decode(encoded)
+ assert decoded == bytes_data
+
+ def test_puid_encode_custom_chars(self):
+ """Test Puid encode with custom characters."""
+ p = Puid(chars='dingosky')
+ bytes_data = bytearray([0xc7, 0xc9, 0x00, 0x2a, 0xbd, 0x72])
+ encoded = p.encode(bytes_data)
+ decoded = p.decode(encoded)
+ assert decoded == bytes_data
+
+ def test_puid_encode_empty_bytes(self):
+ """Test Puid encode with empty bytes."""
+ p = Puid()
+ result = p.encode(bytearray())
+ assert result == ''
+
+ def test_puid_decode_empty_string(self):
+ """Test Puid decode with empty string."""
+ p = Puid()
+ result = p.decode('')
+ assert result == bytearray()
+
+
+class TestRiskForEntropy:
+ """Tests for risk_for_entropy() function."""
+
+ def test_risk_for_entropy_basic(self):
+ """Test basic risk calculation."""
+ risk = risk_for_entropy(96, 1.0e7)
+ assert risk > 0
+ # Risk should be approximately 1.58e15 (conservative estimate)
+ assert risk > 1e15
+
+ def test_risk_zero_total(self):
+ """Test risk with zero total returns 0."""
+ assert risk_for_entropy(96, 0) == 0
+
+ def test_risk_one_total(self):
+ """Test risk with total=1 returns 0."""
+ assert risk_for_entropy(96, 1) == 0
+
+ def test_risk_zero_bits(self):
+ """Test risk with zero bits returns 0."""
+ assert risk_for_entropy(0, 1e7) == 0
+
+ def test_risk_negative_bits(self):
+ """Test risk with negative bits returns 0."""
+ assert risk_for_entropy(-10, 1e7) == 0
+
+
+class TestTotalForEntropy:
+ """Tests for total_for_entropy() function."""
+
+ def test_total_for_entropy_basic(self):
+ """Test basic total calculation."""
+ total = total_for_entropy(64, 1e9)
+ assert total > 0
+ # Total should be approximately 192077 (conservative estimate)
+ assert 190000 < total < 200000
+
+ def test_total_zero_bits(self):
+ """Test total with zero bits returns 0."""
+ assert total_for_entropy(0, 1e9) == 0
+
+ def test_total_negative_bits(self):
+ """Test total with negative bits returns 0."""
+ assert total_for_entropy(-10, 1e9) == 0
+
+ def test_total_zero_risk(self):
+ """Test total with zero risk returns 0."""
+ assert total_for_entropy(64, 0) == 0
+
+ def test_total_one_risk(self):
+ """Test total with risk=1 returns 0."""
+ assert total_for_entropy(64, 1) == 0
+
+
+class TestPuidRiskAndTotal:
+ """Tests for Puid.risk() and Puid.total() methods."""
+
+ def test_puid_risk_method(self):
+ """Test Puid.risk() method."""
+ p = Puid()
+ risk = p.risk(1e6)
+ assert risk > 0
+
+ def test_puid_total_method(self):
+ """Test Puid.total() method."""
+ p = Puid()
+ total = p.total(1e15)
+ assert total > 0
+
+ def test_puid_risk_and_total_consistency(self):
+ """Test that risk and total are reasonably inverse operations."""
+ p = Puid(total=1e6, risk=1e15, chars=Chars.SAFE32)
+
+ # Get calculated risk for 1e6 IDs
+ calc_risk = p.risk(1e6)
+ assert calc_risk > 0
+
+ # Get calculated total for 1e15 risk
+ calc_total = p.total(1e15)
+ assert calc_total > 0
+
+ def test_puid_risk_with_different_configs(self):
+ """Test Puid.risk() with different configurations."""
+ p1 = Puid(total=1e6, risk=1e15, chars=Chars.ALPHA)
+ risk1 = p1.risk(1e5)
+ assert risk1 > 0
+
+ p2 = Puid(bits=256)
+ risk2 = p2.risk(1e5)
+ assert risk2 > 0
+ assert risk2 > risk1 # More bits = higher capacity
+
+
+class TestEteMetric:
+ """Tests for Entropy Transform Efficiency (ete) metric."""
+
+ def test_ete_power_of_two_charset(self):
+ """Test ete is 1.0 for power-of-2 charsets."""
+ # Safe64 has 64 characters (2^6)
+ p = Puid(chars=Chars.SAFE64)
+ assert p.ete == 1.0
+
+ # Hex has 16 characters (2^4)
+ p = Puid(chars=Chars.HEX)
+ assert p.ete == 1.0
+
+ # Base32 has 32 characters (2^5)
+ p = Puid(chars=Chars.BASE32)
+ assert p.ete == 1.0
+
+ def test_ete_non_power_of_two_charset(self):
+ """Test ete is less than 1.0 for non-power-of-2 charsets."""
+ # AlphaNum has 62 characters (not power of 2)
+ p = Puid(chars=Chars.ALPHANUM)
+ assert 0 < p.ete < 1.0
+
+ # Decimal has 10 characters (not power of 2)
+ p = Puid(chars=Chars.DECIMAL)
+ assert 0 < p.ete < 1.0
+
+ def test_ete_custom_power_of_two(self):
+ """Test ete for custom power-of-2 charsets."""
+ # 8-char custom charset (2^3)
+ p = Puid(chars='dingosky')
+ assert p.ete == 1.0
+
+ # 4-char custom charset (2^2)
+ p = Puid(chars='ATCG')
+ assert p.ete == 1.0
+
+ def test_ete_alpha_charset(self):
+ """Test ete for Alpha charset (52 chars)."""
+ p = Puid(chars=Chars.ALPHA)
+ # 52 is not a power of 2, so ete should be < 1.0
+ assert 0 < p.ete < 1.0
+ assert p.ete > 0.84 # Should be reasonable efficiency
+
+class TestIntegration:
+ """Integration tests combining multiple features."""
+
+ def test_puid_with_encode_decode_risk_total(self):
+ """Test Puid instance using encode, decode, risk, and total."""
+ p = Puid(total=1e5, risk=1e12, chars=Chars.SAFE32)
+
+ # Generate some bytes (use complete byte boundaries for decode)
+ test_bytes = bytearray([0xd2, 0xe3, 0xe9, 0xda, 0x19])
+
+ # Encode to string
+ encoded = p.encode(test_bytes)
+ assert isinstance(encoded, str)
+
+ # Decode back to bytes (may be padded with zeros on partial byte)
+ decoded = p.decode(encoded)
+ # First 5 bytes should match
+ assert decoded[:5] == test_bytes
+
+ # Test risk calculation
+ risk = p.risk(1e5)
+ assert risk > 0
+
+ # Test total calculation
+ total = p.total(1e12)
+ assert total > 0
+
+ def test_multiple_puid_instances_independence(self):
+ """Test that multiple Puid instances work independently."""
+ p1 = Puid(chars=Chars.HEX, bits=64)
+ p2 = Puid(chars=Chars.ALPHA, bits=128)
+
+ bytes_data = bytearray([0x09, 0x25, 0x84, 0x3c])
+
+ # Each encodes with its own charset
+ encoded1 = p1.encode(bytes_data)
+ encoded2 = p2.encode(bytes_data)
+
+ # Different charsets produce different encodings
+ assert encoded1 != encoded2
+
+ # Each decodes correctly
+ assert p1.decode(encoded1) == bytes_data
+ assert p2.decode(encoded2) == bytes_data
diff --git a/tests/puid_test.py b/tests/puid_test.py
index ea8e140..3beffaa 100644
--- a/tests/puid_test.py
+++ b/tests/puid_test.py
@@ -290,6 +290,16 @@ def test_wordSafe32(util):
assert wordSafe32_id.generate() == "2PqX"
+def test_unicode_chars(util):
+ unicode_bytes = util.fixed_bytes('ec f9 db 7a 33 3d 21 97 a0 c2 bf 92 80 dd 2f 57 12 c1 1a ef')
+ unicode_id = Puid(bits=24, chars='dîngøsky:₩', entropy_source=unicode_bytes)
+
+ assert unicode_id.generate() == '₩gî₩₩nî₩'
+ assert unicode_id.generate() == 'ydkîsnsd'
+ assert unicode_id.generate() == 'îøsîndøk'
+ pass
+
+
def test_256_chars():
single_byte = Chars.SAFE64.value