-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.py
More file actions
34 lines (30 loc) · 1006 Bytes
/
tokenizer.py
File metadata and controls
34 lines (30 loc) · 1006 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# src/flatcode/utils/tokenizer.py
import sys
from functools import lru_cache
try:
import tiktoken
except ImportError:
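# Handling a missing tiktoken is normally the CLI layer's job; here in the utils layer we only record its absence so that an ImportError can be raised when it is actually needed.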
tiktoken = None
class Tokenizer:
    """Token counter backed by a lazily created, class-wide tiktoken encoding.

    The encoding object is created on first use and cached on the class, so
    repeated calls to :meth:`count` reuse the same encoding.
    """

    # Cached encoding shared through the class; stays None until first use.
    _encoding = None

    @classmethod
    def get_encoding(cls):
        """Return the cached encoding, creating it on the first call.

        ``cl100k_base`` is tried first; if loading it fails for any reason,
        ``p50k_base`` is used instead.

        Returns:
            The encoding object returned by ``tiktoken.get_encoding``.

        Raises:
            ImportError: If the module-level ``tiktoken`` is ``None`` (the
                package could not be imported).
            Exception: Whatever the fallback ``tiktoken.get_encoding`` call
                raises if the second encoding cannot be loaded either.
        """
        if cls._encoding is None:
            if tiktoken is None:
                raise ImportError("tiktoken not installed")
            try:
                cls._encoding = tiktoken.get_encoding("cl100k_base")
            except Exception:
                # Deliberate best-effort: any failure loading the preferred
                # encoding falls back to the alternative one.
                cls._encoding = tiktoken.get_encoding("p50k_base")
        return cls._encoding

    @staticmethod
    def count(text: str) -> int:
        """Estimate the token count for *text*.

        The text is encoded with the shared encoding and the number of
        resulting tokens is returned. Special-token markers that happen to
        appear in the text (for example ``<|endoftext|>``) are tokenized as
        ordinary text instead of being rejected.

        If the encoding cannot be obtained or encoding fails, the count is
        approximated as ``len(text) // 4``.

        Args:
            text: The text to measure.

        Returns:
            The number of tokens, or the length-based approximation.
        """
        try:
            encoding = Tokenizer.get_encoding()
            # By default tiktoken raises ValueError when the text contains a
            # special-token marker, which would silently push us onto the
            # rough fallback below. Disabling the check encodes such markers
            # as plain text so the real token count is still produced.
            return len(encoding.encode(text, disallowed_special=()))
        except Exception:
            # Fallback estimation strategy: roughly four characters per token.
            return len(text) // 4