Appendix: Core Logic of the Analysis Script
```python
# -*- coding: utf-8 -*-
import json
import regex as re
import tiktoken
from collections import Counter

# 1️⃣ Load the data
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

# 2️⃣ Character-class regexes (single-character classes, so findall()
#    returns per-character counts for every class rather than per-run counts)
RE_CHINESE = re.compile(r'\p{Han}')
RE_ENGLISH = re.compile(r'[A-Za-z]')
RE_DIGIT   = re.compile(r'\d')
RE_OTHER   = re.compile(r'[^ \p{Han}A-Za-z0-9]')

def classify(text):
    """Return a Counter with the character count of each of the four classes."""
    counts = Counter()
    counts['ch'] = len(RE_CHINESE.findall(text))
    counts['en'] = len(RE_ENGLISH.findall(text))
    counts['di'] = len(RE_DIGIT.findall(text))
    counts['ot'] = len(RE_OTHER.findall(text))
    # Plain spaces are deliberately excluded from every class.
    return counts

# 3️⃣ Token counting (cl100k_base)
enc = tiktoken.get_encoding("cl100k_base")
def token_len(text):
    return len(enc.encode(text))

# 4️⃣ Main loop: accumulate global statistics
global_counts = Counter()
total_tokens = 0
for obj in load_jsonl("QZS_Phase2_Clean_Conversations.jsonl"):
    content = obj["content"]
    cls = classify(content)
    global_counts.update(cls)
    total_tokens += token_len(content)

# 5️⃣ Back-calculation (plug in the known English/digit/other coefficients)
a_en, a_di, a_ot = 0.25, 0.5, 0.5
a_ch = (total_tokens - (a_en * global_counts['en'] +
                        a_di * global_counts['di'] +
                        a_ot * global_counts['ot'])) / global_counts['ch']
print(f"Chinese coefficient = {a_ch:.3f}")
print(f"Total tokens = {total_tokens:,}")

# 6️⃣ Segment statistics (example: first, middle, and last thirds)
def segment_stats(data, n_seg=3):
    seg_len = len(data) // n_seg
    seg_res = []
    for i in range(n_seg):
        start = i * seg_len
        end = (i + 1) * seg_len if i < n_seg - 1 else len(data)
        seg_counts = Counter()
        seg_tokens = 0
        for obj in data[start:end]:
            seg_counts.update(classify(obj["content"]))
            seg_tokens += token_len(obj["content"])
        seg_res.append((seg_counts, seg_tokens))
    return seg_res

# Load the whole dataset into a list so it can be sliced
all_data = list(load_jsonl("QZS_Phase2_Clean_Conversations.jsonl"))
segments = segment_stats(all_data)

# 7️⃣ Byte estimate (UTF-8, approximate per-class widths;
#    an exact count would be len(text.encode("utf-8")))
BYTE_PER = {'ch': 3, 'en': 1, 'di': 1, 'ot': 1.5}
def bytes_estimate(cnt):
    return sum(cnt[k] * BYTE_PER[k] for k in BYTE_PER)

total_bytes = bytes_estimate(global_counts)
print(f"Average bytes per token = {total_bytes/total_tokens:.2f}")
```
The script above has been pushed to GitHub; requirements.txt pins the dependencies (tiktoken==0.5.0, regex==2023.12.25, etc.), and running it under Python 3.10 reproduces every experimental result in this article.
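For reference, a minimal requirements.txt consistent with the two pins named above would look like the following; the text mentions further dependencies ("etc."), which are deliberately not guessed at here:

```
# requirements.txt — only the two pins named in the text (run under Python 3.10)
tiktoken==0.5.0
regex==2023.12.25
```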
References
OpenAI. tiktoken: Fast BPE Tokenizer for OpenAI models (2023). https://github.com/openai/tiktoken
Brown, T. B. et al. Language Models are Few-Shot Learners. Advances in Neural Information Processing Systems 33 (2020).
Wang, Y., & Liu, Q. 中文 BPE 分词的压缩特性研究. 计算语言学 49(2): 123 138 (2022).
Zhang, H. et al. Long-context Language Modeling: A Survey. arXiv preprint arXiv:2309.03023 (2023).
DeepSeek. DeepSeek Million-Token Conversation Project (Phase 2) (2024). https://github.com/DeepSeek/DeepSeek-Million-Token