train-tokenizer.py

import argparse
import json
import os

import yaml
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder


def load_config(config_path):
    """Load the YAML configuration file."""
    with open(config_path, 'r') as file:
        return yaml.safe_load(file)


def load_jsonl_texts(file_path):
    """Load and extract the 'text' field from each line of a JSONL file."""
    texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                if 'text' in item:
                    texts.append(item['text'])
            except json.JSONDecodeError:
                print(f"Warning: could not parse line: {line.strip()}")
    return texts
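
# Each input line is expected to be a standalone JSON object; only the
# 'text' key is read and any other keys are ignored, e.g.:
#   {"text": "one training document per line ..."}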


def batch_iterator(texts, batch_size=1000):
    """Yield successive batches of texts for tokenizer training."""
    for i in range(0, len(texts), batch_size):
        yield texts[i:i + batch_size]


def train_tokenizer(config):
    """Train a byte-level BPE tokenizer based on the provided configuration."""
    # Initialize the tokenizer with a BPE model
    tokenizer = Tokenizer(BPE())

    # Configure the pre-tokenizer for superword BPE: use_regex=False disables
    # the GPT-2 split pattern, so merges are not constrained by word boundaries
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False, use_regex=False)

    # Set up the normalizer
    tokenizer.normalizer = NFKC()

    # Set up the decoder
    tokenizer.decoder = ByteLevelDecoder()

    # Get special tokens from config (note: nested under data.tokenizer)
    special_tokens = list(
        config.get('data', {}).get('tokenizer', {}).get('special_tokens', {}).values()
    )

    # Get vocab size from config, defaulting to 32000
    vocab_size = config.get('tokenizer', {}).get('vocab_size', 32000)

    # Set up the trainer
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
        show_progress=True
    )

    # Load training data, optionally capped at max_texts_to_train_on
    input_file = config.get('data', {}).get('input_file', 'train.jsonl')
    texts = load_jsonl_texts(input_file)
    max_texts = config.get('data', {}).get('max_texts_to_train_on')
    if max_texts is not None:
        texts = texts[:max_texts]
    print(f"Training tokenizer on {len(texts)} texts with vocab size {vocab_size}")

    # Train the tokenizer
    tokenizer.train_from_iterator(batch_iterator(texts), trainer=trainer)

    # Create the output directory if it doesn't exist
    output_dir = config.get('tokenizer', {}).get('output_dir', 'tokenizer')
    os.makedirs(output_dir, exist_ok=True)

    # Save the tokenizer
    output_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer.save(output_path)
    print(f"Tokenizer saved to {output_path}")

    # Test the tokenizer on the first 100 characters of the first text
    if texts:
        test_text = texts[0][:100]
        encoded = tokenizer.encode(test_text)
        print("\nTest encoding:")
        print(f"Text: {test_text}")
        print(f"Tokens: {encoded.tokens}")
        print(f"IDs: {encoded.ids}")

    return tokenizer
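
# A quick way to reload and sanity-check the saved tokenizer afterwards
# (Tokenizer.from_file is the standard tokenizers-library loader; the path
# assumes the default 'tokenizer' output_dir used above):
#   tok = Tokenizer.from_file("tokenizer/tokenizer.json")
#   print(tok.encode("hello world").tokens)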


def main():
    parser = argparse.ArgumentParser(description="Train a BPE tokenizer using a YAML configuration")
    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    args = parser.parse_args()

    config = load_config(args.config)
    train_tokenizer(config)


if __name__ == "__main__":
    main()
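
# A minimal example config (a sketch inferred from the keys this script
# reads; the key layout matches the code above, but every value and the
# special-token strings are placeholders, not recommendations):
#
#   data:
#     input_file: train.jsonl          # JSONL with one {"text": ...} object per line
#     max_texts_to_train_on: 100000    # optional cap on training texts
#     tokenizer:
#       special_tokens:
#         pad_token: "<pad>"
#         eos_token: "</s>"
#   tokenizer:
#     vocab_size: 32000
#     output_dir: tokenizer
#
# Run it with: python train-tokenizer.py --config config.yaml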