import os
import re

from tqdm import tqdm


class Tokenizer:
    """Builds a token vocabulary from word, prefix, and suffix lists."""

    def __init__(self):
        self.prefixes = set()
        self.suffixes = set()
        self.vocab = set()
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>"]
        self.token_file = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Read one entry per line into a set; return an empty set if the file is missing."""
        if not os.path.exists(file_path):
            return set()
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def generate_tokens(self, prefix_file, text_file, suffix_file):
        self.prefixes = self._load_list_from_file(prefix_file)
        self.suffixes = self._load_list_from_file(suffix_file)

        with open(text_file, 'r', encoding='utf-8') as f:
            words = set(line.strip() for line in f)

        # Add single-character tokens with a trailing space (e.g., "a ", "I ")
        self.vocab = {w + ' ' if len(w) == 1 else w for w in words}

        # Process each word in the text file
        for word in tqdm(words):
            tokens = self._split_word(word)
            self.vocab.update(tokens)
            print(f"{word} -> {tokens}")

        # Save all tokens to file
        self._save_tokens()

    def _save_tokens(self):
        # Special tokens first, then vocabulary entries from longest to shortest
        with open(self.token_file, 'w', encoding='utf-8') as f:
            for token in self.special_tokens:
                f.write(token + '\n')
            for token in sorted(self.vocab, key=len, reverse=True):
                f.write(token + '\n')

    def _split_word(self, word):
        tokens = []

        # Strip the longest matching prefix, if any
        for prefix in sorted(self.prefixes, key=len, reverse=True):
            if word.startswith(prefix):
                tokens.append(prefix)
                word = word[len(prefix):]
                break

        # Strip the longest matching suffix, if any
        for suffix in sorted(self.suffixes, key=len, reverse=True):
            if word.endswith(suffix):
                tokens.append(suffix)
                word = word[:-len(suffix)]
                break

        # Split the remaining middle part
        tokens.extend(self._split_compound_word(word))

        return tokens

    def _split_compound_word(self, word):
        tokens = []
        if not word:
            return tokens

        # Split compound words on special characters (except '.'),
        # keeping the separators so they can be reattached below
        split_pattern = re.compile(r"([^\w.])")
        parts = re.split(split_pattern, word)

        for part in parts:
            part = part.strip()  # Clean up any leading/trailing whitespace
            if part == '':
                continue
            if re.match(r"[^\w.]", part):  # Special character (except '.')
                # Attach the special character to the previous token (e.g., "p-")
                if tokens:
                    tokens[-1] += part
                else:
                    # If no previous token exists, keep it as its own token
                    tokens.append(part)
            else:
                # Split the part against the known vocabulary
                sub_tokens = self._split_by_vocab(part)
                if not sub_tokens:
                    # If nothing in the vocab matches, keep the part as-is
                    sub_tokens = [part]
                tokens.extend(sub_tokens)

        # Ensure that a trailing special character is included
        if tokens and re.match(r"[^\w.]", word[-1]):
            tokens[-1] += word[-1]

        # Replace empty tokens with a fallback special token
        tokens = [token if token else "<UNK>" for token in tokens]

        return tokens

    def _split_by_vocab(self, word):
        """Split a word greedily by the longest matching tokens in the vocab."""
        tokens = []
        if not word:
            return tokens

        # Try the longest vocabulary entries first
        for w in sorted(self.vocab, key=len, reverse=True):
            if word.startswith(w):
                tokens.append(w)
                remainder = word[len(w):]
                tokens.extend(self._split_by_vocab(remainder))
                break

        return tokens
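
# Each input file is expected to hold one entry per line: prefixes in
# prefix_file, suffixes in suffix_file, and the raw word list in text_file
# (e.g. hypothetical prefix entries such as "un" or "re", one per line).
# The generated vocabulary, with the special tokens first, is written to
# tokens.txt.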

if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/prefixes.txt", "tokens/words.txt", "tokens/suffixes.txt")