# lyceum-env/lexicon/newtokenizer.py

import os
import re
from tqdm import tqdm


class Tokenizer:
    def __init__(self):
        self.prefixes = set()
        self.suffixes = set()
        self.vocab = set()
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>"]
        self.token_file = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Load a newline-separated token list into a set (empty if the file is missing)."""
        if not os.path.exists(file_path):
            return set()
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so empty strings never become prefixes or suffixes
            return set(line.strip() for line in f if line.strip())

    def generate_tokens(self, prefix_file, text_file, suffix_file):
        self.prefixes = self._load_list_from_file(prefix_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        with open(text_file, 'r', encoding='utf-8') as f:
            # Skip blank lines so empty strings never end up in the vocab
            words = set(line.strip() for line in f if line.strip())
        # Seed the vocab with the raw words; single-character words keep a
        # trailing space (e.g., "a ", "I ") so they are stored as standalone tokens
        self.vocab = {w + ' ' if len(w) == 1 else w for w in words}
        # Split each word into prefix/middle/suffix tokens and add them to the vocab
        for word in tqdm(words):
            tokens = self._split_word(word)
            self.vocab.update(tokens)
            print(f"{word} -> {tokens}")
        # Save all tokens to file
        self._save_tokens()

    def _save_tokens(self):
        with open(self.token_file, 'w', encoding='utf-8') as f:
            # Special tokens first, then vocab entries from longest to shortest
            for token in self.special_tokens:
                f.write(token + '\n')
            for token in sorted(self.vocab, key=len, reverse=True):
                f.write(token + '\n')
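
    # Illustrative layout of the generated tokens.txt (the vocab entries below
    # are made up; the real ones depend entirely on the input files):
    #   <PAD>
    #   <UNK>
    #   <CLS>
    #   <SEP>
    #   <MASK>
    #   tokenization
    #   token
    #   "a "   (single-character words keep their trailing space)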

    def _split_word(self, word):
        """Split a word into an optional prefix, middle tokens, and an optional suffix."""
        tokens = []
        suffix_token = None
        # Strip the longest matching prefix, if any
        for prefix in sorted(self.prefixes, key=len, reverse=True):
            if word.startswith(prefix):
                tokens.append(prefix)
                word = word[len(prefix):]
                break
        # Strip the longest matching suffix, if any (appended after the middle tokens)
        for suffix in sorted(self.suffixes, key=len, reverse=True):
            if word.endswith(suffix):
                suffix_token = suffix
                word = word[:-len(suffix)]
                break
        # Split the remaining middle part
        tokens.extend(self._split_compound_word(word))
        if suffix_token is not None:
            tokens.append(suffix_token)
        return tokens
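
    # Example (assuming "un" is listed in the prefix file, "ing" in the suffix
    # file, and no vocab entry matches inside the remaining middle "do"):
    #   _split_word("undoing") -> ["un", "do", "ing"]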

    def _split_compound_word(self, word):
        """Split a middle part on special characters (except '.') and by vocab lookups."""
        tokens = []
        if not word:
            return tokens
        # Split on any special character except '.', keeping the separators
        # because the pattern uses a capture group
        parts = re.split(r"([^\w.])", word)
        for part in parts:
            part = part.strip()  # Clean up any leading/trailing whitespace
            if part == '':
                continue
            if re.match(r"[^\w.]", part):  # A special character (except '.')
                # Attach the special character to the previous token (e.g., "p-")
                if tokens:
                    tokens[-1] += part
                else:
                    # No previous token exists, so keep it as a token of its own
                    tokens.append(part)
            else:
                # Split the part by vocab; fall back to the whole part if nothing matches
                sub_tokens = self._split_by_vocab(part)
                if not sub_tokens:
                    sub_tokens = [part]
                tokens.extend(sub_tokens)
        # A trailing special character has already been attached to the last
        # token inside the loop above, so no extra handling is needed here.
        # Replace any empty tokens with the unknown token as a safety net
        tokens = [token if token else "<UNK>" for token in tokens]
        return tokens
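
    # Example: for a hyphenated word whose pieces are not matched by the vocab,
    # the separators stay attached to the token on their left:
    #   _split_compound_word("state-of-the-art") -> ["state-", "of-", "the-", "art"]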

    def _split_by_vocab(self, word):
        """Greedily split a word into the longest matching vocab tokens.

        Returns an empty list if the word cannot be fully covered by vocab entries.
        """
        if not word:
            return []
        # Try candidates from the longest match down to the shortest
        for w in sorted(self.vocab, key=len, reverse=True):
            if word.startswith(w):
                remainder = word[len(w):]
                rest = self._split_by_vocab(remainder)
                if remainder and not rest:
                    # This candidate leaves an unsplittable remainder; try a shorter one
                    continue
                return [w] + rest
        return []
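
    # Example (assuming the vocab contains "token", "iz", and "ation", and no
    # longer vocab entry happens to match first):
    #   _split_by_vocab("tokenization") -> ["token", "iz", "ation"]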


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/prefixes.txt", "tokens/words.txt", "tokens/suffixes.txt")