import os
import re

from tqdm import tqdm


class Tokenizer:
    def __init__(self):
        self.prefixes = set()
        self.suffixes = set()
        self.vocab = set()
        # NOTE: the original special-token strings were lost (they appear to have
        # been stripped angle-bracket tags); the five names below are assumed
        # placeholders.
        self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>", "<mask>"]
        self.token_file = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Load one entry per line from file_path, returning an empty set if it is missing."""
        if not os.path.exists(file_path):
            return set()
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def generate_tokens(self, prefix_file, text_file, suffix_file):
        self.prefixes = self._load_list_from_file(prefix_file)
        self.suffixes = self._load_list_from_file(suffix_file)

        with open(text_file, 'r', encoding='utf-8') as f:
            words = set(line.strip() for line in f)

        # Add single-character tokens with a trailing space (e.g., "a ", "I ")
        self.vocab = {w + ' ' if len(w) == 1 else w for w in words}

        # Process each word in the text file
        for word in tqdm(words):
            tokens = self._split_word(word)
            self.vocab.update(tokens)
            print(f"{word} -> {tokens}")

        # Save all tokens to file
        self._save_tokens()

    def _save_tokens(self):
        with open(self.token_file, 'w', encoding='utf-8') as f:
            for token in self.special_tokens:
                f.write(token + '\n')
            for token in sorted(self.vocab, key=len, reverse=True):
                f.write(token + '\n')

    def _split_word(self, word):
        tokens = []
        suffix_token = None

        # Check for the longest matching prefix
        for prefix in sorted(self.prefixes, key=len, reverse=True):
            if word.startswith(prefix):
                tokens.append(prefix)
                word = word[len(prefix):]
                break

        # Check for the longest matching suffix (appended after the middle part)
        for suffix in sorted(self.suffixes, key=len, reverse=True):
            if word.endswith(suffix):
                suffix_token = suffix
                word = word[:-len(suffix)]
                break

        # Split the remaining middle part, then append the suffix so the token
        # order matches the original word
        tokens.extend(self._split_compound_word(word))
        if suffix_token is not None:
            tokens.append(suffix_token)
        return tokens

    def _split_compound_word(self, word):
        tokens = []
        if not word:
            return tokens

        # Special handling of compound words with special characters (except '.')
        split_pattern = re.compile(r"([^\w.])")
        parts = split_pattern.split(word)

        for part in parts:
            part = part.strip()  # Clean up any leading/trailing whitespace
            if part == '':
                continue
            if re.match(r"[^\w.]", part):
                # If it's a special character (except '.'), attach it to the
                # previous token (e.g., "p-")
                if tokens:
                    tokens[-1] += part
                else:
                    # If no previous token exists, treat it as its own token
                    tokens.append(part)
            else:
                # Split the part into known vocabulary tokens
                sub_tokens = self._split_by_vocab(part)
                if not sub_tokens:
                    # If no tokens were found in the vocab, keep the part as-is
                    sub_tokens = [part]
                tokens.extend(sub_tokens)

        # Ensure that a trailing special character is included exactly once
        if tokens and re.match(r"[^\w.]", word[-1]) and not tokens[-1].endswith(word[-1]):
            tokens[-1] += word[-1]

        # Replace empty tokens with a fallback special token
        # (assumed to be "<unk>"; the original string was lost)
        tokens = [token if token else "<unk>" for token in tokens]
        return tokens

    def _split_by_vocab(self, word):
        """Helper method to split a word by the longest matching tokens in the vocab."""
        tokens = []
        if not word:
            return tokens

        # Start from the longest match down to the shortest
        for w in sorted(self.vocab, key=len, reverse=True):
            if word.startswith(w):
                tokens.append(w)
                remainder = word[len(w):]
                tokens.extend(self._split_by_vocab(remainder))
                break
        return tokens


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/prefixes.txt", "tokens/words.txt", "tokens/suffixes.txt")
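

# --- Optional usage sketch (not part of the original script) ---
# A minimal illustration of how the generated tokens.txt could be consumed,
# reusing the same greedy longest-match idea as _split_by_vocab above. The
# encode() helper and its arguments are hypothetical; it treats every line of
# the token file (including the special tokens) as an ordinary vocab entry.

def encode(text, token_file="tokens.txt"):
    """Greedily encode text with the longest matching tokens from token_file."""
    with open(token_file, 'r', encoding='utf-8') as f:
        # Strip only the newline so tokens with trailing spaces (e.g., "a ") survive.
        vocab = [line.rstrip('\n') for line in f if line.strip()]
    vocab.sort(key=len, reverse=True)  # longest match first

    tokens = []
    i = 0
    while i < len(text):
        for token in vocab:
            if text.startswith(token, i):
                tokens.append(token)
                i += len(token)
                break
        else:
            # No vocab entry matches at this position: fall back to one character.
            tokens.append(text[i])
            i += 1
    return tokens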