import os

from tqdm import tqdm


class Tokenizer:
    def __init__(self):
        # Reserved special-token placeholders (the original token names are elided).
        self.special_tokens = ["", "", "", "", "", "", "", ""]
        self.token_dir = "tokens"
        self.token_filename = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Read a newline-delimited word list into a set of stripped strings."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(
                f"File {file_path} not found. Check that it exists and that the name is correct."
            )
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def generate_tokens(
        self,
        text_file,
        prefix_file=os.path.join("tokens", "prefixes.txt"),
        root_file=os.path.join("tokens", "roots.txt"),
        suffix_file=os.path.join("tokens", "suffixes.txt"),
    ):
        """Split each vocabulary word into root, prefix, and suffix tokens."""
        self.token_set = set()
        self.prefixes = self._load_list_from_file(prefix_file)
        self.root_words = self._load_list_from_file(root_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        self.vocab = self._load_list_from_file(text_file)

        for compound_word in tqdm(self.vocab):
            compound_word = compound_word.strip()

            # Strip out the longest matching root first, so that short roots
            # (e.g. single letters) cannot shadow longer ones.
            for root_word in sorted(self.root_words, key=len, reverse=True):
                if root_word in compound_word:
                    self.token_set.add(root_word)
                    compound_word = compound_word.replace(root_word, '')
                    break

            # Greedily peel off the longest matching prefix from what remains.
            for prefix in sorted(self.prefixes, key=len, reverse=True):
                if compound_word.startswith(prefix):
                    self.token_set.add(prefix)
                    compound_word = compound_word[len(prefix):]
                    break

            # Likewise for the longest matching suffix.
            for suffix in sorted(self.suffixes, key=len, reverse=True):
                if compound_word.endswith(suffix):
                    self.token_set.add(suffix)
                    compound_word = compound_word[:-len(suffix)]
                    break


if __name__ == '__main__':
    tokenizer = Tokenizer()
    # The word list is the positional text_file argument; the affix lists
    # match the defaults but are spelled out here for clarity.
    tokenizer.generate_tokens(
        "tokens/words.txt",
        prefix_file="tokens/prefixes.txt",
        root_file="tokens/roots.txt",
        suffix_file="tokens/suffixes.txt",
    )
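
# A minimal sketch of persisting the collected tokens, assuming the otherwise
# unused token_dir / token_filename attributes were intended for this purpose.
# save_tokens is a hypothetical helper, not part of the original script; it
# could be called right after generate_tokens() in the block above.
def save_tokens(tokenizer):
    """Write tokenizer.token_set to token_dir/token_filename, one token per line."""
    os.makedirs(tokenizer.token_dir, exist_ok=True)
    out_path = os.path.join(tokenizer.token_dir, tokenizer.token_filename)
    with open(out_path, 'w', encoding='utf-8') as f:
        for token in sorted(tokenizer.token_set):
            f.write(token + '\n')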