# lyceum-env/lexicon/newnew.py
# Last modified: 2024-10-03 10:15:39 +02:00

import os
from tqdm import tqdm
class Tokenizer:
    """Morpheme-based tokenizer: splits compound words into root, prefix and
    suffix tokens using word lists loaded from plain-text files."""

    def __init__(self):
        # Reserved tokens commonly used by sequence models.
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>", "<SOS>", "<EOS>", "<BOS>"]
        self.token_dir = "tokens"
        self.token_filename = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Return the set of stripped lines contained in *file_path*.

        Raises:
            FileNotFoundError: if *file_path* does not exist.
        """
        if not os.path.exists(file_path):
            # Was NameError — FileNotFoundError is the correct exception
            # for a missing file and is what callers would expect to catch.
            raise FileNotFoundError(f"File {file_path} not found. Are you sure it exists and/or that the name is correct?")
        with open(file_path, 'r', encoding='utf-8') as f:
            # Iterate the file directly; .readlines() was redundant.
            return set(line.strip() for line in f)

    def generate_tokens(
        self,
        text_file,
        prefix_file=os.path.join("tokens", "prefixes.txt"),
        root_file=os.path.join("tokens", "roots.txt"),
        suffix_file=os.path.join("tokens", "suffix.txt"),
    ):
        """Decompose every word in *text_file* into root/prefix/suffix morphemes.

        Each matched morpheme is collected into ``self.token_set``, which is
        also returned for convenience (the original returned None; callers
        that ignore the return value are unaffected).

        Raises:
            FileNotFoundError: if any of the input files is missing.
        """
        self.token_set = set()
        self.prefixes = self._load_list_from_file(prefix_file)
        self.root_words = self._load_list_from_file(root_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        self.vocab = self._load_list_from_file(text_file)
        for compound_word in tqdm(self.vocab):
            compound_word = compound_word.strip()
            # Longest-first so the most specific root wins (was shortest-first,
            # inconsistent with the prefix/suffix passes below).
            for root_word in sorted(self.root_words, key=len, reverse=True):
                if root_word in compound_word:
                    self.token_set.add(root_word)
                    compound_word = compound_word.replace(root_word, '')
                    break
            # Longest-first prefix match on what remains of the word.
            for prefix in sorted(self.prefixes, key=len, reverse=True):
                if compound_word.startswith(prefix):
                    # Was assigned to an unused local and discarded.
                    self.token_set.add(prefix)
                    compound_word = compound_word[len(prefix):]
                    break
            # Longest-first suffix match on the remainder.
            for suffix in sorted(self.suffixes, key=len, reverse=True):
                if compound_word.endswith(suffix):
                    # Was assigned to an unused local and discarded.
                    self.token_set.add(suffix)
                    compound_word = compound_word[:-len(suffix)]
                    break
        return self.token_set
if __name__ == '__main__':
    tokenizer = Tokenizer()
    # Arguments were previously passed in the wrong order: the prefix file
    # landed in text_file and the word list in suffix_file.  The word list is
    # the first (text_file) argument; the morpheme files are passed by keyword
    # so the mapping is explicit.
    tokenizer.generate_tokens(
        "tokens/words.txt",
        prefix_file="tokens/prefixes.txt",
        root_file="tokens/roots.txt",
        suffix_file="tokens/suffixes.txt",
    )