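"""Rule-based morphological tokenizer.

Splits each word in a vocabulary file into a root plus an optional prefix and
suffix, using word lists stored under the ``tokens/`` directory.
"""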
import os

from tqdm import tqdm


class Tokenizer:
    """Breaks words into prefix, root, and suffix tokens using word-list files."""

    def __init__(self):
        # Reserved special tokens for downstream models.
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>", "<SOS>", "<EOS>", "<BOS>"]
        # Default location of the token vocabulary on disk.
        self.token_dir = "tokens"
        self.token_filename = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Read a newline-separated word list into a set of stripped strings."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(
                f"File {file_path} not found. Are you sure it exists and/or that the name is correct?"
            )
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def generate_tokens(
        self,
        text_file,
        prefix_file=os.path.join("tokens", "prefixes.txt"),
        root_file=os.path.join("tokens", "roots.txt"),
        suffix_file=os.path.join("tokens", "suffixes.txt"),
    ):
        """Decompose every word in ``text_file`` into root, prefix, and suffix tokens."""
        self.token_set = set()
        self.prefixes = self._load_list_from_file(prefix_file)
        self.root_words = self._load_list_from_file(root_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        self.vocab = self._load_list_from_file(text_file)
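
        # For each word: remove the shortest matching root, then strip the longest
        # matching prefix and the longest matching suffix from what remains.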
        for compound_word in tqdm(self.vocab):
            compound_word = compound_word.strip()
            print(compound_word)

            # Root: take the first root (shortest first) found anywhere in the word.
            for root_word in sorted(self.root_words, key=len):
                if root_word in compound_word:
                    print(compound_word)
                    self.token_set.add(root_word)
                    compound_word = compound_word.replace(root_word, '')
                    print('--------------------------------------------')
                    break

            # Prefix: the longest prefix the remaining string starts with.
            print(compound_word)
            for prefix in sorted(self.prefixes, key=len, reverse=True):
                if compound_word.startswith(prefix):
                    word_prefix = prefix
                    print(f"Prefix {prefix}")
                    compound_word = compound_word[len(prefix):]
                    break
            print(compound_word)

            # Suffix: the longest suffix the remaining string ends with.
            for suffix in sorted(self.suffixes, key=len, reverse=True):
                if compound_word.endswith(suffix):
                    word_suffix = suffix
                    print(f"Suffix {suffix}")
                    compound_word = compound_word[:-len(suffix)]
                    break

            print(compound_word)
            print('\n')
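

# The word-list files are plain text with one entry per line; for example, a
# hypothetical tokens/prefixes.txt might contain lines such as "un", "re", "pre".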
if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/words.txt", "tokens/prefixes.txt", "tokens/roots.txt", "tokens/suffixes.txt")