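"""Rule-based morphological tokenizer.

Splits each word in a vocabulary file into a root plus an optional prefix and
suffix, using word lists stored under the ``tokens/`` directory.
"""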
import os

from tqdm import tqdm


class Tokenizer:
    """Breaks words into prefix, root, and suffix tokens using word-list files."""

    def __init__(self):
        # Reserved special tokens for downstream models.
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>", "<SOS>", "<EOS>", "<BOS>"]
        # Default location of the token vocabulary on disk.
        self.token_dir = "tokens"
        self.token_filename = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Read a newline-separated word list into a set of stripped strings."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(
                f"File {file_path} not found. Are you sure it exists and/or that the name is correct?"
            )
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)

    def generate_tokens(
        self,
        text_file,
        prefix_file=os.path.join("tokens", "prefixes.txt"),
        root_file=os.path.join("tokens", "roots.txt"),
        suffix_file=os.path.join("tokens", "suffixes.txt"),
    ):
        """Decompose every word in ``text_file`` into root, prefix, and suffix tokens."""
        self.token_set = set()
        self.prefixes = self._load_list_from_file(prefix_file)
        self.root_words = self._load_list_from_file(root_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        self.vocab = self._load_list_from_file(text_file)
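
        # For each word: remove the shortest matching root, then strip the longest
        # matching prefix and the longest matching suffix from what remains.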
        for compound_word in tqdm(self.vocab):
            compound_word = compound_word.strip()
            print(compound_word)

            # Root: take the first root (shortest first) found anywhere in the word.
            for root_word in sorted(self.root_words, key=len):
                if root_word in compound_word:
                    print(compound_word)
                    self.token_set.add(root_word)
                    compound_word = compound_word.replace(root_word, '')
                    print('--------------------------------------------')
                    break

            # Prefix: the longest prefix the remaining string starts with.
            print(compound_word)
            for prefix in sorted(self.prefixes, key=len, reverse=True):
                if compound_word.startswith(prefix):
                    word_prefix = prefix
                    print(f"Prefix {prefix}")
                    compound_word = compound_word[len(prefix):]
                    break
            print(compound_word)

            # Suffix: the longest suffix the remaining string ends with.
            for suffix in sorted(self.suffixes, key=len, reverse=True):
                if compound_word.endswith(suffix):
                    word_suffix = suffix
                    print(f"Suffix {suffix}")
                    compound_word = compound_word[:-len(suffix)]
                    break

            print(compound_word)
            print('\n')
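

# The word-list files are plain text with one entry per line; for example, a
# hypothetical tokens/prefixes.txt might contain lines such as "un", "re", "pre".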
if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/words.txt", "tokens/prefixes.txt", "tokens/roots.txt", "tokens/suffixes.txt")