# lyceum-env/lexicon/newnew.py

import os

from tqdm import tqdm


class Tokenizer:
    """Collect root-word tokens from a vocabulary using affix and root lists.

    The word lists are plain UTF-8 text files holding one entry per line.
    """

    def __init__(self):
        # Marker tokens; stored on the instance but not used by the methods below.
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>", "<SOS>", "<EOS>", "<BOS>"]
        # Directory name and file name for the token list; not read by the
        # methods of this class as it stands.
        self.token_dir = "tokens"
        self.token_filename = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Return the distinct, non-blank, whitespace-stripped lines of a file.

        Args:
            file_path: Path of a UTF-8 text file with one entry per line.

        Returns:
            A set of the stripped lines. Blank lines are dropped: an empty
            entry would match every word in ``generate_tokens`` and, as a
            suffix, ``word[:-0]`` would erase the whole word.

        Raises:
            NameError: If ``file_path`` does not exist. (``FileNotFoundError``
                would be the conventional type; ``NameError`` is kept so that
                existing callers catching it keep working.)
        """
        if not os.path.exists(file_path):
            raise NameError(f"File {file_path} not found. Are you sure it exists and/or that the name is correct?")
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            return {entry for entry in stripped_lines if entry}

    def generate_tokens(
        self,
        text_file,
        prefix_file=os.path.join("tokens", "prefixes.txt"),
        root_file=os.path.join("tokens", "roots.txt"),
        # NOTE(review): singular "suffix.txt", while the sibling defaults are
        # plural and the script entry point uses "tokens/suffixes.txt" --
        # confirm which file name is intended.
        suffix_file=os.path.join("tokens", "suffix.txt"),
    ):
        """Split every word of ``text_file`` and record the roots that were found.

        For each word the first matching root (shortest roots are tried first)
        is added to ``self.token_set`` and all of its occurrences are removed
        from the word. The longest matching prefix and then the longest
        matching suffix are stripped from what is left. Prefixes and suffixes
        are only printed, not stored. Intermediate states of each word are
        printed to stdout.

        Args:
            text_file: Path of the vocabulary file (one word per line).
            prefix_file: Path of the prefix list.
            root_file: Path of the root-word list.
            suffix_file: Path of the suffix list.

        Returns:
            None. Results are left in ``self.token_set``; the loaded lists are
            stored in ``self.prefixes``, ``self.root_words``, ``self.suffixes``
            and ``self.vocab``.

        Raises:
            NameError: If any of the four files does not exist.
        """
        self.token_set = set()
        self.prefixes = self._load_list_from_file(prefix_file)
        self.root_words = self._load_list_from_file(root_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        self.vocab = self._load_list_from_file(text_file)

        # The candidate lists do not change while words are processed, so sort
        # them once instead of once per word. Roots get a (length, text) key:
        # with length alone, ties would follow set iteration order, which can
        # differ between interpreter runs. Only one prefix/suffix of a given
        # length can match a word, so length alone is enough for those.
        # NOTE(review): roots are tried shortest first while affixes are tried
        # longest first -- confirm that this asymmetry is intended.
        roots_shortest_first = sorted(self.root_words, key=lambda root: (len(root), root))
        prefixes_longest_first = sorted(self.prefixes, key=len, reverse=True)
        suffixes_longest_first = sorted(self.suffixes, key=len, reverse=True)

        for compound_word in tqdm(self.vocab):

            compound_word = compound_word.strip()

            print(compound_word)
            for root_word in roots_shortest_first:
                if root_word in compound_word:
                    print(compound_word)
                    self.token_set.add(root_word)
                    # Removes every occurrence of the root, not just the first.
                    compound_word = compound_word.replace(root_word, '')
                    print('--------------------------------------------')
                    break

            print(compound_word)
            for prefix in prefixes_longest_first:
                if compound_word.startswith(prefix):
                    print(f"Prefix {prefix}")
                    compound_word = compound_word[len(prefix):]
                    break
            print(compound_word)

            for suffix in suffixes_longest_first:
                if compound_word.endswith(suffix):
                    print(f"Suffix {suffix}")
                    compound_word = compound_word[:-len(suffix)]
                    break

            print(compound_word)
            print('\n')

if __name__ == '__main__':
    # generate_tokens() takes the vocabulary file first, followed by the
    # prefix, root and suffix lists; keyword arguments keep each path bound to
    # the parameter it belongs to.
    tokenizer = Tokenizer()
    tokenizer.generate_tokens(
        text_file="tokens/words.txt",
        prefix_file="tokens/prefixes.txt",
        root_file="tokens/roots.txt",
        suffix_file="tokens/suffixes.txt",
    )
initial commit 2024-10-03 08:15:39 +00:00			`import os`

			`from tqdm import tqdm`


			`class Tokenizer:`
			`def __init__(self):`
			`self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>", "<SOS>", "<EOS>", "<BOS>"]`
			`self.token_dir = "tokens"`
			`self.token_filename = "tokens.txt"`

			`def _load_list_from_file(self, file_path):`
			`if not os.path.exists(file_path):`
			`raise NameError(f"File {file_path} not found. Are you sure it exists and/or that the name is correct?")`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`return set(line.strip() for line in f.readlines())`

			`def generate_tokens(`
			`self,`
			`text_file,`
			`prefix_file=os.path.join("tokens", "prefixes.txt"),`
			`root_file=os.path.join("tokens", "roots.txt"),`
			`suffix_file=os.path.join("tokens", "suffix.txt"),`
			`):`
			`self.token_set = set()`
			`self.prefixes = self._load_list_from_file(prefix_file)`
			`self.root_words = self._load_list_from_file(root_file)`
			`self.suffixes = self._load_list_from_file(suffix_file)`
			`self.vocab = self._load_list_from_file(text_file)`

			`for compound_word in tqdm(self.vocab):`

			`compound_word = compound_word.strip()`

			`print(compound_word)`
			`for root_word in sorted(self.root_words, key=len):`
			`if root_word in compound_word:`
			`print(compound_word)`
			`self.token_set.add(root_word)`
			`compound_word = compound_word.replace(root_word, '')`
			`print('--------------------------------------------')`
			`break`

			`print(compound_word)`
			`for prefix in sorted(self.prefixes, key=len, reverse=True):`
			`if compound_word.startswith(prefix):`
			`word_prefix = prefix`
			`print(f"Prefix {prefix}")`
			`compound_word = compound_word[len(prefix):]`
			`break`
			`print(compound_word)`

			`for suffix in sorted(self.suffixes, key=len, reverse=True):`
			`if compound_word.endswith(suffix):`
			`word_suffix = suffix`
			`print(f"Suffix {suffix}")`
			`compound_word = compound_word[:-len(suffix)]`
			`break`

			`print(compound_word)`
			`print('\n')`

			`if __name__ == '__main__':`
			`tokenizer = Tokenizer()`
			`tokenizer.generate_tokens("tokens/prefixes.txt", "tokens/roots.txt", "tokens/suffixes.txt", "tokens/words.txt")`