# lyceum-env/lexicon/newtokenizer.py

import os
import re
from tqdm import tqdm


class Tokenizer:
    def __init__(self):
        self.prefixes = set()
        self.suffixes = set()
        self.vocab = set()
        self.special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>"]
        self.token_file = "tokens.txt"

    def _load_list_from_file(self, file_path):
        """Load a newline-separated token list into a set (empty if the file is missing)."""
        if not os.path.exists(file_path):
            return set()
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so empty strings never become prefixes or suffixes
            return set(line.strip() for line in f if line.strip())

    def generate_tokens(self, prefix_file, text_file, suffix_file):
        self.prefixes = self._load_list_from_file(prefix_file)
        self.suffixes = self._load_list_from_file(suffix_file)
        with open(text_file, 'r', encoding='utf-8') as f:
            # Skip blank lines so empty strings never end up in the vocab
            words = set(line.strip() for line in f if line.strip())
        # Seed the vocab with the raw words; single-character words keep a
        # trailing space (e.g., "a ", "I ") so they are stored as standalone tokens
        self.vocab = {w + ' ' if len(w) == 1 else w for w in words}
        # Split each word into prefix/middle/suffix tokens and add them to the vocab
        for word in tqdm(words):
            tokens = self._split_word(word)
            self.vocab.update(tokens)
            print(f"{word} -> {tokens}")
        # Save all tokens to file
        self._save_tokens()

    def _save_tokens(self):
        with open(self.token_file, 'w', encoding='utf-8') as f:
            # Special tokens first, then vocab entries from longest to shortest
            for token in self.special_tokens:
                f.write(token + '\n')
            for token in sorted(self.vocab, key=len, reverse=True):
                f.write(token + '\n')
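
    # Illustrative layout of the generated tokens.txt (the vocab entries below
    # are made up; the real ones depend entirely on the input files):
    #   <PAD>
    #   <UNK>
    #   <CLS>
    #   <SEP>
    #   <MASK>
    #   tokenization
    #   token
    #   "a "   (single-character words keep their trailing space)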

    def _split_word(self, word):
        """Split a word into an optional prefix, middle tokens, and an optional suffix."""
        tokens = []
        suffix_token = None
        # Strip the longest matching prefix, if any
        for prefix in sorted(self.prefixes, key=len, reverse=True):
            if word.startswith(prefix):
                tokens.append(prefix)
                word = word[len(prefix):]
                break
        # Strip the longest matching suffix, if any (appended after the middle tokens)
        for suffix in sorted(self.suffixes, key=len, reverse=True):
            if word.endswith(suffix):
                suffix_token = suffix
                word = word[:-len(suffix)]
                break
        # Split the remaining middle part
        tokens.extend(self._split_compound_word(word))
        if suffix_token is not None:
            tokens.append(suffix_token)
        return tokens
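
    # Example (assuming "un" is listed in the prefix file, "ing" in the suffix
    # file, and no vocab entry matches inside the remaining middle "do"):
    #   _split_word("undoing") -> ["un", "do", "ing"]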

    def _split_compound_word(self, word):
        """Split a middle part on special characters (except '.') and by vocab lookups."""
        tokens = []
        if not word:
            return tokens
        # Split on any special character except '.', keeping the separators
        # because the pattern uses a capture group
        parts = re.split(r"([^\w.])", word)
        for part in parts:
            part = part.strip()  # Clean up any leading/trailing whitespace
            if part == '':
                continue
            if re.match(r"[^\w.]", part):  # A special character (except '.')
                # Attach the special character to the previous token (e.g., "p-")
                if tokens:
                    tokens[-1] += part
                else:
                    # No previous token exists, so keep it as a token of its own
                    tokens.append(part)
            else:
                # Split the part by vocab; fall back to the whole part if nothing matches
                sub_tokens = self._split_by_vocab(part)
                if not sub_tokens:
                    sub_tokens = [part]
                tokens.extend(sub_tokens)
        # A trailing special character has already been attached to the last
        # token inside the loop above, so no extra handling is needed here.
        # Replace any empty tokens with the unknown token as a safety net
        tokens = [token if token else "<UNK>" for token in tokens]
        return tokens
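
    # Example: for a hyphenated word whose pieces are not matched by the vocab,
    # the separators stay attached to the token on their left:
    #   _split_compound_word("state-of-the-art") -> ["state-", "of-", "the-", "art"]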

    def _split_by_vocab(self, word):
        """Greedily split a word into the longest matching vocab tokens.

        Returns an empty list if the word cannot be fully covered by vocab entries.
        """
        if not word:
            return []
        # Try candidates from the longest match down to the shortest
        for w in sorted(self.vocab, key=len, reverse=True):
            if word.startswith(w):
                remainder = word[len(w):]
                rest = self._split_by_vocab(remainder)
                if remainder and not rest:
                    # This candidate leaves an unsplittable remainder; try a shorter one
                    continue
                return [w] + rest
        return []
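
    # Example (assuming the vocab contains "token", "iz", and "ation", and no
    # longer vocab entry happens to match first):
    #   _split_by_vocab("tokenization") -> ["token", "iz", "ation"]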


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.generate_tokens("tokens/prefixes.txt", "tokens/words.txt", "tokens/suffixes.txt")