Removed img and started working on english training
This commit is contained in: parent 6eab835385, commit 4ae2e29e16
3 changed files with 734 additions and 547 deletions
178  classes/english/new_training.py  Normal file
@@ -0,0 +1,178 @@
import os
import time
import pickle
import numpy as np

from contextlib import nullcontext

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

import mlflow

from transformer import Transformer
from config import Config  # Config is used when building the model below

# Default config values
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
output_dir = 'chkpts'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'start'
compile = True  # torch.compile the model (requires PyTorch 2.0); checked further down

# Data loading
# dataset = 'english'
meta_vocab_size = None  # TODO: read the real vocab size from the dataset's metadata

# Hyperparams (these are currently based on nanoGPT)
device = "cuda" if torch.cuda.is_available() else "cpu"
# No-op autocast context used by estimate_loss() and the training loop;
# swap in torch.amp.autocast for mixed-precision training.
ctx = nullcontext()

## Transformer Architecture
gradient_accumulation_steps = 5*8
batch_size = 12
block_size = 1024
n_head = 6
n_layer = 6
n_emb = 384
dropout = 0.1
bias = False

## Adam optim (Modified from default)
learning_rate = 3e-5
beta1 = beta2 = 0.9
grad_clip = 1.0


# Save params in a dict for checkpointing & MLflow
model_args = dict(
    bias=bias,
    n_emb=n_emb,
    beta1=beta1,
    beta2=beta2,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    grad_clip=grad_clip,
    batch_size=batch_size,
    block_size=block_size,
    learning_rate=learning_rate,
)

# TODO: MLflow logging
mlflow_log = True
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment("Lyceum English Teacher")
mlflow.log_params(model_args)

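mlflow.log_params is called here at module level without an explicit run. One way to keep the params and all later metrics grouped under a single, named run would be an explicit run context; this is only a sketch of an option, not something in this commit, and the run name is made up:

# Hypothetical alternative: scope all logging to one explicit MLflow run.
with mlflow.start_run(run_name="english-transformer-baseline"):
    mlflow.log_params(model_args)
    # ... the training loop would go here, calling mlflow.log_metrics(...) per eval ...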
# Estimate loss over eval_iters batches on the train and val splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
        if mlflow_log:
            mlflow.log_metric(f"{split}_loss", out[split].item())
    model.train()
    return out


###### INIT MODEL ######

print(f'Initialising from {init_from}')
if init_from == 'start':
    iter_num = 0
    best_val_loss = float('inf')  # both are read later in the training loop
    if meta_vocab_size is None:
        # TODO: Figure out vocab_size from the dataset
        print("defaulting to the GPT-2 vocab_size of 50304 (50257 rounded up for efficiency)")
        model_args['vocab_size'] = 50304
    else:
        model_args['vocab_size'] = meta_vocab_size

    config = Config(**model_args)
    model = Transformer(config)
elif init_from == 'chkpt':
    chkpt_path = os.path.join(output_dir, 'chkpt.pt')
    checkpoint = torch.load(chkpt_path, map_location=device)
    chkpt_args = checkpoint['model_args']

    config = Config(**chkpt_args)
    model = Transformer(config)

    state_dict = checkpoint['model']
    # torch.compile saves params under an '_orig_mod.' prefix; strip it before loading
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']

if block_size < model.config.block_size:
    model.crop_block_size(block_size)

model.to(device)

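Config itself is not part of this diff (its import at the top was originally commented out). Below is a minimal sketch of the kind of container that Config(**model_args) and Transformer(config) appear to expect; the field names are assumptions taken from how model_args is built above, not the project's real config module:

# Hypothetical config.py sketch: a plain dataclass holding the keys of
# model_args plus vocab_size, so Config(**model_args) works as called above.
from dataclasses import dataclass

@dataclass
class Config:
    n_layer: int = 6
    n_head: int = 6
    n_emb: int = 384
    block_size: int = 1024
    vocab_size: int = 50304
    dropout: float = 0.1
    bias: bool = False
    batch_size: int = 12
    learning_rate: float = 3e-5
    beta1: float = 0.9
    beta2: float = 0.9
    grad_clip: float = 1.0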
###### OPTIMIZER ######
optimizer = model.configure_optim(learning_rate, (beta1, beta2), device)
if init_from == 'chkpt':
    optimizer.load_state_dict(checkpoint['optimizer'])
    del checkpoint

# Keep a handle on the uncompiled model; its state_dict is what gets checkpointed
raw_model = model

# Compile PyTorch model (Requires PyTorch 2.0)
if compile:
    print("compiling the model...")
    unoptimized_model = model
    model = torch.compile(model)

# NOTE: get_batch is not defined in this file yet; it is expected to come from
# the (still missing) data-loading code for the english dataset.
X, Y = get_batch('train')
start_time = time.time()
local_iter_num = 0
master_process = True  # TODO: derive from the DDP rank once distributed training is set up

running_mfu = -1.0

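The loop below leans on that get_batch helper. A sketch of the kind of loader it assumes, modeled on a nanoGPT-style memmap batcher; the data/english/{split}.bin paths and uint16 token format are assumptions, not part of this commit:

# Hypothetical get_batch sketch: sample random block_size windows from a
# binary token file and return (input, target) tensors on the training device.
def get_batch(split):
    data = np.memmap(os.path.join('data', 'english', f'{split}.bin'),
                     dtype=np.uint16, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in ix])
    return x.to(device), y.to(device)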
while True:
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if mlflow_log:
            mlflow.log_metrics({'train_loss': losses['train'].item(),
                                'val_loss': losses['val'].item()}, step=iter_num)
        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'config': config,
                    'iter_num': iter_num,
                    'model_args': model_args,
                    'best_val_loss': best_val_loss,
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                print(f"saving chkpt to '{output_dir}/chkpt.pt'")
                torch.save(checkpoint, os.path.join(output_dir, 'chkpt.pt'))

    if iter_num == 0 and eval_only:
        break

    for step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps
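The hunk ends here: in this commit the loop never calls backward(), never steps the optimizer, and never advances iter_num, so the training step is still unfinished. Below is a sketch of how the rest of a nanoGPT-style accumulation step usually continues, reusing grad_clip, log_interval, and get_batch from above; it is an assumption about where the file is heading, not part of the commit:

        # Assumed continuation (not in this commit): fetch the next batch,
        # backpropagate each micro-batch, then clip, step, and log.
        X, Y = get_batch('train')
        loss.backward()
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    if iter_num % log_interval == 0 and master_process:
        dt = time.time() - start_time
        start_time = time.time()
        print(f"iter {iter_num}: loss {loss.item() * gradient_accumulation_steps:.4f}, time {dt * 1000:.2f}ms")
    iter_num += 1
    local_iter_num += 1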
Binary file not shown.
Before: 17 KiB
1103  poetry.lock  generated
File diff suppressed because it is too large