Move project to nyrid melite

This commit is contained in:
aethrvmn 2024-10-23 00:26:05 +02:00
commit 3cd93f27b5
11 changed files with 6073 additions and 0 deletions

3
.gitignore vendored Executable file

@ -0,0 +1,3 @@
nimcache/
nimblecache/
htmldocs/

61
LICENSE Normal file

@ -0,0 +1,61 @@
Don't Be Evil License (DBEL) 1.0
1. Acceptance
By using, copying, modifying, or distributing the source code, training data, training environment, or its associated machine learning model weights (collectively the "Software"), you agree to comply with all terms outlined in this license.
2. Copyright License
The Licensor (defined below) grants you a non-exclusive, worldwide, royalty-free, non-sublicensable, non-transferable license to use, copy, modify, and distribute the Software, including associated model weights, training data, and training environments, subject to the conditions set forth in this license. This includes the right to create and distribute derivative works of the Software, provided that the limitations below are observed.
3. Non-Commercial Use Only
You may use, copy, modify, and distribute the Software and derivative works solely for non-commercial purposes. Non-commercial purposes include, but are not limited to:
- Personal research and study.
- Educational and academic projects.
- Public knowledge and hobby projects.
- Religious observance.
- Non-commercial research, or AI and machine learning (ML) experimentation.
4. Distribution and Monetization Provisions
Any use of the Software or derivative works for profit, or in a business context, including in monetized services and products, requires explicit, separate permission from the Licensor. The restrictions on commercial use apply to both the source code and any model weights produced by the Software.
Any distribution must include this license, and the non-commercial restriction must be maintained. Weights resulting from use of the Software, including but not limited to training or fine-tuning models, must be shared under this same license, ensuring all restrictions and conditions are preserved.
5. Integrity of the Licensor's Software
You may not alter, remove, or obscure any functionalities related to payment, donation, or attribution in any distributed version of the Licensed Materials. You must retain all notices of copyright, licensing, and attribution provided by the Licensor in any derivative works.
You may not alter or remove copyright, license, or trademark notices in the Software, and any public mention of the Software must include attribution to the Licensor.
6. Patents
This license grants you a patent license under any patents held by the Licensor that are directly related to the Software. If you or your company make any claim that the Software infringes on a patent, your rights under this license terminate immediately.
7. Distribution of Modifications
If you modify the Software, you must:
- Provide prominent and clear notice of any modifications
- Retain all original notices of copyright, licensing, and attribution to the Licensor.
- Distribute modified versions under this license.
8. Fair Use
Nothing under this license restricts your rights under applicable laws regarding fair use of copyrighted material.
9. No Other Rights
These terms do not allow you to sublicense, assign, or transfer any of your rights to third parties, except as expressly allowed by the terms.
These terms do not prevent the Licensor from granting licenses to anyone else.
These terms do not imply any other licenses.
No other rights beyond those explicitly stated are granted.
10. Termination
Your rights under this license will automatically terminate if you breach any of its terms. The Licensor may provide you with a 30-day period to rectify any breach. If you fail to do so, or if you breach the terms again after rectification, your license will terminate permanently.
11. Disclaimer of Warranty
The Licensed Materials are provided “as-is”, without any warranties, express or implied, including but not limited to warranties of fitness for a particular purpose. The Licensor is not liable for any claims or damages arising from your use of the Licensed Materials.
12. Definitions
- "Licensor": The entity or individual offering the Licensed Materials under this license.
- "Licensed Materials": The software, source code, training data, training environment, model weights, and any associated AI/ML components provided under this license.
- "You": The individual or entity accepting the terms of this license, including any organization or entity that this individual or entity might work for or represent, including any entities under common control.
- "Your license": The license granted to you for the software under this terms.
- "Model weights": The machine learning model parameters generated by training or fine-tuning models using the Licensed Materials.
- "Use": Anything you do with the software requiring your license
- "Trademark": Trademarks, service marks, and similar rights.

3
README.md Executable file

@ -0,0 +1,3 @@
# melite
an exploration of NLP bigram and n-gram models in Nim, written to learn both the language and NLP
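
(Not part of the repository; a minimal sketch, using only the Nim standard library, of what "bigram model" means here: counting how often each pair of adjacent characters occurs in a corpus. The proc name and toy corpus are made up for illustration.)

import std/tables

# Count adjacent character pairs ("bigrams") in a corpus string.
proc bigramCounts(text: string): CountTable[string] =
  result = initCountTable[string]()
  for i in 0 ..< text.len - 1:
    result.inc(text[i..i+1])

when isMainModule:
  let counts = bigramCounts("the theory of the thing")
  echo counts["th"]   # how often 't' is immediately followed by 'h'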

34
batcher.nim Executable file

@ -0,0 +1,34 @@
import hparams
import arraymancer

### CPU Part Starts Here
# var trainingBlock: seq[int] = trainingSet[0..blockSize]
# var trainingBlockNext: seq[int] = trainingSet[1..blockSize+1]
# for i in 0..blockSize-1:
#   var context = trainingBlock[0..i+1]
#   var target = trainingBlockNext[i]
#   echo "when input is ", context, " target is ", target
#[
The above is done sequentially on the CPU as a baseline, since I can't afford a GPU.
Below is the batched implementation intended for the GPU. We can (and probably will) run it
on the CPU as well, but Arraymancer can target the device at compile time with a flag
(-d:cuda), so we don't need PyTorch-style .to_device('cuda') calls. More testing is
definitely needed.
]#

proc getBatch*(split: string, trainingSet: seq[int], validationSet: seq[int]): (Tensor[int], Tensor[int]) =
  # Pick the split to sample from.
  var data: seq[int]
  if split == "train":
    data = trainingSet
  else:
    data = validationSet
  # batchSize random starting offsets, each leaving room for a full block.
  let ix = randomTensor(shape=[batchSize], max=len(data)-blockSize)
  # Seed x/y with the first sampled block so concat has a tensor to grow.
  var
    x: Tensor[int] = [data[ix[0]..<ix[0]+blockSize-1]].toTensor()
    y: Tensor[int] = [data[ix[0]+1..<ix[0]+blockSize]].toTensor()
  # Stack the remaining blocks row by row; y is x shifted one step ahead.
  for i in ix[1..len(ix)-1]:
    x = x.concat([data[i..<i+blockSize-1]].toTensor(), axis=0)
    y = y.concat([data[i+1..<i+blockSize]].toTensor(), axis=0)
  result = (x, y)
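
A rough usage sketch (not part of this file), assuming trainingSet and validationSet already exist as encoded seq[int] corpora, as they do in bigram.nim:

let (xb, yb) = getBatch("train", trainingSet, validationSet)
echo xb.shape              # expected: [batchSize, blockSize - 1]
echo yb.shape              # expected: [batchSize, blockSize - 1]
echo xb[0, 1] == yb[0, 0]  # each target row should be its input row shifted by one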

133
bigram.nim Executable file

@ -0,0 +1,133 @@
import std / [ tables, os, strformat ]
import random
import arraymancer
import ./batcher
import ./hparams
import ./generator
import ./textEncoder

randomize()

###### Text encoding
let vocabSize: int = stringToInt.len()
var encodedText: seq[int] = encodeString(textContent, stringToInt)

###### Split corpus into training and validation sets #######
const percentageTraining = 80 # how much % of the corpus is used for training.
let trainingSetEnd: int = (percentageTraining*encodedText.len/100).int
let trainingSet: seq[int] = encodedText[0..trainingSetEnd]
let validationSet: seq[int] = encodedText[trainingSetEnd..textContent.len-1]

###### Define NN
let ctx = newContext Tensor[float32]

network Nimertes:
  layers:
    encoder: Embedding(vocabSize, hiddenSize)
    hiddenLinear: Linear(hiddenSize, hiddenSize)
    outputLayer: Linear(hiddenSize, vocabSize)
  forward x:
    x.encoder.tanh.hiddenLinear.tanh.hiddenLinear.tanh.outputLayer

###### Save/Load Model
proc saveModel(ctx: Context[AnyTensor[float32]], model: Nimertes, dir: string) =
  echo "\nsaving model..."
  # Walk every layer and dump each weight/bias tensor to its own .npy file.
  for layer, layerField in model.fieldPairs:
    var layerName = layer
    for field, tensorVariable in layerField.fieldPairs:
      var fieldName = field
      when tensorVariable is Variable[Tensor[float32]]:
        tensorVariable.value.writeNPY(dir/fmt"{layerName}_{fieldName}.npy")
      else:
        discard
  echo "model saved"

proc initModel(ctx: Context[AnyTensor[float32]], model: Nimertes, dir: string): Nimertes =
  echo "\nweights exist"
  echo "\nloading model..."
  for layer, _ in model.fieldPairs:
    var layerName = layer
    case layerName
    of "encoder":
      model.encoder.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
    of "hiddenLinear":
      model.hiddenLinear.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
      model.hiddenLinear.bias.value = readNPY[float32](dir/fmt"{layerName}_bias.npy")
    of "outputLayer":
      model.outputLayer.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
      model.outputLayer.bias.value = readNPY[float32](dir/fmt"{layerName}_bias.npy")
    else:
      discard
  echo "model loaded\n"
  return model

#### Initialize NN
var
  model = ctx.init(Nimertes)
  optim = model.optimizer(Adam, learningRate=3e-4'f32, beta1=0.9'f32, beta2=0.9'f32, eps=1e-5'f32)

if fileExists("tinyBiGram/encoder_weight.npy"):
  model = ctx.initModel(model, "tinyBiGram")

###### Generate Text
proc generateText(ctx: Context[AnyTensor[float32]], model: Nimertes, seedCharacters="Wh", seqLen=blockSize, temperature=0.8'f32): string =
  ctx.no_grad_mode:
    let primer = encodeString(seedCharacters, stringToInt).toTensor.unsqueeze(1)
    result = seedCharacters
    var
      input = primer[^1, _]
      output: Variable[Tensor[float32]]
    for _ in 0 ..< seqLen:
      output = model.forward(input.squeeze(0))
      var preds = output.value
      # Temperature scaling before the softmax.
      preds /.= temperature
      let probs = preds.softmax().squeeze(0)
      # Sample and append to result
      let encodedChar = probs.sample()
      result &= decodeString(encodedChar, intToString)
      input = newTensor[int](1, 1)
      input[0, 0] = encodedChar

###### Training
var totalLoss: seq[float]
var plotidx: seq[float]

for i in 0..numEpochs:
  var
    (trainingBatch, trainingBatchNext) = getBatch("train", trainingSet, validationSet)
    output: Variable[Tensor[float32]]
    batchLoss: Variable[Tensor[float32]]
  if i mod evalIter == 0:
    # Periodically sample some text and checkpoint the weights.
    echo "\n", ctx.generateText(model), "\n"
    ctx.saveModel(model, "tinyBiGram")
  else:
    for j in 0 ..< batchSize:
      var
        inputTensor: Tensor[int] = trainingBatch[j, _]
        targetTensor: Tensor[int] = trainingBatchNext[j, _]
      output = model.forward(inputTensor.squeeze(0))
      batchLoss = output.sparseSoftmaxCrossEntropy(target=targetTensor.squeeze(0))
      batchLoss.backprop()
      optim.update()
    totalLoss.add(batchLoss.value[0])
    plotidx.add(i.float)

###### Plot results and show final output
echo ctx.generateText(model)

21
generator.nim Executable file

@ -0,0 +1,21 @@
import std/algorithm
import random
import arraymancer

proc searchsorted[T](x: openArray[T], value: T, leftSide: static bool = true): int =
  # Index at which `value` would be inserted to keep `x` sorted.
  when leftSide:
    result = x.lowerBound(value)
  else:
    result = x.upperBound(value)

proc sample*(probs: Tensor[float32]): int =
  # Inverse-CDF sampling: draw u ~ U(0, 1) and find where it lands in the
  # cumulative distribution of `probs`.
  var
    rng = initRand()
  let
    u = rng.rand(1.0'f32)
    cdf = cumsum(probs, axis=0)
    cdfA = cast[ptr UncheckedArray[float32]](cdf.unsafeRawOffset)
  result = cdfA.toOpenArray(0, cdf.size-1).searchsorted(u, leftSide=false)
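
A hedged usage sketch (not part of this file): drawing from a small categorical distribution should return index 0, 1, or 2, with 2 the most likely outcome.

let probs = [0.1'f32, 0.2'f32, 0.7'f32].toTensor()
echo sample(probs)   # most draws should print 2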

65
gpt.nim Normal file

@ -0,0 +1,65 @@
import std / [ tables, os ]
import random
import arraymancer
import plotly
import progress
import therapist
import ./batcher
import ./hparams
import ./textEncoder
import ./generator

randomize()

let spec = (
  dir: newStringArg(@["-d", "--dir"], defaultVal="defaultDir", help="Directory to save/load from."),
  help: newHelpArg(@["-h", "--help"], help="Show help message"),
)
spec.parseOrQuit("Nimertes")
let dirName = spec.dir.value

var
  bar = newProgressBar(total=numEpochs)
bar.start()

###### Text encoding
let vocabSize: int = stringToInt.len()
# var encodedText: seq[int] = encodeString(textContent, stringToInt)
# ###### Split corpus into training and validation sets #######
# const percentageTraining = 80 # how much % of the corpus is used for training.
# let trainingSetEnd: int = (percentageTraining*encodedText.len/100).int
# let trainingSet: seq[int] = encodedText[0..trainingSetEnd]
# let validationSet: seq[int] = encodedText[trainingSetEnd..textContent.len-1]

###### Define NN
let ctx = newContext Tensor[float32]

# TODO: make Block type for Nimertes
# type Block:
network NimertesGPT:
  layers:
    tokenEmbedder: Embedding(vocabSize, hiddenSize)
    positionEmbedder: Embedding(blockSize, nEmbed)
    # blockLayer: Block(nEmbed,)
    languageModelHead: Linear(nEmbed, vocabSize)
    hiddenLinear: Linear(hiddenSize, hiddenSize)
    outputLayer: Linear(hiddenSize, nEmbed)
  forward x:
    tokenEmbedding = x.tokenEmbedder()
    positionEmbedding = .positionEmbedder()
    x.tokenEmbedding.positionEmbedding.tanh.hiddenLinear.tanh.hiddenLinear.tanh.outputLayer

###### Initialize NN
var
  model = ctx.init(NimertesGPT)

11
hparams.nim Executable file

@ -0,0 +1,11 @@
const
  blockSize*: int = 200
  batchSize*: int = 100
  numEpochs*: int = 1000
  evalIter*: int = 250

### Network architecture params
const
  hiddenSize* = 100
  numLayers* = 4
  nEmbed*: int = 300

90
network.nim Executable file

@ -0,0 +1,90 @@
import arraymancer

#### The following need to be combined gently
let ctx = newContext Tensor[float32]

let
  SINGLETON = 1

type
  LinearLayer = object
    weight: Variable[Tensor[float32]]
    bias: Variable[Tensor[float32]]
  Nimertes = object
    hidden: LinearLayer
    output: LinearLayer

template weightInit(shape: varargs[int], initKind: untyped): Variable =
  ctx.variable(
    initKind(shape, float32),
    requiresGrad = true
  )

proc newNimertesInstance*(ctx: Context[Tensor[float32]], hiddenSize: int, dimIn: int, dimOut: int): Nimertes =
  result.hidden.weight = weightInit(hiddenSize, dimIn, kaimingNormal)
  result.hidden.bias = weightInit(SINGLETON, hiddenSize, kaimingNormal)
  result.output.weight = weightInit(dimOut, hiddenSize, kaimingNormal)
  result.output.bias = weightInit(SINGLETON, dimOut, kaimingNormal)

proc forward*(network: Nimertes, x: Variable[Tensor[float32]]): Variable[Tensor[float32]] =
  result = x.linear(
    network.hidden.weight, network.hidden.bias).relu.linear(network.output.weight, network.output.bias)

proc saveModel*(network: Nimertes) =
  # this is a quick prototype, but you get the idea.
  # perhaps a better way to do this would be to save all weights/biases of
  # the model into a single file.
  network.hidden.weight.value.writeNpy("hiddenweight.npy")
  network.hidden.bias.value.writeNpy("hiddenbias.npy")
  network.output.weight.value.writeNpy("outputweight.npy")
  network.output.bias.value.writeNpy("outputbias.npy")

proc load*(ctx: Context[Tensor[float32]]): Nimertes =
  result.hidden.weight = ctx.variable(readNpy[float32]("hiddenweight.npy"), requiresGrad = true)
  result.hidden.bias = ctx.variable(readNpy[float32]("hiddenbias.npy"), requiresGrad = true)
  result.output.weight = ctx.variable(readNpy[float32]("outputweight.npy"), requiresGrad = true)
  result.output.bias = ctx.variable(readNpy[float32]("outputbias.npy"), requiresGrad = true)

##### Second Way to implement
let
  vocabSize = 64
  hiddenSize = 100
  nLayers = 2

network Nimertes2:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    gru: GRULayer(encoder.out_shape[0], hiddenSize, nLayers)
    decoder: Linear(hiddenSize, vocabSize)
  forward input, hidden0:
    let (output, hiddenN) = input.encoder.gru(hidden0)
    # result.output is of shape [Sequence, BatchSize, HiddenSize]
    # In our case the sequence is 1 so we can simply flatten
    let flattened = output.reshape(output.value.shape[1], hiddenSize)
    (output: flattened.decoder, hidden: hiddenN)

export Nimertes2

#### Third Way
let
  dimIn = vocabSize
  dimOut = vocabSize

network Nimertes3:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    fc1: Linear(dimIn, hiddenSize)
    fc2: Linear(hiddenSize, dimOut)
  forward input, targets, output:
    let
      logits = input.encoder
      (batch, time, channels) = logits.shape()
      reshapeLogits = logits.reshape(batch*time, channels)
      reshapeTargets = targets.reshape(batch*time)
      loss = output.softmax_cross_entropy()
    # x.fc1.relu.fc2

export Nimertes3

49
textEncoder.nim Executable file

@ -0,0 +1,49 @@
import
  std / [setutils, sequtils, tables]

const
  textContent* = readFile("wizard.txt")

########### Text Encoding/Decoding ############
proc createStringTable*(text: string): Table[char, int] =
  let charSet = text.toSet
  var stringToInt = initTable[char, int]()
  for id, glyph in charSet.toSeq:
    stringToInt[glyph] = id
  result = stringToInt

proc createIntTable*(text: string): Table[int, char] =
  let charSet = text.toSet
  var intToString = initTable[int, char]()
  for id, glyph in charSet.toSeq:
    intToString[id] = glyph
  result = intToString

proc encodeString*(str: string, stringToInt: Table[char, int]): seq[int] =
  result = @[]
  for glyph in str:
    result.add(stringToInt[glyph])

proc decodeString*(list: seq[int], intToString: Table[int, char]): string =
  result = ""
  for item in list:
    result.add(intToString[item])

proc decodeString*(letter: int, intToString: Table[int, char]): char =
  result = intToString[letter]

let
  stringToInt*: Table[char, int] = createStringTable(textContent)
  intToString*: Table[int, char] = createIntTable(textContent)

5603
wizard.txt Executable file

File diff suppressed because it is too large