Move project to nyrid melite

This commit is contained in:
aethrvmn 2024-10-23 00:26:05 +02:00
commit 3cd93f27b5
11 changed files with 6073 additions and 0 deletions

3
.gitignore vendored Executable file

@ -0,0 +1,3 @@
nimcache/
nimblecache/
htmldocs/

61
LICENSE Normal file

@ -0,0 +1,61 @@
Don't Be Evil License (DBEL) 1.0
1. Acceptance
By using, copying, modifying, or distributing the source code, training data, training environment, or its associated machine learning model weights (collectively the "Software"), you agree to comply with all terms outlined in this license.
2. Copyright License
The Licensor (defined below) grants you a non-exclusive, worldwide, royalty-free, non-sublicensable, non-transferable license to use, copy, modify, and distribute the Software, including associated model weights, training data, and training environments, subject to the conditions set forth in this license. This includes the right to create and distribute derivative works of the Software, provided that the limitations below are observed.
3. Non-Commercial Use Only
You may use, copy, modify, and distribute the Software and derivative works solely for non-commercial purposes. Non-commercial purposes include, but are not limited to:
- Personal research and study.
- Educational and academic projects.
- Public knowledge and hobby projects.
- Religious observance.
- Non-commercial research, or AI and machine learning (ML) experimentation.
4. Distribution and Monetization Provisions
Any use of the Software or derivative works for profit, or in a business context, including in monetized services and products, requires explicit, separate permission from the Licensor. The restrictions on commercial use apply to both the source code and any model weights produced by the Software.
Any distribution must include this license, and the non-commercial restriction must be maintained. Weights resulting from use of the Software, including but not limited to training or fine-tuning models, must be shared under this same license, ensuring all restrictions and conditions are preserved.
5. Integrity of the Licensor's Software
You may not alter, remove, or obscure any functionalities related to payment, donation, or attribution in any distributed version of the Licensed Materials. You must retain all notices of copyright, licensing, and attribution provided by the Licensor in any derivative works.
You may not alter or remove copyright, license, or trademark notices in the Software, and any public mention of the Software must include attribution to the Licensor.
6. Patents
This license grants you a patent license under any patents held by the Licensor that are directly related to the Software. If you or your company make any claim that the Software infringes on a patent, your rights under this license terminate immediately.
7. Distribution of Modifications
If you modify the Software, you must:
- Provide prominent and clear notice of any modifications
- Retain all original notices of copyright, licensing, and attribution to the Licensor.
- Distribute modified versions under this license.
8. Fair Use
Nothing under this license restricts your rights under applicable laws regarding fair use of copyrighted material.
9. No Other Rights
These terms do not allow you to sublicense, assign, or transfer any of your rights to third parties, except as expressly allowed by the terms.
These terms do not prevent the Licensor from granting licenses to anyone else.
These terms do not imply any other licenses.
No other rights beyond those explicitly stated are granted.
10. Termination
Your rights under this license will automatically terminate if you breach any of its terms. The Licensor may provide you with a 30-day period to rectify any breach. If you fail to do so, or if you breach the terms again after rectification, your license will terminate permanently.
11. Disclaimer of Warranty
The Licensed Materials are provided “as-is”, without any warranties, express or implied, including but not limited to warranties of fitness for a particular purpose. The Licensor is not liable for any claims or damages arising from your use of the Licensed Materials.
12. Definitions
- "Licensor": The entity or individual offering the Licensed Materials under this license.
- "Licensed Materials": The software, source code, training data, training environment, model weights, and any associated AI/ML components provided under this license.
- "You": The individual or entity accepting the terms of this license, including any organization or entity that this individual or entity might work for or represent, including any entities under common control.
- "Your license": The license granted to you for the software under this terms.
- "Model weights": The machine learning model parameters generated by training or fine-tuning models using the Licensed Materials.
- "Use": Anything you do with the software requiring your license
- "Trademark": Trademarks, service marks, and similar rights.

3
README.md Executable file

@ -0,0 +1,3 @@
# melite
an exploration of NLP bigram and n-gram models in Nim, written to learn both the language and NLP
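
(Not part of the repository; a minimal sketch, using only the Nim standard library, of what "bigram model" means here: counting how often each pair of adjacent characters occurs in a corpus. The proc name and toy corpus are made up for illustration.)

import std/tables

# Count adjacent character pairs ("bigrams") in a corpus string.
proc bigramCounts(text: string): CountTable[string] =
  result = initCountTable[string]()
  for i in 0 ..< text.len - 1:
    result.inc(text[i..i+1])

when isMainModule:
  let counts = bigramCounts("the theory of the thing")
  echo counts["th"]   # how often 't' is immediately followed by 'h'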

34
batcher.nim Executable file

@ -0,0 +1,34 @@
import hparams
import arraymancer

### CPU Part Starts Here
# var trainingBlock: seq[int] = trainingSet[0..blockSize]
# var trainingBlockNext: seq[int] = trainingSet[1..blockSize+1]
# for i in 0..blockSize-1:
#   var context = trainingBlock[0..i+1]
#   var target = trainingBlockNext[i]
#   echo "when input is ", context, " target is ", target
#[
The above is done sequentially on the CPU as a baseline, since I can't afford a GPU.
Below is the batched implementation intended for the GPU. We can (and probably will) run it
on the CPU as well, but Arraymancer can target the device at compile time with a flag
(-d:cuda), so we don't need PyTorch-style .to_device('cuda') calls. More testing is
definitely needed.
]#

proc getBatch*(split: string, trainingSet: seq[int], validationSet: seq[int]): (Tensor[int], Tensor[int]) =
  # Pick the split to sample from.
  var data: seq[int]
  if split == "train":
    data = trainingSet
  else:
    data = validationSet
  # batchSize random starting offsets, each leaving room for a full block.
  let ix = randomTensor(shape=[batchSize], max=len(data)-blockSize)
  # Seed x/y with the first sampled block so concat has a tensor to grow.
  var
    x: Tensor[int] = [data[ix[0]..<ix[0]+blockSize-1]].toTensor()
    y: Tensor[int] = [data[ix[0]+1..<ix[0]+blockSize]].toTensor()
  # Stack the remaining blocks row by row; y is x shifted one step ahead.
  for i in ix[1..len(ix)-1]:
    x = x.concat([data[i..<i+blockSize-1]].toTensor(), axis=0)
    y = y.concat([data[i+1..<i+blockSize]].toTensor(), axis=0)
  result = (x, y)
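
A rough usage sketch (not part of this file), assuming trainingSet and validationSet already exist as encoded seq[int] corpora, as they do in bigram.nim:

let (xb, yb) = getBatch("train", trainingSet, validationSet)
echo xb.shape              # expected: [batchSize, blockSize - 1]
echo yb.shape              # expected: [batchSize, blockSize - 1]
echo xb[0, 1] == yb[0, 0]  # each target row should be its input row shifted by one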

133
bigram.nim Executable file

@ -0,0 +1,133 @@
import std / [ tables, os, strformat ]
import random
import arraymancer
import ./batcher
import ./hparams
import ./generator
import ./textEncoder

randomize()

###### Text encoding
let vocabSize: int = stringToInt.len()
var encodedText: seq[int] = encodeString(textContent, stringToInt)

###### Split corpus into training and validation sets #######
const percentageTraining = 80 # how much % of the corpus is used for training.
let trainingSetEnd: int = (percentageTraining*encodedText.len/100).int
let trainingSet: seq[int] = encodedText[0..trainingSetEnd]
let validationSet: seq[int] = encodedText[trainingSetEnd..textContent.len-1]

###### Define NN
let ctx = newContext Tensor[float32]

network Nimertes:
  layers:
    encoder: Embedding(vocabSize, hiddenSize)
    hiddenLinear: Linear(hiddenSize, hiddenSize)
    outputLayer: Linear(hiddenSize, vocabSize)
  forward x:
    x.encoder.tanh.hiddenLinear.tanh.hiddenLinear.tanh.outputLayer

###### Save/Load Model
proc saveModel(ctx: Context[AnyTensor[float32]], model: Nimertes, dir: string) =
  echo "\nsaving model..."
  # Walk every layer and dump each weight/bias tensor to its own .npy file.
  for layer, layerField in model.fieldPairs:
    var layerName = layer
    for field, tensorVariable in layerField.fieldPairs:
      var fieldName = field
      when tensorVariable is Variable[Tensor[float32]]:
        tensorVariable.value.writeNPY(dir/fmt"{layerName}_{fieldName}.npy")
      else:
        discard
  echo "model saved"

proc initModel(ctx: Context[AnyTensor[float32]], model: Nimertes, dir: string): Nimertes =
  echo "\nweights exist"
  echo "\nloading model..."
  for layer, _ in model.fieldPairs:
    var layerName = layer
    case layerName
    of "encoder":
      model.encoder.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
    of "hiddenLinear":
      model.hiddenLinear.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
      model.hiddenLinear.bias.value = readNPY[float32](dir/fmt"{layerName}_bias.npy")
    of "outputLayer":
      model.outputLayer.weight.value = readNPY[float32](dir/fmt"{layerName}_weight.npy")
      model.outputLayer.bias.value = readNPY[float32](dir/fmt"{layerName}_bias.npy")
    else:
      discard
  echo "model loaded\n"
  return model

#### Initialize NN
var
  model = ctx.init(Nimertes)
  optim = model.optimizer(Adam, learningRate=3e-4'f32, beta1=0.9'f32, beta2=0.9'f32, eps=1e-5'f32)

if fileExists("tinyBiGram/encoder_weight.npy"):
  model = ctx.initModel(model, "tinyBiGram")

###### Generate Text
proc generateText(ctx: Context[AnyTensor[float32]], model: Nimertes, seedCharacters="Wh", seqLen=blockSize, temperature=0.8'f32): string =
  ctx.no_grad_mode:
    let primer = encodeString(seedCharacters, stringToInt).toTensor.unsqueeze(1)
    result = seedCharacters
    var
      input = primer[^1, _]
      output: Variable[Tensor[float32]]
    for _ in 0 ..< seqLen:
      output = model.forward(input.squeeze(0))
      var preds = output.value
      # Temperature scaling before the softmax.
      preds /.= temperature
      let probs = preds.softmax().squeeze(0)
      # Sample and append to result
      let encodedChar = probs.sample()
      result &= decodeString(encodedChar, intToString)
      input = newTensor[int](1, 1)
      input[0, 0] = encodedChar

###### Training
var totalLoss: seq[float]
var plotidx: seq[float]

for i in 0..numEpochs:
  var
    (trainingBatch, trainingBatchNext) = getBatch("train", trainingSet, validationSet)
    output: Variable[Tensor[float32]]
    batchLoss: Variable[Tensor[float32]]
  if i mod evalIter == 0:
    # Periodically sample some text and checkpoint the weights.
    echo "\n", ctx.generateText(model), "\n"
    ctx.saveModel(model, "tinyBiGram")
  else:
    for j in 0 ..< batchSize:
      var
        inputTensor: Tensor[int] = trainingBatch[j, _]
        targetTensor: Tensor[int] = trainingBatchNext[j, _]
      output = model.forward(inputTensor.squeeze(0))
      batchLoss = output.sparseSoftmaxCrossEntropy(target=targetTensor.squeeze(0))
      batchLoss.backprop()
      optim.update()
    totalLoss.add(batchLoss.value[0])
    plotidx.add(i.float)

###### Plot results and show final output
echo ctx.generateText(model)

21
generator.nim Executable file

@ -0,0 +1,21 @@
import std/algorithm
import random
import arraymancer

proc searchsorted[T](x: openArray[T], value: T, leftSide: static bool = true): int =
  # Index at which `value` would be inserted to keep `x` sorted.
  when leftSide:
    result = x.lowerBound(value)
  else:
    result = x.upperBound(value)

proc sample*(probs: Tensor[float32]): int =
  # Inverse-CDF sampling: draw u ~ U(0, 1) and find where it lands in the
  # cumulative distribution of `probs`.
  var
    rng = initRand()
  let
    u = rng.rand(1.0'f32)
    cdf = cumsum(probs, axis=0)
    cdfA = cast[ptr UncheckedArray[float32]](cdf.unsafeRawOffset)
  result = cdfA.toOpenArray(0, cdf.size-1).searchsorted(u, leftSide=false)
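
A hedged usage sketch (not part of this file): drawing from a small categorical distribution should return index 0, 1, or 2, with 2 the most likely outcome.

let probs = [0.1'f32, 0.2'f32, 0.7'f32].toTensor()
echo sample(probs)   # most draws should print 2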

65
gpt.nim Normal file

@ -0,0 +1,65 @@
import std / [ tables, os ]
import random
import arraymancer
import plotly
import progress
import therapist
import ./batcher
import ./hparams
import ./textEncoder
import ./generator

randomize()

let spec = (
  dir: newStringArg(@["-d", "--dir"], defaultVal="defaultDir", help="Directory to save/load from."),
  help: newHelpArg(@["-h", "--help"], help="Show help message"),
)
spec.parseOrQuit("Nimertes")
let dirName = spec.dir.value

var
  bar = newProgressBar(total=numEpochs)
bar.start()

###### Text encoding
let vocabSize: int = stringToInt.len()
# var encodedText: seq[int] = encodeString(textContent, stringToInt)
# ###### Split corpus into training and validation sets #######
# const percentageTraining = 80 # how much % of the corpus is used for training.
# let trainingSetEnd: int = (percentageTraining*encodedText.len/100).int
# let trainingSet: seq[int] = encodedText[0..trainingSetEnd]
# let validationSet: seq[int] = encodedText[trainingSetEnd..textContent.len-1]

###### Define NN
let ctx = newContext Tensor[float32]

# TODO: make Block type for Nimertes
# type Block:
network NimertesGPT:
  layers:
    tokenEmbedder: Embedding(vocabSize, hiddenSize)
    positionEmbedder: Embedding(blockSize, nEmbed)
    # blockLayer: Block(nEmbed,)
    languageModelHead: Linear(nEmbed, vocabSize)
    hiddenLinear: Linear(hiddenSize, hiddenSize)
    outputLayer: Linear(hiddenSize, nEmbed)
  forward x:
    tokenEmbedding = x.tokenEmbedder()
    positionEmbedding = .positionEmbedder()
    x.tokenEmbedding.positionEmbedding.tanh.hiddenLinear.tanh.hiddenLinear.tanh.outputLayer

###### Initialize NN
var
  model = ctx.init(NimertesGPT)

11
hparams.nim Executable file

@ -0,0 +1,11 @@
const
  blockSize*: int = 200
  batchSize*: int = 100
  numEpochs*: int = 1000
  evalIter*: int = 250

### Network architecture params
const
  hiddenSize* = 100
  numLayers* = 4
  nEmbed*: int = 300

90
network.nim Executable file

@ -0,0 +1,90 @@
import arraymancer

#### The following need to be combined gently
let ctx = newContext Tensor[float32]

let
  SINGLETON = 1

type
  LinearLayer = object
    weight: Variable[Tensor[float32]]
    bias: Variable[Tensor[float32]]
  Nimertes = object
    hidden: LinearLayer
    output: LinearLayer

template weightInit(shape: varargs[int], initKind: untyped): Variable =
  ctx.variable(
    initKind(shape, float32),
    requiresGrad = true
  )

proc newNimertesInstance*(ctx: Context[Tensor[float32]], hiddenSize: int, dimIn: int, dimOut: int): Nimertes =
  result.hidden.weight = weightInit(hiddenSize, dimIn, kaimingNormal)
  result.hidden.bias = weightInit(SINGLETON, hiddenSize, kaimingNormal)
  result.output.weight = weightInit(dimOut, hiddenSize, kaimingNormal)
  result.output.bias = weightInit(SINGLETON, dimOut, kaimingNormal)

proc forward*(network: Nimertes, x: Variable[Tensor[float32]]): Variable[Tensor[float32]] =
  result = x.linear(
    network.hidden.weight, network.hidden.bias).relu.linear(network.output.weight, network.output.bias)

proc saveModel*(network: Nimertes) =
  # this is a quick prototype, but you get the idea.
  # perhaps a better way to do this would be to save all weights/biases of
  # the model into a single file.
  network.hidden.weight.value.writeNpy("hiddenweight.npy")
  network.hidden.bias.value.writeNpy("hiddenbias.npy")
  network.output.weight.value.writeNpy("outputweight.npy")
  network.output.bias.value.writeNpy("outputbias.npy")

proc load*(ctx: Context[Tensor[float32]]): Nimertes =
  result.hidden.weight = ctx.variable(readNpy[float32]("hiddenweight.npy"), requiresGrad = true)
  result.hidden.bias = ctx.variable(readNpy[float32]("hiddenbias.npy"), requiresGrad = true)
  result.output.weight = ctx.variable(readNpy[float32]("outputweight.npy"), requiresGrad = true)
  result.output.bias = ctx.variable(readNpy[float32]("outputbias.npy"), requiresGrad = true)

##### Second Way to implement
let
  vocabSize = 64
  hiddenSize = 100
  nLayers = 2

network Nimertes2:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    gru: GRULayer(encoder.out_shape[0], hiddenSize, nLayers)
    decoder: Linear(hiddenSize, vocabSize)
  forward input, hidden0:
    let (output, hiddenN) = input.encoder.gru(hidden0)
    # result.output is of shape [Sequence, BatchSize, HiddenSize]
    # In our case the sequence is 1 so we can simply flatten
    let flattened = output.reshape(output.value.shape[1], hiddenSize)
    (output: flattened.decoder, hidden: hiddenN)

export Nimertes2

#### Third Way
let
  dimIn = vocabSize
  dimOut = vocabSize

network Nimertes3:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    fc1: Linear(dimIn, hiddenSize)
    fc2: Linear(hiddenSize, dimOut)
  forward input, targets, output:
    let
      logits = input.encoder
      (batch, time, channels) = logits.shape()
      reshapeLogits = logits.reshape(batch*time, channels)
      reshapeTargets = targets.reshape(batch*time)
      loss = output.softmax_cross_entropy()
    # x.fc1.relu.fc2

export Nimertes3

49
textEncoder.nim Executable file

@ -0,0 +1,49 @@
import
  std / [setutils, sequtils, tables]

const
  textContent* = readFile("wizard.txt")

########### Text Encoding/Decoding ############
proc createStringTable*(text: string): Table[char, int] =
  let charSet = text.toSet
  var stringToInt = initTable[char, int]()
  for id, glyph in charSet.toSeq:
    stringToInt[glyph] = id
  result = stringToInt

proc createIntTable*(text: string): Table[int, char] =
  let charSet = text.toSet
  var intToString = initTable[int, char]()
  for id, glyph in charSet.toSeq:
    intToString[id] = glyph
  result = intToString

proc encodeString*(str: string, stringToInt: Table[char, int]): seq[int] =
  result = @[]
  for glyph in str:
    result.add(stringToInt[glyph])

proc decodeString*(list: seq[int], intToString: Table[int, char]): string =
  result = ""
  for item in list:
    result.add(intToString[item])

proc decodeString*(letter: int, intToString: Table[int, char]): char =
  result = intToString[letter]

let
  stringToInt*: Table[char, int] = createStringTable(textContent)
  intToString*: Table[int, char] = createIntTable(textContent)

5603
wizard.txt Executable file

File diff suppressed because it is too large