import arraymancer

#### First way: the following pieces still need to be combined gently.

let ctx = newContext Tensor[float32]
let SINGLETON = 1

type
  LinearLayer = object
    weight: Variable[Tensor[float32]]
    bias: Variable[Tensor[float32]]

  Nimertes = object
    hidden: LinearLayer
    output: LinearLayer

template weightInit(shape: varargs[int], initKind: untyped): Variable =
  ctx.variable(
    initKind(shape, float32),
    requiresGrad = true
  )

proc newNimertesInstance*(ctx: Context[Tensor[float32]],
                          hiddenSize, dimIn, dimOut: int): Nimertes =
  result.hidden.weight = weightInit(hiddenSize, dimIn, kaimingNormal)
  result.hidden.bias   = weightInit(SINGLETON, hiddenSize, kaimingNormal)
  result.output.weight = weightInit(dimOut, hiddenSize, kaimingNormal)
  result.output.bias   = weightInit(SINGLETON, dimOut, kaimingNormal)

proc forward*(network: Nimertes, x: Variable[Tensor[float32]]): Variable[Tensor[float32]] =
  # Two linear layers with a ReLU in between.
  let hiddenActivation = x.linear(network.hidden.weight, network.hidden.bias).relu
  result = hiddenActivation.linear(network.output.weight, network.output.bias)

proc saveModel*(network: Nimertes) =
  # This is a quick prototype, but you get the idea.
  # A better approach would be to save all weights/biases of the model
  # into a single file.
  network.hidden.weight.value.writeNpy("hiddenweight.npy")
  network.hidden.bias.value.writeNpy("hiddenbias.npy")
  network.output.weight.value.writeNpy("outputweight.npy")
  network.output.bias.value.writeNpy("outputbias.npy")

proc load*(ctx: Context[Tensor[float32]]): Nimertes =
  result.hidden.weight = ctx.variable(readNpy[float32]("hiddenweight.npy"), requiresGrad = true)
  result.hidden.bias   = ctx.variable(readNpy[float32]("hiddenbias.npy"), requiresGrad = true)
  result.output.weight = ctx.variable(readNpy[float32]("outputweight.npy"), requiresGrad = true)
  result.output.bias   = ctx.variable(readNpy[float32]("outputbias.npy"), requiresGrad = true)

#### Second way to implement it

let
  vocabSize = 64
  hiddenSize = 100
  nLayers = 2

network Nimertes2:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    gru: GRULayer(encoder.out_shape[0], hiddenSize, nLayers)
    decoder: Linear(hiddenSize, vocabSize)
  forward input, hidden0:
    let (output, hiddenN) = input.encoder.gru(hidden0)
    # `output` is of shape [Sequence, BatchSize, HiddenSize].
    # In our case the sequence length is 1, so we can simply flatten it.
    let flattened = output.reshape(output.value.shape[1], hiddenSize)
    (output: flattened.decoder, hidden: hiddenN)

export Nimertes2

#### Third way

let
  dimIn = vocabSize
  dimOut = vocabSize

network Nimertes3:
  layers:
    encoder: Embedding(vocabSize, vocabSize)
    fc1: Linear(dimIn, hiddenSize)
    fc2: Linear(hiddenSize, dimOut)
  forward input, targets:
    let
      logits = input.encoder  # [Batch, Time, Channels]
      (batch, time, channels) = (logits.value.shape[0],
                                 logits.value.shape[1],
                                 logits.value.shape[2])
      reshapeLogits = logits.reshape(batch * time, channels)
      reshapeTargets = targets.reshape(batch * time)
    # The fully connected path (x.fc1.relu.fc2) is not wired in yet.
    # Return the cross-entropy between the flattened logits and the targets.
    reshapeLogits.softmax_cross_entropy(reshapeTargets)

export Nimertes3
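
#### Example usage of the first implementation (a minimal sketch).
# Everything below is illustrative and not part of the prototype above:
# the sizes, the random dummy data, and the `newSGD`/`sigmoid_cross_entropy`
# calls (in the style of Arraymancer's FizzBuzz example) are assumptions.

when isMainModule:
  let
    model = ctx.newNimertesInstance(hiddenSize = 16, dimIn = 8, dimOut = 4)
    x = ctx.variable(randomTensor(32, 8, 1.0'f32))  # dummy [batch, dimIn] input
    y = randomTensor(32, 4, 1.0'f32)                # dummy [batch, dimOut] target

  # Plain SGD over the four parameter tensors of the model.
  let optim = newSGD[float32](
    model.hidden.weight, model.hidden.bias,
    model.output.weight, model.output.bias,
    0.01'f32
  )

  for _ in 0 ..< 100:
    let
      pred = model.forward(x)
      loss = pred.sigmoid_cross_entropy(y)
    loss.backprop()
    optim.update()

  # Round-trip the parameters through the .npy files written by saveModel.
  model.saveModel()
  let restored = ctx.load()
  echo restored.forward(x).value.shape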