import random

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

class Words(object):
    """Hold a list of words together with the ordered set of characters they use.

    Attributes set by ``__init__``:
        filename: path the words were read from.
        words: one entry per line of the file.
        nb_words: number of entries in ``words``.
        chars: sorted distinct characters found in ``words``.
        nb_chars: ``len(chars) + 1`` (one extra slot for ``EOS``).
        ctoi: character -> integer index; ``EOS`` maps to 0, the characters of
            ``chars`` map to 1..len(chars).
        itoc: inverse mapping of ``ctoi``.
    """

    # End-of-sequence marker, always encoded as index 0.
    # NOTE(review): if EOS itself occurs in the word list, its entry in
    # ``ctoi`` is overwritten with 0 below — confirm the data never contains it.
    EOS = '.'

    def __init__(self, filename):
        """Read ``filename`` (one word per line) and build the vocabulary."""
        self.filename = filename
        # The context manager closes the handle deterministically; the explicit
        # encoding keeps accented characters identical on every platform.
        with open(self.filename, 'r', encoding='utf-8') as handle:
            self.words = handle.read().splitlines()
        self.nb_words = len(self.words)
        self.chars = sorted(set(''.join(self.words)))
        self.nb_chars = len(self.chars) + 1  # +1 for EOS
        self.ctoi = {c: i + 1 for i, c in enumerate(self.chars)}
        self.ctoi[self.EOS] = 0
        self.itoc = {i: s for s, i in self.ctoi.items()}

    def __repr__(self):
        """Return a multi-line, XML-like summary of the instance."""
        lines = [
            "<Words",
            f'  filename="{self.filename}"',
            f'  nb_words="{self.nb_words}"',
            f'  nb_chars="{self.nb_chars}"/>',
        ]
        return '\n'.join(lines)

# Build the vocabulary from the word-list file and print its summary.
words = Words('civil_mots.txt')
print(words)

<Words
  filename="civil_mots.txt"
  nb_words="7223"
  nb_chars="41"/>

class Datasets:
    """Build the training, dev (validation) and test datasets.

    Takes a ``Words`` instance and the context size used for prediction. The
    words are shuffled and split 80% / 10% / 10%; every split is turned into a
    pair of tensors ``(X, Y)`` where each row of ``X`` holds the indices of the
    ``context_size`` preceding characters and ``Y`` the index of the character
    to predict.
    """

    def _build_dataset(self, lwords:list, context_size:int):
        """Encode ``lwords`` as ``(X, Y)`` index tensors.

        Every word is terminated with ``EOS`` and contexts are left-padded
        with index 0.
        """
        eos = self.words.EOS
        ctoi = self.words.ctoi
        X, Y = [], []
        for w in lwords:
            context = [0] * context_size
            for ch in w + eos:
                ix = ctoi[ch]
                X.append(context)
                Y.append(ix)
                context = context[1:] + [ix]  # slide the window by one character
        # Explicit dtype and shape so that an empty split still produces
        # integer tensors of shape (0, context_size) and (0,).
        X = torch.tensor(X, dtype=torch.long).reshape(len(X), context_size)
        Y = torch.tensor(Y, dtype=torch.long)
        return X, Y

    def __init__(self, words:"Words", context_size:int, seed:int=42):
        """Shuffle the words and build the three splits.

        Args:
            words: vocabulary object providing ``words``, ``ctoi`` and ``EOS``.
            context_size: number of preceding characters used as input.
            seed: seed of the private random generator used for the shuffle,
                so that the split is reproducible and independent of the
                global ``random`` state.
        """
        # 80%, 10%, 10%
        self.shuffled_words = words.words.copy()
        random.Random(seed).shuffle(self.shuffled_words)
        self.n1 = int(0.8*len(self.shuffled_words))
        self.n2 = int(0.9*len(self.shuffled_words))
        self.words = words
        self.Xtr, self.Ytr = self._build_dataset(self.shuffled_words[:self.n1], context_size)
        self.Xdev, self.Ydev = self._build_dataset(self.shuffled_words[self.n1:self.n2], context_size)
        self.Xte, self.Yte = self._build_dataset(self.shuffled_words[self.n2:], context_size)

# Number of preceding characters used to predict the next one.
context_size = 3
# Shuffle the words and build the train / dev / test tensors.
datasets = Datasets(words, context_size)

class BengioFFN:
    """Character-level feed-forward network: embedding -> tanh hidden layer -> logits.

    Parameters are plain tensors created by ``layers()`` and trained with a
    hand-written SGD loop in ``train()``. ``layers``, ``create_network``,
    ``forward``, ``compute_loss`` and ``generate_word`` are separate methods so
    they can be replaced individually.
    """

    def __init__(self, e_dims, n_hidden, context_size, nb_chars, g):
        """Store the hyper-parameters and build the network.

        Args:
            e_dims: size of each character embedding.
            n_hidden: number of units of the hidden layer.
            context_size: number of preceding characters fed to the network.
            nb_chars: vocabulary size (rows of the embedding table and number
                of output logits).
            g: ``torch.Generator`` used for parameter initialisation and for
                mini-batch sampling in ``train()``.
        """
        self.g = g
        self.nb_chars = nb_chars
        self.e_dims = e_dims
        self.n_hidden = n_hidden
        self.context_size = context_size
        self.create_network()

    def layers(self):
        """Create the parameter tensors, all drawn from a standard normal."""
        self.C = torch.randn((self.nb_chars, self.e_dims), generator=self.g)
        self.W1 = torch.randn((self.context_size * self.e_dims, self.n_hidden), generator=self.g)
        self.b1 = torch.randn(self.n_hidden, generator=self.g)
        self.W2 = torch.randn((self.n_hidden, self.nb_chars), generator=self.g)
        self.b2 = torch.randn(self.nb_chars, generator=self.g)

    def create_network(self):
        """Create the layers, reset the training state and enable gradients."""
        self.layers()
        self.loss = None
        self.steps = 0
        self.parameters = [self.C, self.W1, self.b1, self.W2, self.b2]
        self.nb_parameters = sum(p.nelement() for p in self.parameters) # number of parameters in total
        for p in self.parameters:
            p.requires_grad = True

    def forward(self, X, Y):
        """Run a forward pass on ``X`` and store the cross-entropy loss against ``Y``.

        Intermediate results (``emb``, ``embcat``, ``hpreact``, ``h``,
        ``logits``) and ``loss`` are kept on the instance.
        """
        self.emb = self.C[X] # Embed characters into vectors
        self.embcat = self.emb.view(self.emb.shape[0], -1) # Concatenate the vectors
        self.hpreact = self.embcat @ self.W1 + self.b1 # hidden layer pre-activation
        self.h = torch.tanh(self.hpreact) # hidden layer
        self.logits = self.h @ self.W2 + self.b2 # output layer
        self.loss = F.cross_entropy(self.logits, Y) # loss function

    def backward(self):
        """Clear the previous gradients and back-propagate the stored loss."""
        for p in self.parameters:
            p.grad = None
        self.loss.backward()

    def train(self, datasets: "Datasets", max_steps, mini_batch_size):
        """Run ``max_steps`` SGD steps on random mini-batches of the training set.

        The learning rate is 0.2 for the first 100000 steps of the call and
        0.02 afterwards. The current loss is printed every 10000 steps.

        Returns:
            list with the base-10 logarithm of the loss of every step.
        """
        lossi = []
        for i in range(max_steps):
            # minibatch construct
            ix = torch.randint(0, datasets.Xtr.shape[0], (mini_batch_size,), generator=self.g)
            Xb, Yb = datasets.Xtr[ix], datasets.Ytr[ix]

            # forward pass
            self.forward(Xb, Yb)

            # backward pass
            self.backward()

            # update
            lr = 0.2 if i < 100000 else 0.02 # step learning rate decay
            self.update_grad(lr)

            # track stats
            if i % 10000 == 0:
                print(f"{i:7d}/{max_steps:7d}: {self.loss.item():.4f}")
            lossi.append(self.loss.log10().item())
        self.steps += max_steps
        return lossi

    def update_grad(self, lr):
        """Apply one plain SGD update with learning rate ``lr``."""
        for p in self.parameters:
            p.data += -lr * p.grad

    @torch.no_grad() # this decorator disables gradient tracking
    def compute_loss(self, X, Y):
        """Return the cross-entropy loss of the network on ``(X, Y)`` as a tensor."""
        emb = self.C[X] # Embed characters into vectors
        embcat = emb.view(emb.shape[0], -1) # Concatenate the vectors
        hpreact = embcat @ self.W1 + self.b1 # hidden layer pre-activation
        h = torch.tanh(hpreact) # hidden layer
        logits = h @ self.W2 + self.b2 # output layer
        loss = F.cross_entropy(logits, Y) # loss function
        return loss

    @torch.no_grad() # this decorator disables gradient tracking
    def training_loss(self, datasets:"Datasets"):
        """Return the loss on the whole training split as a Python float."""
        loss = self.compute_loss(datasets.Xtr, datasets.Ytr)
        return loss.item()

    @torch.no_grad() # this decorator disables gradient tracking
    def test_loss(self, datasets:"Datasets"):
        """Return the loss on the whole test split as a Python float."""
        loss = self.compute_loss(datasets.Xte, datasets.Yte)
        return loss.item()

    @torch.no_grad() # this decorator disables gradient tracking
    def dev_loss(self, datasets:"Datasets"):
        """Return the loss on the whole dev split as a Python float."""
        # Targets are Ydev (the inputs Xdev were previously passed twice).
        loss = self.compute_loss(datasets.Xdev, datasets.Ydev)
        return loss.item()

    @torch.no_grad()
    def generate_word(self, itoc, g):
        """Sample characters until index 0 (end of word) is drawn.

        Args:
            itoc: mapping from integer index to character.
            g: ``torch.Generator`` used for sampling.

        Returns:
            the generated word, without the end marker.
        """
        out = []
        context = [0] * self.context_size
        while True:
            emb = self.C[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ self.W1 + self.b1)
            logits = h @ self.W2 + self.b2
            probs = F.softmax(logits, dim=1)
            # Sample from the probability distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # Shift the context window
            context = context[1:] + [ix]
            # Store the generated character
            if ix != 0:
                out.append(ix)
            else:
                # Stop when encountering the end marker (index 0)
                break
        return ''.join(itoc[i] for i in out)

    def __repr__(self):
        """Return a multi-line, XML-like summary of the hyper-parameters and state."""
        lines = [
            "<BengioMLP",
            f'  nb_chars="{self.nb_chars}"',
            f'  e_dims="{self.e_dims}"',
            f'  n_hidden="{self.n_hidden}"',
            f'  context_size="{self.context_size}"',
            f'  loss="{self.loss}"',
            f'  steps="{self.steps}"',
            f'  nb_parameters="{self.nb_parameters}"/>',
        ]
        return '\n'.join(lines)

e_dims = 10  # Size of the character embeddings
n_hidden = 200  # Number of units in the hidden layer
seed = 2147483647
# Dedicated generator so that initialisation and mini-batch sampling are reproducible.
g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
print(nn)

<BengioMLP
  nb_chars="41"
  e_dims="10"
  n_hidden="200"
  context_size="3"
  loss="None"
  steps="0"
  nb_parameters="14851"/>

# Train the network; lossi receives the log10 of the loss of every step.
max_steps = 200000
mini_batch_size = 32
lossi = nn.train(datasets, max_steps, mini_batch_size)

      0/ 200000: 30.4901
  10000/ 200000: 2.1505
  20000/ 200000: 1.5062
  30000/ 200000: 1.9053
  40000/ 200000: 1.8812
  50000/ 200000: 2.8604
  60000/ 200000: 2.1297
  70000/ 200000: 1.6733
  80000/ 200000: 2.1570
  90000/ 200000: 1.7283
 100000/ 200000: 1.9989
 110000/ 200000: 1.5019
 120000/ 200000: 1.2979
 130000/ 200000: 1.5895
 140000/ 200000: 1.6045
 150000/ 200000: 1.8367
 160000/ 200000: 1.7854
 170000/ 200000: 1.1291
 180000/ 200000: 1.7075
 190000/ 200000: 1.4321

print(nn)

<BengioMLP
  nb_chars="41"
  e_dims="10"
  n_hidden="200"
  context_size="3"
  loss="1.25055730342865"
  steps="200000"
  nb_parameters="14851"/>

plt.plot(lossi);

[<matplotlib.lines.Line2D at 0x11eec5be0>]

# Loss on the full training split.
train_loss = nn.training_loss(datasets)
# NOTE(review): despite its name, val_loss is computed with test_loss(), i.e.
# on the test split, not on the dev/validation split.
val_loss = nn.test_loss(datasets)
print(f"{train_loss=}")
print(f"{val_loss=}")

train_loss=1.4996627569198608
val_loss=1.7031011581420898

# Sample 20 words from the trained network, using a separately seeded generator.
g = torch.Generator().manual_seed(2147483647 + 10)
for _ in range(20):
    word = nn.generate_word(words.itoc, g)
    print(word)

aveilles
saienti
pénies
ints
apprés
précieursulte
expulsé
pournile
achement
citressort
résignemention
moyés
soumonial
hement
crivéalis
memblateurs
résent
qu'aucontribunifiencé
prendu
dété

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, 1, mini_batch_size)

      0/      1: 30.4901

1/41

0.024390243902439025

-torch.tensor(1/41.0).log().item()

3.7135720252990723

def comp_loss(l:list, item:int):
    """Return ``(logits, probs, loss)`` for the class ``item``.

    ``logits`` is ``l`` converted to a tensor, ``probs`` its softmax, and
    ``loss`` the negative log-probability of ``item`` as a Python float.
    """
    logits = torch.tensor(l)
    probs = logits.softmax(dim=0)
    nll = -torch.log(probs[item])
    return logits, probs, nll.item()

# Uniform distributions: identical logits give the same loss whatever their common value
print(comp_loss([0.0, 0.0, 0.0, 0.0], 2))
print(comp_loss([1.0, 1.0, 1.0, 1.0], 2))

(tensor([0., 0., 0., 0.]), tensor([0.2500, 0.2500, 0.2500, 0.2500]), 1.3862943649291992)
(tensor([1., 1., 1., 1.]), tensor([0.2500, 0.2500, 0.2500, 0.2500]), 1.3862943649291992)

# Very low loss because the selected item is the one with the highest logit
print(comp_loss([0.0, 0.0, 10.0, 0.0], 2))
print(comp_loss([0.0, 0.0, 10000.0, 0.0], 2))

(tensor([ 0.,  0., 10.,  0.]), tensor([4.5394e-05, 4.5394e-05, 9.9986e-01, 4.5394e-05]), 0.00013626550207845867)
(tensor([    0.,     0., 10000.,     0.]), tensor([0., 0., 1., 0.]), -0.0)

# Widely spread values: the selected item is far from the highest logit, so the loss is large
print(comp_loss([-3.0, 5.0, 0.0, -5.0], 2))
print(comp_loss([100.0, 50.0, 5.0, -45.0], 2))

(tensor([-3.,  5.,  0., -5.]), tensor([3.3309e-04, 9.9293e-01, 6.6903e-03, 4.5079e-05]), 5.00709342956543)
(tensor([100.,  50.,   5., -45.]), tensor([1.0000e+00, 1.9287e-22, 5.5211e-42, 0.0000e+00]), 94.99999237060547)

print(comp_loss((torch.randn(4) * 10).tolist(), 2))

(tensor([13.4427, -0.7880, -7.3284,  1.1521]), tensor([9.9999e-01, 6.6022e-07, 9.5328e-10, 4.5945e-06]), 20.7711124420166)

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, 1, mini_batch_size)
print(nn.logits[0])

      0/      1: 30.4901
tensor([  4.6166,   7.7147,  15.4573,   1.0207,  -2.7009,  -5.7819,  -5.3783,
         23.6806,  21.1588,  -2.8979,  -4.8657, -11.5645,   8.7837,   4.6332,
        -13.9403, -13.4483,  -3.9130,   1.4592,   8.2284,  10.1325, -16.5248,
         -7.4705, -14.2061,  -4.3193,   0.8513, -11.2382,  30.9323,   1.3783,
         -6.4384,  26.5325, -14.2742,  14.5535,   4.6325,  21.9473,  14.6677,
         18.3815,  -1.1816,   1.2932,   7.4166,  -7.3696, -19.7648],
       grad_fn=<SelectBackward0>)

self.logits = self.h @ self.W2 + self.b2 # output layer

def layers(self):
    """Create the parameters with a scaled-down output layer.

    ``W2`` is multiplied by 0.01 and ``b2`` by 0 so that the initial logits
    stay close to zero; ``C``, ``W1`` and ``b1`` keep a standard normal
    initialisation. All draws use ``self.g``, in the order C, W1, b1, W2, b2.
    """
    def draw(*shape):
        # Every tensor comes from the same generator, in call order.
        return torch.randn(shape, generator=self.g)

    n_inputs = self.context_size * self.e_dims
    self.C = draw(self.nb_chars, self.e_dims)
    self.W1 = draw(n_inputs, self.n_hidden)
    self.b1 = draw(self.n_hidden)
    self.W2 = draw(self.n_hidden, self.nb_chars) * 0.01
    # Still drawn (to keep the generator state in step) but zeroed out.
    self.b2 = draw(self.nb_chars) * 0
BengioFFN.layers = layers

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, 1, mini_batch_size)
print(nn.logits[0])

      0/      1: 3.7308
tensor([ 0.0420,  0.0911,  0.1608, -0.0033, -0.0175, -0.0748, -0.0536,  0.2406,
         0.2165, -0.0141, -0.0452, -0.1238,  0.0900,  0.0641, -0.1163, -0.1273,
        -0.0343, -0.0122,  0.0917,  0.1035, -0.1610, -0.0921, -0.1565, -0.0330,
        -0.0077, -0.0974,  0.3215,  0.0067, -0.0524,  0.2585, -0.1674,  0.1523,
         0.0457,  0.2221,  0.1550,  0.1901, -0.0024,  0.0209,  0.0841, -0.0792,
        -0.1927], grad_fn=<SelectBackward0>)

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, max_steps, mini_batch_size)

      0/ 200000: 3.7308
  10000/ 200000: 1.9096
  20000/ 200000: 1.4936
  30000/ 200000: 1.7350
  40000/ 200000: 1.5614
  50000/ 200000: 2.4063
  60000/ 200000: 1.5985
  70000/ 200000: 1.6582
  80000/ 200000: 2.1206
  90000/ 200000: 1.4665
 100000/ 200000: 1.7602
 110000/ 200000: 1.7375
 120000/ 200000: 1.3745
 130000/ 200000: 1.4203
 140000/ 200000: 1.6508
 150000/ 200000: 1.6507
 160000/ 200000: 1.6627
 170000/ 200000: 1.1234
 180000/ 200000: 1.5974
 190000/ 200000: 1.3893

plt.plot(lossi);

# Loss on the full training split.
train_loss = nn.training_loss(datasets)
# NOTE(review): val_loss is obtained from test_loss(), i.e. measured on the
# test split rather than on the dev/validation split.
val_loss = nn.test_loss(datasets)
print(f"{train_loss=}")
print(f"{val_loss=}")

train_loss=1.4785451889038086
val_loss=1.6728813648223877

self.hpreact = self.embcat @ self.W1 + self.b1 # hidden layer pre-activation
self.h = torch.tanh(self.hpreact) # hidden layer

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, 1, mini_batch_size)
nn.h

      0/      1: 3.7308

tensor([[-1.0000, -0.9998, -0.9985,  ..., -0.9964,  1.0000,  0.9995],
        [-1.0000,  0.9432, -0.9935,  ...,  0.9990,  0.9751, -0.9991],
        [-1.0000, -0.9998, -0.9985,  ..., -0.9964,  1.0000,  0.9995],
        ...,
        [-1.0000, -0.9995,  0.9973,  ..., -0.9959,  0.9997, -0.9979],
        [-1.0000,  1.0000, -0.9998,  ..., -0.3458,  1.0000,  1.0000],
        [ 0.9631, -0.9770,  1.0000,  ..., -0.8631,  0.9987, -0.8641]],
       grad_fn=<TanhBackward0>)

# Histogram of the values of the hidden layer `h` (output of tanh)
plt.hist(nn.h.view(-1).tolist(), 50);

# Histogram of the pre-activation values `hpreact` (input of tanh)
plt.hist(nn.hpreact.view(-1).tolist(), 50);

def tanh(self):
    """Return a new ``Value`` holding tanh of this value, wired for back-propagation.

    The local derivative of tanh is ``1 - t**2``: it goes to zero when the
    output ``t`` approaches -1 or 1.
    """
    import math  # local import: the file-level imports do not include math

    x = self.data
    e2x = math.exp(2*x)
    t = (e2x - 1) / (e2x + 1)
    out = Value(t, children=(self,), op='tanh')

    def _backward():
        # Accumulate instead of assigning so that a value feeding several
        # consumers receives the sum of their contributions.
        # NOTE(review): assumes Value initialises grad to a number (0.0) — confirm.
        self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    return out

plt.figure(figsize=(20,10))
plt.imshow(nn.h.abs() > 0.99, cmap='gray', interpolation='nearest');

def layers(self):
    """Create the parameters with scaled-down hidden and output layers.

    ``W1`` is multiplied by 0.2 and ``b1`` by 0.01 to keep the tanh
    pre-activations small; ``W2`` is multiplied by 0.01 and ``b2`` by 0 so the
    initial logits stay close to zero. All draws use ``self.g``, in the order
    C, W1, b1, W2, b2.
    """
    def draw(*shape):
        # Every tensor comes from the same generator, in call order.
        return torch.randn(shape, generator=self.g)

    n_inputs = self.context_size * self.e_dims
    self.C = draw(self.nb_chars, self.e_dims)
    self.W1 = draw(n_inputs, self.n_hidden) * 0.2
    self.b1 = draw(self.n_hidden) * 0.01  # small non-zero bias
    self.W2 = draw(self.n_hidden, self.nb_chars) * 0.01
    # Still drawn (to keep the generator state in step) but zeroed out.
    self.b2 = draw(self.nb_chars) * 0
BengioFFN.layers = layers

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, 1, mini_batch_size)
print(nn.h)

      0/      1: 3.7371
tensor([[-0.9646, -0.6130, -0.8121,  ..., -0.6439,  0.7729,  0.7591],
        [-0.8748,  0.5015, -0.7546,  ...,  0.5581,  0.2301, -0.5421],
        [-0.9646, -0.6130, -0.8121,  ..., -0.6439,  0.7729,  0.7591],
        ...,
        [-0.7323, -0.5527,  0.2453,  ..., -0.6356,  0.5811, -0.4778],
        [-0.9943,  0.9848, -0.8656,  ..., -0.2016,  0.9336,  0.9865],
        [ 0.5922, -0.2423,  0.9416,  ..., -0.3742,  0.4832, -0.0968]],
       grad_fn=<TanhBackward0>)

plt.hist(nn.h.view(-1).tolist(), 50);

plt.hist(nn.hpreact.view(-1).tolist(), 50);

plt.figure(figsize=(20,10));
plt.imshow(nn.h.abs() > 0.99, cmap='gray', interpolation='nearest');

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, max_steps, mini_batch_size)
train_loss = nn.training_loss(datasets)
val_loss = nn.test_loss(datasets)
print(f"{train_loss=}")
print(f"{val_loss=}")
plt.plot(lossi);

      0/ 200000: 3.7371
  10000/ 200000: 1.7613
  20000/ 200000: 1.4222
  30000/ 200000: 1.8768
  40000/ 200000: 1.4871
  50000/ 200000: 2.2807
  60000/ 200000: 1.7422
  70000/ 200000: 1.5854
  80000/ 200000: 1.9995
  90000/ 200000: 1.4310
 100000/ 200000: 1.7311
 110000/ 200000: 1.4871
 120000/ 200000: 1.3908
 130000/ 200000: 1.3281
 140000/ 200000: 1.5639
 150000/ 200000: 1.7007
 160000/ 200000: 1.6561
 170000/ 200000: 1.1525
 180000/ 200000: 1.7005
 190000/ 200000: 1.4037
train_loss=1.4660104513168335
val_loss=1.657399296760559

# Discussion example: how a matrix product changes the spread of the values
x = torch.randn(1000, 10)  # 1000 examples of dimension 10
h = torch.randn(10, 200)  # hidden layer of 200 neurons with 10 inputs each
y = x @ h
print(x.mean(), x.std())
print(y.mean(), y.std())
# Left: distribution of the inputs; right: distribution of the outputs.
plt.figure(figsize=(20,5))
plt.subplot(121)
plt.hist(x.view(-1).tolist(), 50, density=True);
plt.subplot(122)
plt.hist(y.view(-1).tolist(), 50, density=True);

tensor(-0.0177) tensor(1.0071)
tensor(0.0058) tensor(3.2131)

# Discussion example: same experiment with the weights multiplied by 5
x = torch.randn(1000, 10)
h = torch.randn(10, 200) * 5
# Alternative scale to try: h = torch.randn(10, 200) * 0.2
y = x @ h
print(x.mean(), x.std())
print(y.mean(), y.std())
# Left: distribution of the inputs; right: distribution of the outputs.
plt.figure(figsize=(20,5))
plt.subplot(121)
plt.hist(x.view(-1).tolist(), 50, density=True);
plt.subplot(122)
plt.hist(y.view(-1).tolist(), 50, density=True);

tensor(-0.0151) tensor(1.0040)
tensor(-0.0461) tensor(15.6056)

# Discussion example: weights divided by sqrt(fan_in)
fanin = 10
nb_exemples = 1000
# NOTE: rebinds the notebook-level n_hidden (to the same value, 200).
n_hidden = 200
x = torch.randn(nb_exemples, fanin)
h = torch.randn(fanin, n_hidden) / fanin**0.5 
y = x @ h
print(x.mean(), x.std())
print(y.mean(), y.std())
# Left: distribution of the inputs; right: distribution of the outputs.
plt.figure(figsize=(20,5))
plt.subplot(121)
plt.hist(x.view(-1).tolist(), 50, density=True);
plt.subplot(122)
plt.hist(y.view(-1).tolist(), 50, density=True);

tensor(-0.0049) tensor(0.9964)
tensor(-0.0029) tensor(1.0097)

(5/3) / (30 ** 0.5)

0.3042903097250923

def layers(self):
    """Create the parameters, scaling ``W1`` by ``gain / sqrt(fan_in)``.

    The gain is 5/3 and ``fan_in`` is the number of inputs of the hidden
    layer (``context_size * e_dims``). ``b1`` is multiplied by 0.01, ``W2`` by
    0.01 and ``b2`` by 0. All draws use ``self.g``, in the order
    C, W1, b1, W2, b2.
    """
    def draw(*shape):
        # Every tensor comes from the same generator, in call order.
        return torch.randn(shape, generator=self.g)

    fan_in = self.context_size * self.e_dims
    tanh_gain = 5/3
    w1_scale = tanh_gain / (fan_in ** 0.5)

    self.C = draw(self.nb_chars, self.e_dims)
    self.W1 = draw(fan_in, self.n_hidden) * w1_scale
    self.b1 = draw(self.n_hidden) * 0.01  # small non-zero bias
    self.W2 = draw(self.n_hidden, self.nb_chars) * 0.01
    # Still drawn (to keep the generator state in step) but zeroed out.
    self.b2 = draw(self.nb_chars) * 0
BengioFFN.layers = layers

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, max_steps, mini_batch_size)
train_loss = nn.training_loss(datasets)
val_loss = nn.test_loss(datasets)
print(f"{train_loss=}")
print(f"{val_loss=}")
plt.plot(lossi);

      0/ 200000: 3.7392
  10000/ 200000: 1.7750
  20000/ 200000: 1.3628
  30000/ 200000: 1.8122
  40000/ 200000: 1.5174
  50000/ 200000: 2.2347
  60000/ 200000: 1.7743
  70000/ 200000: 1.5951
  80000/ 200000: 2.1101
  90000/ 200000: 1.4280
 100000/ 200000: 1.7477
 110000/ 200000: 1.6083
 120000/ 200000: 1.3621
 130000/ 200000: 1.3791
 140000/ 200000: 1.5519
 150000/ 200000: 1.6893
 160000/ 200000: 1.5604
 170000/ 200000: 1.1256
 180000/ 200000: 1.6355
 190000/ 200000: 1.4911
train_loss=1.4627671241760254
val_loss=1.657903790473938

self.hpreact = self.embcat @ self.W1 + self.b1 # hidden layer pre-activation

def layers(self):
    """Create the parameters of the BatchNorm variant of the network.

    There is no hidden bias ``b1`` here: the layer creates a batch-norm gain
    (``bngain``, ones) and bias (``bnbias``, zeros) instead. ``W1`` is scaled
    by ``gain / sqrt(fan_in)`` with a gain of 5/3, ``W2`` by 0.01 and ``b2``
    by 0. Random draws use ``self.g``, in the order C, W1, W2, b2.
    """
    self.C = torch.randn((self.nb_chars, self.e_dims), generator=self.g)
    fan_in = self.context_size * self.e_dims
    tanh_gain = 5/3
    self.W1 = torch.randn((fan_in, self.n_hidden), generator=self.g) * (tanh_gain / (fan_in ** 0.5))
    self.W2 = torch.randn((self.n_hidden, self.nb_chars), generator=self.g) * 0.01  # keeps the initial logits near zero
    self.b2 = torch.randn(self.nb_chars, generator=self.g) * 0
    # Sized from the instance, not from a module-level n_hidden, so the
    # shapes always match W1 whatever hidden size the network was built with.
    self.bngain = torch.ones((1, self.n_hidden))
    self.bnbias = torch.zeros((1, self.n_hidden))
BengioFFN.layers = layers

def create_network(self):
    """Create the layers, reset the training state and the BatchNorm running statistics.

    The trainable parameters are C, W1, W2, b2, bngain and bnbias. The running
    mean and standard deviation are not trainable; they are buffers used in
    place of the batch statistics outside of training.
    """
    self.layers()
    self.loss = None
    self.steps = 0
    self.parameters = [self.C, self.W1, self.W2, self.b2, self.bngain, self.bnbias]
    self.nb_parameters = sum(p.nelement() for p in self.parameters) # number of parameters in total
    for p in self.parameters:
        p.requires_grad = True
    # Sized from the instance rather than from a module-level n_hidden.
    self.bnmean_running = torch.zeros((1, self.n_hidden))
    # Start at one (identity scaling): a zero running std would mean dividing
    # by zero when the network is evaluated before, or shortly after, training starts.
    self.bnstd_running = torch.ones((1, self.n_hidden))
BengioFFN.create_network = create_network

def forward(self, X, Y):
    """Forward pass with batch normalisation of the hidden pre-activations.

    Stores ``emb``, ``embcat``, ``bnmeani``, ``bnstdi``, ``hpreact`` (the
    normalised pre-activation), ``h``, ``logits`` and ``loss`` on the
    instance, then folds the batch statistics into the running mean/std with
    a 0.999 / 0.001 exponential moving average.
    """
    # Embedding lookup, then one flat row per example
    embedded = self.C[X]
    self.emb = embedded
    self.embcat = embedded.view(embedded.shape[0], -1)

    # Linear layer (no bias: the batch-norm bias plays that role)
    linear_out = self.embcat @ self.W1

    # BatchNorm layer: statistics are taken over the batch dimension
    batch_mean = linear_out.mean(0, keepdim=True)
    batch_std = linear_out.std(0, keepdim=True)
    self.bnmeani = batch_mean
    self.bnstdi = batch_std
    self.hpreact = self.bngain * (linear_out - batch_mean) / batch_std + self.bnbias

    # Non-linearity, output layer and loss
    self.h = torch.tanh(self.hpreact)
    self.logits = self.h @ self.W2 + self.b2
    self.loss = F.cross_entropy(self.logits, Y)

    # Running statistics are bookkeeping only: keep them out of the graph
    with torch.no_grad():
        self.bnmean_running = 0.999 * self.bnmean_running + 0.001 * batch_mean
        self.bnstd_running = 0.999 * self.bnstd_running + 0.001 * batch_std
BengioFFN.forward = forward

@torch.no_grad()  # no gradient tracking while evaluating
def compute_loss(self, X, Y):
    """Return the cross-entropy loss on ``(X, Y)`` using the running BatchNorm statistics.

    Unlike ``forward``, the pre-activations are normalised with
    ``bnmean_running`` / ``bnstd_running`` rather than with the statistics of
    the batch itself, and nothing is stored on the instance.
    """
    embedded = self.C[X]
    flat = embedded.view(embedded.shape[0], -1)
    linear_out = flat @ self.W1
    normalised = self.bngain * (linear_out - self.bnmean_running) / self.bnstd_running + self.bnbias
    logits = torch.tanh(normalised) @ self.W2 + self.b2
    return F.cross_entropy(logits, Y)
BengioFFN.compute_loss = compute_loss

@torch.no_grad()
def generate_word(self, itoc, g):
    """Sample one word from the network, using the running BatchNorm statistics.

    Characters are drawn one at a time from the softmax of the logits, with
    generator ``g``, starting from a context filled with index 0 and stopping
    as soon as index 0 is drawn. ``itoc`` maps indices back to characters.
    """
    sampled = []
    window = [0] * self.context_size
    while True:
        embedded = self.C[torch.tensor([window])]
        linear_out = embedded.view(1, -1) @ self.W1
        normalised = self.bngain * (linear_out - self.bnmean_running) / self.bnstd_running + self.bnbias
        logits = torch.tanh(normalised) @ self.W2 + self.b2
        probs = F.softmax(logits, dim=1)
        # Draw the next character index from the predicted distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        if ix == 0:
            # Index 0 is the end marker: the word is complete
            break
        sampled.append(ix)
        # Slide the context window by one position
        window = window[1:] + [ix]
    return ''.join([itoc[i] for i in sampled])
BengioFFN.generate_word = generate_word

g = torch.Generator().manual_seed(seed)
nn = BengioFFN(e_dims, n_hidden, context_size, words.nb_chars, g)
lossi = nn.train(datasets, max_steps, mini_batch_size)
train_loss = nn.training_loss(datasets)
val_loss = nn.test_loss(datasets)
print(f"{train_loss=}")
print(f"{val_loss=}")
plt.plot(lossi);

      0/ 200000: 3.7070
  10000/ 200000: 2.1842
  20000/ 200000: 2.0719
  30000/ 200000: 2.0493
  40000/ 200000: 2.0728
  50000/ 200000: 1.8556
  60000/ 200000: 1.9816
  70000/ 200000: 1.6609
  80000/ 200000: 1.5744
  90000/ 200000: 2.1140
 100000/ 200000: 1.8943
 110000/ 200000: 1.5171
 120000/ 200000: 1.8516
 130000/ 200000: 1.5455
 140000/ 200000: 1.3465
 150000/ 200000: 1.3427
 160000/ 200000: 1.6364
 170000/ 200000: 1.9168
 180000/ 200000: 1.5088
 190000/ 200000: 1.6241
train_loss=1.5353056192398071
val_loss=1.6674079895019531

g = torch.Generator().manual_seed(2147483647 + 10)
for _ in range(20):
    word = nn.generate_word(words.itoc, g)
    print(word)

aves
des
saien
impôt
récialir
empêchercise
ubularieu
artenue
soire
assume
unitorigrogré
momentes
judividarf
soument
lohes
facirié
alte
jours
océ
déritestiture

7. Initialisations et normalisations¶

Restructuration du code précédent¶

Words¶

Datasets¶

Réseau de neurones à propagation avant (Feed-Forward Network)¶

Apprentissage¶

Génération de mots¶

Discussion sur l'initialisation et la normalisation des couches¶

Loss initial¶

Squashing via tanh¶

Initialisation "Kaiming"¶

Méthode BatchNorm¶

Lien avec Pytorch¶

Code final¶

Entraînement¶

Génération¶

Synthèse des scores de loss¶