diff --git a/.vscode/launch.json b/.vscode/launch.json index 4107551..8670901 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,15 +1,6 @@ { - // Usare IntelliSense per informazioni sui possibili attributi. - // Al passaggio del mouse vengono visualizzate le descrizioni degli attributi esistenti. - // Per altre informazioni, visitare: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", "configurations": [ - { - "name": "Python: File corrente", - "type": "python", - "request": "launch", - "program": "/home/christian/Documenti/GitHub/Image-Captioning/v1/NeuralNet.py", - "console": "integratedTerminal" - } + {"name":"Main","type":"python","request":"launch","program":"${workspaceFolder}/main.py","console":"integratedTerminal", + "args": ["RNetvI", "train", "1024", "1024", "--attention", "True", "--attention_dim", "1024", "--dataset_folder", "./dataset/flickr30k_images", "--device", "cuda:0", "--splits", "1", "1", "1", "--epochs", "2"]} ] } \ No newline at end of file diff --git a/CaRNetvHC.py b/CaRNetvHC.py deleted file mode 100644 index 3221730..0000000 --- a/CaRNetvHC.py +++ /dev/null @@ -1,347 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -device="cpu" -class EncoderCNN(nn.Module): - def __init__(self, projection_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, projection_size) - - def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.) 
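        # Shape walk-through at this point (illustrative note, assuming the default
        # torchvision ResNet-50; not part of the original file): self.resnet(images)
        # ends with the average-pool layer and returns (Batch Size, 2048, 1, 1);
        # the reshape above flattens it to (Batch Size, 2048), and the self.linear
        # call on the next line maps resnet.fc.in_features = 2048 down to
        # (Batch Size, projection_size).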
- features = self.linear(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(DecoderRNN, self).__init__() - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - self.hidden_size = hidden_size - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features, captions): - """[summary] - - Args: - features (torch.tensor(batch_size, hidden_size)): [description] - captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] - - Returns: - [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] - """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Feed LSTMCell with image features and retrieve the state - - _h, _c = tuple( features, features) # _h : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption :) - start = torch.zeros(self.word_embeddings.num_embeddings) - start[1] = 1 - outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch - - - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) - - return outputs # (Batch Size, N, |Vocabulary|) - - def generate_caption(self, features, max_caption_length): - """Generate captions for given image features using greedy search.""" - - sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) - with torch.no_grad(): - _h, _c = tuple( features.unsqueeze(0), features.unsqueeze(0)) - for _ in range(max_caption_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - - -class CaRNet1(nn.Module): - - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(CaRNet1, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - self.C = EncoderCNN(embedding_size) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_name): - """Save the classifier.""" - torch.save(self.C.state_dict(), f".saved/vHC/{file_name}_C.pth") - torch.save(self.R.state_dict(), f".saved/vHC/{file_name}_R.pth") - - def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f".saved/vHC/{file_name}_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f".saved/vHC/{file_name}_R.pth", map_location=self.device)) - - def forward(self,images,captions): - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs, labels): - """[summary] - - Args: - outputs ([type]): [description] - labels ([type]): [description] - - Returns: - [type]: [description] - """ - # Assume outputs and labels have same shape and already padded - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - # With this technique we evaluate all major case: - # 1) Output caption is longer than expected : Output.ID - .ID != 0 - # 2) Output is less longer than expect : .ID - Target.ID != 0 - # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, - # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token - # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) - # computing the accuracy - - right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) - acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set, validation_set, lr, epochs, vocabulary): - - criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. - epoch_num_train_examples = 0 - - for images,captions_training_ids,captions_target_ids in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) - captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token - - # computing the network output on the current mini-batch - features = self.C(images) - outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); - - # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) - loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - # with torch.no_grad(): - # self.C.eval() - # self.R.eval() - # features = self.C(images) - # import random - # numb = random.randint(0,2) - # caption = self.R.generate_caption(features[numb],30) - # print(vocabulary.rev_translate(captions_target_ids[numb])) - # print(vocabulary.rev_translate(caption[0])) - # self.C.train() - # self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
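        # Worked example of the comparison used here (illustrative ids, not taken
        # from the dataset, assuming <start>=1, <end>=2, <pad>=0 as used elsewhere
        # in this file): target row [1, 7, 9, 2, 0, 0] vs. a shorter generated row
        # written into the zero tensor as [1, 7, 4, 2, 0, 0]; __accuracy drops
        # position 0 (the deterministic <start>) and torch.eq on the remaining five
        # ids matches 7, 2, 0, 0, giving 4/5 = 0.8 for this row.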
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - self.save("CaRNetvHC") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,_,captions_validation_target_ids in data_set: - images = images.to(self.device) - - captions_validation_target_ids = captions_validation_target_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/flickr30k_images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70) - df = ds.get_fraction_of_dataset(percentage=30) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=100, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=50, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") - net.load("CaRNetvHC") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/Images Documentation/ResNet-50.png b/Images Documentation/ResNet-50.png new file mode 100644 index 0000000..a7ba321 Binary files /dev/null and b/Images Documentation/ResNet-50.png differ diff --git a/LICENSE b/LICENSE index 3ad989d..6fe794f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 christianSistemiPos +Copyright (c) 2022 christiandimaio Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NeuralModel/CaRNetvH.py b/NeuralModel/CaRNetvH.py deleted file mode 100644 index 906ba13..0000000 --- a/NeuralModel/CaRNetvH.py +++ /dev/null @@ -1,347 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -device="cpu" -class EncoderCNN(nn.Module): - def __init__(self, projection_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, projection_size) - - def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.) 
- features = self.linear(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(DecoderRNN, self).__init__() - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - self.hidden_size = hidden_size - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features, captions): - """[summary] - - Args: - features (torch.tensor(batch_size, hidden_size)): [description] - captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] - - Returns: - [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] - """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Feed LSTMCell with image features and retrieve the state - - _h, _c = tuple( features, torch.zeros((captions.shape[0],self.hidden_size))) # _h : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption :) - start = torch.zeros(self.word_embeddings.num_embeddings) - start[1] = 1 - outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch - - - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) - - return outputs # (Batch Size, N, |Vocabulary|) - - def generate_caption(self, features, max_caption_length): - """Generate captions for given image features using greedy search.""" - - sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) - with torch.no_grad(): - _h, _c = tuple( features.unsqueeze(0), torch.zeros((1,self.hidden_size))) - for _ in range(max_caption_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - - -class CaRNet1(nn.Module): - - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(CaRNet1, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - self.C = EncoderCNN(embedding_size) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_name): - """Save the classifier.""" - torch.save(self.C.state_dict(), f".saved/vH/{file_name}_C.pth") - torch.save(self.R.state_dict(), f".saved/vH/{file_name}_R.pth") - - def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f".saved/vH/{file_name}_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f".saved/vH/{file_name}_R.pth", map_location=self.device)) - - def forward(self,images,captions): - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs, labels): - """[summary] - - Args: - outputs ([type]): [description] - labels ([type]): [description] - - Returns: - [type]: [description] - """ - # Assume outputs and labels have same shape and already padded - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - # With this technique we evaluate all major case: - # 1) Output caption is longer than expected : Output.ID - .ID != 0 - # 2) Output is less longer than expect : .ID - Target.ID != 0 - # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, - # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token - # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) - # computing the accuracy - - right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) - acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set, validation_set, lr, epochs, vocabulary): - - criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. - epoch_num_train_examples = 0 - - for images,captions_training_ids,captions_target_ids in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) - captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token - - # computing the network output on the current mini-batch - features = self.C(images) - outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); - - # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) - loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - # with torch.no_grad(): - # self.C.eval() - # self.R.eval() - # features = self.C(images) - # import random - # numb = random.randint(0,2) - # caption = self.R.generate_caption(features[numb],30) - # print(vocabulary.rev_translate(captions_target_ids[numb])) - # print(vocabulary.rev_translate(caption[0])) - # self.C.train() - # self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - self.save("CaRNetvH") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,_,captions_validation_target_ids in data_set: - images = images.to(self.device) - - captions_validation_target_ids = captions_validation_target_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/flickr30k_images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70) - df = ds.get_fraction_of_dataset(percentage=30) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=100, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=50, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") - net.load("CaRNetvH") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/CaRNetvHC.py b/NeuralModel/CaRNetvHC.py deleted file mode 100644 index 2cf2488..0000000 --- a/NeuralModel/CaRNetvHC.py +++ /dev/null @@ -1,444 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F -from typing import Tuple,List -from Dataset import MyDataset -from Vocabulary import Vocabulary - -class EncoderCNN(nn.Module): - def __init__(self, projection_size: int, device: str = "cpu"): - """Constructor of the Encoder NN - - Args: - projection_size (int): The dimension of projection into the space of RNN (Could be the input or the hidden state). - - device (str, optional): The device on which the operations will be performed. Default "cpu". - """ - super(EncoderCNN, self).__init__() - - self.device = torch.device(device) - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): # Freezing weights - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - - self.linear = nn.Linear(resnet.fc.in_features, projection_size) # define a last layer - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """Forward operation of the nn - - Args: - images (torch.tensor): The tensor of the image in the form (Batch Size, Channels, Width, Height) - - Returns: - [torch.tensor]: Features Projection in the form (Batch Size, Projection Dim.) 
- """ - # To Do Add dimensionality - features = self.resnet(images) - - features = features.reshape(features.size(0), -1).to(self.device) - features = self.linear(features) - - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Define the constructor for the RNN Net - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the operations will be performed. Default "cpu" - """ - super(DecoderRNN, self).__init__() - - self.device = torch.device(device) - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features: torch.tensor, captions: torch.tensor, captions_length: List[int]) -> Tuple[torch.tensor, List[int]]: - """Compute the forward operation of the RNN. - input of the LSTM cell for each time step: - t_{-1}: feature vector - t_0: Deterministict - . - . - . - t_{N-1}: The embedding vector associated to the S_{N-1} id. - - Args: - features (torch.tensor): The features associated to each element of the batch. (batch_size, embed_size) - - captions (torch.tensor): The caption associated to each element of the batch. (batch_size, max_captions_length, word_embedding) - REMARK Each caption is in the full form: + .... + - - caption_length ([int]): The length of each caption in the batch. - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - - (list(int)): The length of each decoded caption. - REMARK The is provided as input at t_0. - REMARK The token will be removed from the input of the LSTM. - """ - - # Retrieve batch size - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Initialize the hidden state and the cell state at time t_{-1} - _h, _c = (features, features) # _h : (Batch size, Hidden size), _c : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption t_{0} - start = self.word_embeddings(torch.LongTensor([1]).to(self.device)) # Get the embeddings of the token - - # Bulk insert of embeddings to all the elements of the batch - outputs = start.repeat(batch_size,1,1).to(self.device) - - # Feed LSTMCell with image features and retrieve the state - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the decode captions will have a length - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch - - return outputs, list(map(lambda length: length-1, captions_length)) - - def generate_caption(self, feature: torch.tensor, captions_length: int) -> torch.tensor: - """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) - - Args: - feature (torch.tensor): The features vector (1, embedding_size) - captions_length (int): The length of the caption - - Returns: - torch.tensor: The caption associated to the image given. - It includes at t_0 by default. - """ - - sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) - with torch.no_grad(): - _h ,_c = (feature.unsqueeze(0),feature.unsqueeze(0)) - for _ in range(captions_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(self.device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) - return sampled_ids - - -class CaRNetvHC(nn.Module): - - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Create the CaRNet - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the net does the computation. Defaults to "cpu". - """ - - super(CaRNetvHC, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - - # Define Encoder and Decoder - self.C = EncoderCNN(hidden_size, device) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size, device) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_path: str) -> bool: - """Save the net in non-volatile memory - - Args: - file_name (str): Relative path to save the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net saved correctly. False otherwise. - """ - try: - torch.save(self.C.state_dict(), f"{file_path}/CaRNetvHC_C.pth") - torch.save(self.R.state_dict(), f"{file_path}/CaRNetvHC_R.pth") - except Exception as ex: - print(ex) - return False - return True - - def load(self, file_path: str) -> bool: - """Load the net from non-volatile memory into RAM - - Args: - file_name (str): Relative path of the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net loaded correctly. False otherwise. 
- """ - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f"{file_path}/CaRNetvHC_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f"{file_path}/CaRNetvHC_R.pth", map_location=self.device)) - - def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: - """Provide images to the net for retrieve captions - - Args: - images (torch.tensor): The images of the batch. (Batch Size, Channels, Width, Height) - captions (torch.tensor): (Batch Size, Max_Captions_Length). - ASSUMPION: The captions are padded with Token - - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - """ - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: - """Evaluate the accuracy of the Net. - Assumption: outputs and labels have same shape and already padded. - - Args: - outputs (torch.tensor): [description] - labels (torch.tensor): [description] - captions_length (list): [description] - - Returns: - float: The accuracy of the Net - """ - - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - - # computing the accuracy - - # To Do add dimensionality - outputs = torch.nn.utils.rnn.pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True).to(self.device) - labels = torch.nn.utils.rnn.pack_padded_sequence(labels, captions_length.cpu(), batch_first=True).to(self.device) - right_predictions = outputs.data - labels.data == 0 - - acc = right_predictions.to(torch.float32).sum(axis=0) / right_predictions.shape[0] - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): - """[summary] - - Args: - train_set (MyDataset): [description] - validation_set (MyDataset): [description] - lr (float): [description] - epochs (int): [description] - vocabulary (Vocabulary): [description] - """ - - # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss - # Q. Why ignore_index is setted to instead of ? - # A. In the training, both output of the CaRNet and Target label start as padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. - # And since token is hardcoded as output at t_0 we could avoid the computation of loss on it, since will be 0 fover. - - criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images,captions_ids,captions_length in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_ids = captions_ids.to(self.device) # captions > (B, L) - captions_length = captions_length.to(self.device) - - # computing the network output on the current mini-batch - features = self.C(images) - outputs, outputs_length = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); - - outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) - - targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) - - - loss = criterion(outputs.data, targets.data) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - with torch.no_grad(): - self.C.eval() - self.R.eval() - features = self.C(images) - import random - numb = random.randint(0,2) - caption = self.R.generate_caption(features[numb],30) - print(vocabulary.rev_translate(captions_ids[numb])) - print(vocabulary.rev_translate(caption[0])) - self.C.train() - self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - - self.save("/content/drive/MyDrive/Progetti/Neural Networks/.saved") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" 
if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,captions_ids,captions_length in data_set: - images = images.to(self.device) - - captions_ids = captions_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc - -# Example of usage -if __name__ == "__main__": - from torch.utils.data import DataLoader - ds = MyDataset("./dataset", percentage=1) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) - df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=32, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=32, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNetvHC(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") - #net.load("CaRNetvI") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/CaRNetvI.py b/NeuralModel/CaRNetvI.py deleted file mode 100644 index 32e35ac..0000000 --- a/NeuralModel/CaRNetvI.py +++ /dev/null @@ -1,449 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! 
-# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F -from typing import Tuple,List -from Dataset import MyDataset -from Vocabulary import Vocabulary -class EncoderCNN(nn.Module): - def __init__(self, projection_size: int, device: str = "cpu"): - """Constructor of the Encoder NN - - Args: - projection_size (int): The dimension of projection into the space of RNN (Could be the input or the hidden state). - - device (str, optional): The device on which the operations will be performed. Default "cpu". - """ - super(EncoderCNN, self).__init__() - - self.device = torch.device(device) - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): # Freezing weights - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - - self.linear = nn.Linear(resnet.fc.in_features, projection_size) # define a last layer - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """Forward operation of the nn - - Args: - images (torch.tensor): The tensor of the image in the form (Batch Size, Channels, Width, Height) - - Returns: - [torch.tensor]: Features Projection in the form (Batch Size, Projection Dim.) - """ - # To Do Add dimensionality - features = self.resnet(images) - - features = features.reshape(features.size(0), -1).to(self.device) - features = self.linear(features) - - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Define the constructor for the RNN Net - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the operations will be performed. Default "cpu" - """ - super(DecoderRNN, self).__init__() - - self.device = torch.device(device) - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features: torch.tensor, captions: torch.tensor, captions_length: list[int]) -> Tuple[torch.tensor, list[int]]: - """Compute the forward operation of the RNN. - input of the LSTM cell for each time step: - t_{-1}: feature vector - t_0: Deterministict - . - . - . - t_{N-1}: The embedding vector associated to the S_{N-1} id. - t_{N}: - . - . - . - - - Args: - features (torch.tensor): The features associated to each element of the batch. (batch_size, embed_size) - - captions (torch.tensor): The caption associated to each element of the batch. 
(batch_size, max_captions_length, word_embedding) - REMARK Each caption is in the full form: + .... + - - caption_length ([int]): The length of each caption in the batch. - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_{MaxN}. (batch_size, max_captions_length, vocab_size) - - (list(int)): The length of each decoded caption. - REMARK The is provided as input at t_0. - REMARK The token will be removed from the input of the LSTM. - """ - - # Retrieve batch size - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Initialize the hidden state and the cell state at time t_{-1} - _h, _c = self.lstm_unit(features) # _h : (Batch size, Hidden size), _c : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption t_{0} - start = self.word_embeddings(torch.LongTensor([1]).to(self.device)) # Get the embeddings of the token - - # Bulk insert of embeddings to all the elements of the batch - outputs = start.repeat(batch_size,1,1).to(self.device) - - # Feed LSTMCell with image features and retrieve the state - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? - # The assumption is that the decode captions will have a length - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch - - return outputs, list(map(lambda length: length-1, captions_length)) - - def generate_caption(self, feature: torch.tensor, captions_length: int) -> torch.tensor: - """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) - - Args: - feature (torch.tensor): The features vector (1, embedding_size) - captions_length (int): The length of the caption - - Returns: - torch.tensor: The caption associated to the image given. - It includes at t_0 by default. 
- """ - - sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) - with torch.no_grad(): - _h ,_c = self.lstm_unit(feature.unsqueeze(0)) - for _ in range(captions_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(self.device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) - return sampled_ids - - -class CaRNetvI(nn.Module): - - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Create the CaRNet - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the net does the computation. Defaults to "cpu". - """ - - super(CaRNetvI, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - - # Define Encoder and Decoder - self.C = EncoderCNN(embedding_size, device) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size, device) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_path: str) -> bool: - """Save the net in non-volatile memory - - Args: - file_name (str): Relative path to save the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net saved correctly. False otherwise. - """ - try: - torch.save(self.C.state_dict(), f"{file_path}/CaRNetvI_C.pth") - torch.save(self.R.state_dict(), f"{file_path}/CaRNetvI_C_R.pth") - except Exception as ex: - print(ex) - return False - return True - - def load(self, file_path: str) -> bool: - """Load the net from non-volatile memory into RAM - - Args: - file_name (str): Relative path of the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net loaded correctly. False otherwise. - """ - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f"{file_path}/CaRNetvI_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f"{file_path}/CaRNetvI_C_R.pth", map_location=self.device)) - - def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: - """Provide images to the net for retrieve captions - - Args: - images (torch.tensor): The images of the batch. (Batch Size, Channels, Width, Height) - captions (torch.tensor): (Batch Size, Max_Captions_Length). - ASSUMPION: The captions are padded with Token - - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - """ - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: - """Evaluate the accuracy of the Net. 
- Assumption: outputs and labels have same shape and already padded. - - Args: - outputs (torch.tensor): [description] - labels (torch.tensor): [description] - captions_length (list): [description] - - Returns: - float: The accuracy of the Net - """ - - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - - # computing the accuracy - - # To Do add dimensionality - outputs = torch.nn.utils.rnn.pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True).to(self.device) - labels = torch.nn.utils.rnn.pack_padded_sequence(labels, captions_length.cpu(), batch_first=True).to(self.device) - right_predictions = outputs.data - labels.data == 0 - - acc = right_predictions.to(torch.float32).sum(axis=0) / right_predictions.shape[0] - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): - """[summary] - - Args: - train_set (MyDataset): [description] - validation_set (MyDataset): [description] - lr (float): [description] - epochs (int): [description] - vocabulary (Vocabulary): [description] - """ - - # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss - # Q. Why ignore_index is setted to instead of ? - # A. In the training, both output of the CaRNet and Target label start as padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. - # And since token is hardcoded as output at t_0 we could avoid the computation of loss on it, since will be 0 fover. - - criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images, captions_ids, captions_length in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_ids = captions_ids.to(self.device) # captions > (B, L) - captions_length = captions_length.to(self.device) - - # computing the network output on the current mini-batch - features = self.C(images) - outputs, outputs_length = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); - - outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) - - targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) - - - loss = criterion(outputs.data, targets.data) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - with torch.no_grad(): - self.C.eval() - self.R.eval() - features = self.C(images) - import random - numb = random.randint(0,2) - caption = self.R.generate_caption(features[numb],30) - print(vocabulary.rev_translate(captions_ids[numb])) - print(vocabulary.rev_translate(caption[0])) - self.C.train() - self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - # if val_acc > best_val_acc: - # best_val_acc = val_acc - # best_epoch = e + 1 - # self.save("CaRNetvI") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" 
if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images, captions_ids, captions_length in data_set: - images = images.to(self.device) - - captions_ids = captions_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc - -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) - df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=5, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=5, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNetvI(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") - #net.load("CaRNetvI") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/Dataset.py b/NeuralModel/Dataset.py deleted file mode 100644 index 3ef9a28..0000000 --- a/NeuralModel/Dataset.py +++ /dev/null @@ -1,128 +0,0 @@ -import os -import pandas as pd -import torch -import numpy as np -from enum import Enum -from torch.utils.data import Dataset, DataLoader -import torch.nn as nn -from PIL import Image -import re -from torchvision import transforms - - -# ENV: -# MAX_CAPTION_LENGTH -MAX_CAPTION_LENGTH = 15 - -class MyDataset(Dataset): - - image_trasformation_parameter = { - "crop":{ - "size": 224 - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - - def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): 
- """Create a new dataset from source files - - Args: - directory_of_data (str): [description] - """ - if already_computed_dataframe is not None: - self.directory_of_data = directory_of_data - self._dataset = already_computed_dataframe - return - - if not os.path.exists(directory_of_data): - raise ValueError(f"{directory_of_data} not Exist!") - if not os.path.isdir(directory_of_data): - raise ValueError(f"{directory_of_data} is not a directory!") - - _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] - _temp_dataset["comment"] = _temp_dataset["comment"].apply( lambda comment: re.findall("[\\w]+|\.|\,",str(comment).lower())) - _temp_dataset = _temp_dataset[ _temp_dataset["comment"].map(len) <= MAX_CAPTION_LENGTH] - self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) - self.directory_of_data = directory_of_data - - def get_fraction_of_dataset(self, percentage: int, delete_transfered_from_source: bool = False): - _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) - _temp_df_copy = _temp_df_moved.copy() - - if delete_transfered_from_source: - self._dataset = self._dataset.drop(_temp_df_copy.index) - return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) - - def get_all_distinct_words_in_dataset(self): - words = [] - for idx,row in self._dataset.iterrows(): - for word in row["comment"]: - if word not in words: - words.append(word) - return words - - def __len__(self): - return self._dataset.shape[0] - - def __getitem__(self, idx): - - image, caption = Image.open(f"{self.directory_of_data}/images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ - self._dataset.iloc[idx]["comment"] - - return image, caption - - def pack_minibatch_training(self, data, vocabulary): - - # Sort a data list by caption length (descending order). - data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - operations = transforms.Compose([ - transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"],MyDataset.image_trasformation_parameter["crop"]["size"])), # Crop a random portion of image and resize it to a given size. - transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. - transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]), - ]) - images = list(map(lambda image: operations(image),list(images))) - - # Merge images (from tuple of 3D tensor to 4D tensor). - images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) - - captions_length = torch.tensor([len(caption)+2 for caption in captions]) - - captions = [vocabulary.translate(caption,"complete") for caption in captions] - - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - - - return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) - - def pack_minibatch_evaluation(self, data, vocabulary): - - # Sort a data list by caption length (descending order). 
- data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - operations = transforms.Compose([ - transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Crops the given image at the center. - transforms.ToTensor(), - transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]) - ]) - - images = list(map(lambda image: operations(image),list(images))) - - # Merge images (from tuple of 3D tensor to 4D tensor). - images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) - - captions_length = torch.tensor([len(caption)+2 for caption in captions]) - - captions = [vocabulary.translate(caption,"complete") for caption in captions] - - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - - - return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) - \ No newline at end of file diff --git a/NeuralModel/Vocabulary.py b/NeuralModel/Vocabulary.py deleted file mode 100644 index 725945a..0000000 --- a/NeuralModel/Vocabulary.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import torch -import warnings -from Dataset import MyDataset -from typing import List - -class Vocabulary(): - # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d - # each word is represented by a record in a dataframe with this structure - - - def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): - - self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True - self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True - # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists - if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: - self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") - self.word2id = torch.load(".saved/rich_word2id_v1.pt") - self.enriched = True - return - - # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset - dataset_words = source_dataset.get_all_distinct_words_in_dataset() - - # Dictionary length - self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) - - self.word2id = {} - self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot - - # Initialize the token: - # , , , - self.word2id[""] = 0 - self.word2id[""] = 1 - self.word2id[""] = 2 - self.word2id[""] = 3 - - counter = 4 - for word in dataset_words: - self.word2id[word] = counter - counter += 1 - - self.embeddings = torch.eye(self.dictionary_length) - - def predefined_token_idx(self) -> dict: - return { - "":0, - "":1, - "":2, - "":3 - } - - def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - - # Initialize the translator - - if type == "uncomplete": - _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... 
- - if type == "complete": - _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + - _sequence[-1] = self.word2id[""] - - _sequence[0] = self.word2id[""] - - counter = 1 # Always skip - - # Evaluate all the word into the caption and translate it to an embeddings - for word in word_sequence: - if word.lower() in self.word2id.keys(): - _sequence[counter] = self.word2id[word.lower()] - else: - _sequence[counter] = self.word2id[""] - counter += 1 - - return _sequence - - def rev_translate(self, words_id : torch.tensor) -> List[str]: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. - return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) - - - def __len__(self): - """The total number of words in this Vocabulary.""" - - return len(self.word2id.keys()) - - -# ---------------------------------------------------------------- -# Usage example - -if __name__ == '__main__': - #Load the vocabulary - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - # Enrich the vocabulary - v.make_enrich = True - dataset = ["I","Like","PLay","PIPPOplutopaperino"] - v.enrich(dataset) - v.make_enrich = False - # Enrich the vocabulary with a bulk insert - v.make_enrich = True - dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] - v.bulk_enrich(dataset) - v.make_enrich = False - - - - - - - - - - - - - - \ No newline at end of file diff --git a/NeuralModels/Attention/IAttention.py b/NeuralModels/Attention/IAttention.py new file mode 100644 index 0000000..94b15f5 --- /dev/null +++ b/NeuralModels/Attention/IAttention.py @@ -0,0 +1,37 @@ +import torch +import torch.nn as nn + +class IAttention(nn.Module): + """ + Class interface for Attention unit + Args are intended as suggested. + """ + def __init__(self, *args): + """Constructor for an Attention model + + Args: + encoder_dim (int): + The number of features extracted from the image. + hidden_dim (int): + The capacity of the LSTM. + attention_dim (int): + The capacity of the Attention Model. + """ + super(IAttention, self).__init__() + + def forward(self, *args): + """Compute z_t given images and hidden state at t-1 for all the element in the batch. + + Args: + images (torch.Tensor): `(batch_dim, image_portions, encoder_dim)` + The tensor of the images in the batch. + lstm_hidden_states (torch.Tensor): `(batch_dim, hidden_dim)` + The hidden states at t-1 of the elements in the batch. + + Returns: + (Tuple[torch.Tensor,torch.Tensor]): `[(batch_dim, encoder_dim), (batch_dim, image_portions)]` + Z_t and the alphas evaluated for each portion of the image, for each image in the batch. + """ + pass + + \ No newline at end of file diff --git a/NeuralModels/Attention/SoftAttention.py b/NeuralModels/Attention/SoftAttention.py new file mode 100644 index 0000000..c50888f --- /dev/null +++ b/NeuralModels/Attention/SoftAttention.py @@ -0,0 +1,77 @@ +from re import S +import torch.nn as nn +import torch +import torchvision.models as models +from typing import Tuple + +class SoftAttention(nn.Module): + """ + Simple implementation of Bahdanau Attention model. 
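+    Given the image features a_i (one per image portion) and the previous LSTM hidden
+    state h_{t-1}, it computes (up to bias terms) e_{t,i} = w^T ReLU(W_a a_i + W_h h_{t-1}),
+    alpha_t = softmax(e_t) and z_t = sum_i alpha_{t,i} a_i, as implemented in forward().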
+ """ + + def __init__(self, encoder_dim: int , hidden_dim: int, attention_dim: int, number_of_splits: int = 7): + """Constructor for a SoftAttention model + + Args: + encoder_dim (int): + The number of features extracted from the image. + hidden_dim (int): + The capacity of the LSTM. + attention_dim (int): + The capacity of the Attention Model. + number_of_splits (int): + Number of image portions for Heigth (square resolution) + """ + super(SoftAttention, self).__init__() + + self.attention_dim = attention_dim + + self.encoder_dim = encoder_dim + + self.number_of_splits = number_of_splits + + self.image_attention_projection = nn.Linear(encoder_dim, attention_dim) + + self.lstm_hidden_state_attention_projection = nn.Linear(hidden_dim, attention_dim) + + print(f"Construction of Attention: \ + \n\t Attention dimension: {attention_dim},\ + \n\t Encoder dimension: {encoder_dim},\ + \n\t LSTM Capacity: {hidden_dim},\ + \n\t Alphas: {number_of_splits**2}") + + self.attention = nn.Linear(attention_dim, 1) + + self.ReLU = nn.ReLU() + + self.out = nn.Softmax(dim=1) + + + def forward(self, images: torch.Tensor, lstm_hidden_states: torch.Tensor) -> Tuple[torch.Tensor,torch.Tensor]: + """Compute z_t given images and hidden state at t-1 for all the element in the batch. + + Args: + images (torch.Tensor): `(batch_dim, image_portions, encoder_dim)` + The tensor of the images in the batch. + lstm_hidden_states (torch.Tensor): `(batch_dim, hidden_dim)` + The hidden states at t-1 of all the element in the batch. + + Returns: + (Tuple[torch.Tensor,torch.Tensor]): `[(batch_dim, encoder_dim), (batch_dim, image_portions)]` + Z_t and the alphas evaluated for each portion of the image, for each image in the batch. + """ + + _images_attention = self.image_attention_projection(images) # IN: (batch_dim, image_portions, encoder_dim) -> Out: (batch_dim, image_portions, attention_dim) + + _lstm_attention = self.lstm_hidden_state_attention_projection(lstm_hidden_states) # IN: (batch_dim, hidden_dim) -> Out: (batch_size, attention_dim) + + # (batch_size, image_portions, attention_dim) + (batch_size, 1, attention_dim) -> Broadcast on dim 2 -> (batch_size, image_portions, attention_dim) + _attention = self.attention(self.ReLU(_images_attention + _lstm_attention.unsqueeze(1))).squeeze(2) # IN: (batch_dim, image_portions, attention_dim) -> Out: (batch_size, image_portions) + + _alphas_t = self.out(_attention) # Out: (batch_dim, image_portions) + + # Retrieve z_t + attention_weighted_encoding = (images * _alphas_t.unsqueeze(2)).sum(dim=1) # Out: (batch_dim, encoder_dim) + + return attention_weighted_encoding, _alphas_t + \ No newline at end of file diff --git a/NeuralModels/CaRNet.py b/NeuralModels/CaRNet.py new file mode 100644 index 0000000..59dc6c5 --- /dev/null +++ b/NeuralModels/CaRNet.py @@ -0,0 +1,561 @@ +##################################################### +## +## +## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html + + +import torch +import torch.nn as nn +import torchvision.models as models +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F +from typing import Tuple,List +from .Dataset import MyDataset +from .Vocabulary import Vocabulary +from .Decoder.IDecoder import IDecoder +from .Encoder.IEncoder import IEncoder +from .Attention.IAttention import IAttention +import numpy as np +from PIL import Image +from torchvision import transforms +from torchvision.utils import save_image +import matplotlib.pyplot as plt +from 
VARIABLE import MAX_CAPTION_LENGTH +from .Metrics import Result + +class CaRNet(nn.Module): + """ + The ConvolutionalandRecurrentNet (CaRNet). + CaRNet works with a Residual NeuralNet with 50layers (ResNet50) with the last layer removed. + In CaRNet it supports 3 types of LSTM: + - vI: the features extracted from the image are provided as input with token + - vH: the features extracted from the image becames the hidden state at t_0 + - vHC: the features extracted from the image becames both the hidden and cell state at t_0 + + When it is flavoured with Attention, it becames a ConvolutionalAttentionRecurrentNet (CARNet). + CARNet works with a Residual NeuralNet with 50layers (ResNet50) with the last convolutional layer exposed. + For now support only 1 type of LSTM: + - vHC + """ + + def __init__(self, encoder: IEncoder, decoder: IDecoder, net_name: str, encoder_dim: int, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, attention: IAttention = None, attention_dim: int = 1024, device: str = "cpu"): + """Create the C[aA]RNet + + Args: + encoder (IEncoder): + The encoder to use. + + decoder (IDecoder): + The decoder to use. + + net_name (str): + Name of the Neural Network. + + encoder_dim (int): + The dimensionality of the features vector extracted from the image. + + hidden_dim (int): + The Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + Size associated to the input of the LSTM cell. + + attention (IAttention, optional): (Default is None) + The attention if Provided. + + attention_dim (int, optional): (Default is 1024) + Size of the attention layer, used only if attention is not None. + + device (str, optional): + The device on which the net does the computation. Defaults to "cpu". + """ + + super(CaRNet, self).__init__() + self.padding_index = padding_index + self.device = torch.device(device) + self.name_net = net_name + self.result_storer = Result() + # Define Encoder and Decoder + self.C = encoder(encoder_dim = encoder_dim, device = device) + self.R = None + + # Take the attention in consideration + self.attention = False + + if attention is not None: # I know..some skilled dev. will hate me for this if-else statement. Forgive ME. + self.attention = True + self.R = decoder(hidden_dim, padding_index, vocab_size, embedding_dim, device, attention(self.C.encoder_dim, hidden_dim, attention_dim)) + else: + self.R = decoder(hidden_dim, padding_index, vocab_size, embedding_dim, device) + + # Check if the Recurrent net was initialized oth. we are in error state. + if self.R is None: + raise ValueError("Could not create the Recurrent network.") + + # Send both net to the defined device -> cpu or gpu + self.C.to(self.device) + self.R.to(self.device) + + def switch_mode(self, mode: str) -> bool: + """ Change the working modality of the net among "training" or "evaluation". + + Args: + mode (str): + New mode of work, "training" | "evaluation" + + Returns: + bool: + If True the state is correctly changed, oth. not. + """ + # Q. Why no control if they already stay in the wanted state? + # A. Increase the condition may lead to more than expected case to control. 
Avoid IfElse community addicted :) + if mode == "training": + self.C.train() # switch to training state + self.R.train() + return True + + if mode == "evaluation": + self.C.eval() # switch to evaluation state + self.R.eval() + return True + return False + + def save(self, file_path: str) -> bool: + """Save the net in non-volatile memory + + Args: + file_name (str): Relative path to save the net. Ex. "home/pippo/saved" + + Returns: + bool: If True: Net saved correctly. False otherwise. + """ + try: + # Name_type_encoderdim_embeddingdim_hiddendim_attentiondim + torch.save(self.C.state_dict(), f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_C.pth") + torch.save(self.R.state_dict(), f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_R.pth") + except Exception as ex: + print(ex) + return False + return True + + def load(self, file_path: str) -> bool: + """Load the net from non-volatile memory into RAM + + Args: + file_name (str): Relative path of the net. Ex. "home/pippo/saved" + + Returns: + bool: If True: Net loaded correctly. False otherwise. + """ + + # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) + self.C.load_state_dict(torch.load(f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_C.pth", map_location=self.device)) + self.R.load_state_dict(torch.load(f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_R.pth", map_location=self.device)) + + def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: + """Provide images to the net for retrieve captions + + Args: + images (torch.tensor): `(Batch Size, Channels, Width, Height)` + The images of the batch. + + captions (torch.tensor): `(Batch Size, Max_Captions_Length)`. + ASSUMPION: The captions are padded with Token + + Returns: + (torch.tensor): `(batch_size, max_captions_length, vocab_size)` + The output of each time step from t_1 to t_N. + REMARK token is provided as output at t_0 + """ + features = self.C(images) + return self.R(features, captions) + + def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: + """Evaluate the accuracy of the Net with Jaccard Similarity. + Assumption: outputs and labels have same shape and already padded. + + Args: + outputs (torch.tensor): `(batch_dim, MAX_CAPTION_LENGTH)` + The captions generated from the net. + labels (torch.tensor): `(batch_dim, MAX_CAPTION_LENGTH)` + The Real captions. 
+ captions_length (list): + + Returns: + float: The accuracy of the Net + """ + + + # computing the accuracy with Jaccard Similarity, pytorch unique facility has bugs with cuda....it can be done "a manella" :) + # from python 3.9 you could use the package torchmetrics + # from torchmetrics import JaccardIndex + # intersection_over_union = JaccardIndex(num_classes=self.R.vocab_size).cuda() if self.device.type != "cpu" else JaccardIndex(num_classes=self.R.vocab_size) + # return intersection_over_union(outputs, labels) + outputs = np.array(list(map(lambda output: np.unique(output), outputs.cpu())), dtype=object) # Remove duplicate from each caption + labels = np.array(list(map(lambda label: np.unique(label), labels.cpu())), dtype=object) # Remove duplicate from each caption + + unions = list(map(lambda index: len(np.union1d(outputs[index],labels[index])), range(labels.shape[0]))) + intersections = list(map(lambda index: len(np.intersect1d(outputs[index],labels[index])), range(labels.shape[0]))) + return torch.mean(torch.tensor(intersections).type(torch.float)/torch.tensor(unions).type(torch.float), axis=0) + + + def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): + """Train the net + + Args: + train_set (MyDataset): + The associate training set. + + validation_set (MyDataset): + The associate validation set. + + lr (float): + The learning rate. + + epochs (int): + The number of epochs. + + vocabulary (Vocabulary): + The vocabulary associate to the Dataset + """ + + # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss + # Q. Why ignore_index is setted to instead of ? + # A. In the training, both output of the CaRNet and Target is a padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. + # And since token is hardcoded as output at t_0 and it is contained into the Target we could avoid the computation of loss on it, since will be 1. + + criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ + else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") + + # initializing some elements + best_val_acc = -1. # the best accuracy computed on the validation data + best_epoch = -1 # the epoch in which the best accuracy above was computed + + # ensuring the classifier is in 'train' mode (pytorch) + self.switch_mode("training") + + # creating the optimizer + optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) + + # loop on epochs + for e in range(0, epochs): + + # epoch stats (computed by accumulating mini-batch stats) + epoch_train_acc = 0. + epoch_train_loss = 0. 
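+            # The mini-batch loop below performs: a forward pass through the encoder C and
+            # the decoder R, the CrossEntropy loss on the pack_padded (non-padded) time steps,
+            # the doubly stochastic attention penalty sum_i (1 - sum_t alpha_{t,i})^2 when
+            # attention is enabled, the backward pass, and a no-grad accuracy check.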
+ epoch_num_train_examples = 0 + batch_id_reporter = 0 + for images,captions_ids,captions_length in train_set: + optimizer.zero_grad() + + batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated + epoch_num_train_examples += batch_num_train_examples + + # Send data to the appropriate device + images = images.to(self.device) + captions_ids = captions_ids.to(self.device) + captions_length = captions_length.to(self.device) + + # computing the network output on the current mini-batch + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + features = self.C(images) + + # Check if attention is provided, if yes the output will change accordly for fitting doubly stochastic gradient + if self.attention == False: # I know..some skilled dev. will hate me for this if-else statement. Forgive ME. + outputs, _ = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); + else: + outputs, _, alphas = self.R(features, captions_ids, captions_length) + + outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) + + targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) + + loss = criterion(outputs.data, targets.data) + + # Doubly stochastic gradient if attention is ON + if self.attention == True: + loss += float(torch.sum(( + 0.5 * torch.sum(( + (1 - torch.sum(alphas, dim=1,keepdim=True)) ** 2 # caption_length sum + ), dim=2, keepdim=True) # alpha_dim sum + ), dim=0).squeeze(1)) # batch_dim sum + + # computing gradients and updating the network weights + loss.backward() # computing gradients + optimizer.step() # updating weights + + # Training set accuracy evaluation + with torch.no_grad(): + self.switch_mode("evaluation") + + # computing the network output on the current mini-batch + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + projections = self.C(images) + + # Create a padded tensor manually + captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) + + for idx, _ in enumerate(range(projections.shape[0])): + # OUT: (1, CAPTION_LENGTH) + if self.attention == True: + _caption_no_pad, _ = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, H_portions, W_portions, encoder_dim), 1) + else: + _caption_no_pad = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, encoder_dim), 1) + # Add for each batch element the caption. 
The surplus element are already feeded with zeros + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + + + captions_output_padded = captions_output.type(torch.int32).to(self.device) # Out: (batch_dim, MAX_CAPTION_LENGTH) + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + + self.switch_mode("training") + + # printing (mini-batch related) stats on screen + print(f" mini-batch:\tloss={loss.item():.4f}, tr_acc={batch_train_acc:.5f}") + + # Store result of this batch in a dataframe + self.result_storer.add_train_info(epoch=int(e), batch_id=int(batch_id_reporter),loss=float(loss.item()),accuracy=float(batch_train_acc) ) + batch_id_reporter += 1 + # Evaluate the accuracy of the validation set + val_acc = self.eval_net(validation_set,vocabulary) + + # # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("./.saved") + + epoch_train_loss /= epoch_num_train_examples + # Store the result of the validation set in this epoch + self.result_storer.add_validation_info(epoch=int(e), accuracy=float(val_acc)) + # printing (epoch related) stats on screen + print(f"epoch={e + 1}/{epochs}:\tloss={epoch_train_loss:.4f}, tr_acc={epoch_train_acc / epoch_num_train_examples:.5f}, val_acc={val_acc:.5f}, {'BEST!' if best_epoch == e+1 else ''}") + # store data in files + self.result_storer.flush() + + def eval_net(self, data_set, vocabulary): + """ Evaluate a data set + + Args: + data_set (MyDataset): + The associate data set. + + vocabulary (Vocabulary): + The vocabulary associate to the Dataset + + Returns: + (int): + Accuracy on given dataset + """ + + self.switch_mode("evaluation") # enforcing evaluation mode + with torch.no_grad(): # keeping off the autograd engine + _images = None + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,captions_ids,captions_length in data_set: + images = images.to(self.device) + + captions_ids = captions_ids.to(self.device) + + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + projections = self.C(images) + + # Create a padded tensor manually + captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) + + for idx, _ in enumerate(range(projections.shape[0])): + # OUT: (1, CAPTION_LENGTH) + if self.attention == True: + _caption_no_pad, _ = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, H_portions, W_portions, encoder_dim), 1) + else: + _caption_no_pad = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, encoder_dim), 1) + # Add for each batch element the caption. 
The surplus element are already feeded with zeros + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + + # Pick the 1st image of the last batch for printing out the result + _image = images[0] + captions_output_padded = captions_output.type(torch.int32).to(self.device) # Out: (batch_dim, MAX_CAPTION_LENGTH) + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) + + self.eval(_image,vocabulary) + self.switch_mode("training") + + return acc + + def __generate_image_caption(self, image: torch.Tensor, vocabulary: Vocabulary, image_name: str = "caption.png"): + """ Genareate an image with caption. + + Args: + image (torch.Tensor): `(channels, height, width)` + The tensorial representation of the image in resnet50 form. + + vocabulary (Vocabulary): + The vocabulary associated to the dataset. + + image_name (str, optional): Defaults to "caption.png". + The image of the generated file + """ + self.switch_mode("evaluation") # enforcing evaluation mode + + # If Attention is on: + # Out: 1st step (batch_dim,H_portions, W_portions, encoder_dim) -> 2nd step (batch_dim, H_portions * W_portions, encoder_dim) + # Else: + # Out: (1, encoder_dim) + features = self.C(image.unsqueeze(0)) + + if self.attention == True: + caption, alphas = self.R.generate_caption(features,MAX_CAPTION_LENGTH) + else: + caption = self.R.generate_caption(features,MAX_CAPTION_LENGTH) + + # Generate image caption + caption = vocabulary.rev_translate(caption[0]) + + # Adjust the color of the image wrt the transform operation of the resnet50 + image[0] = image[0] * 0.229 + image[1] = image[1] * 0.224 + image[2] = image[2] * 0.225 + image[0] += 0.485 + image[1] += 0.456 + image[2] += 0.406 + + # Swap color channels + image = image.permute((1,2,0)) # IN: (height, width, channels) + + # If attention is ON perform the evaluation of attention over the immage + if self.attention == True: + self.__generate_image_attention(image, caption, alphas) + + plt.figure(figsize=(15, 15)) + plt.imshow(image.cpu()) + plt.title(caption) + plt.savefig("caption.png") + plt.close() + + self.switch_mode("training") + + def __generate_image_attention(self, image: torch.tensor, caption, alphas, image_name: str = "attention.png"): + """Perform the evaluation of the attention over the image. + + Args: + image (torch.Tensor): + The tensorial representation of the image. + + caption (list(str)): + The caption. + + alphas (torch.Tensor): + + image_name (str, optional): Defaults to "attention.png". + The image of the generated file + """ + self.switch_mode("evaluation") + + fig = plt.figure(figsize=(15, 15)) + _caption_len = len(caption) + for t in range(_caption_len): + # from 49 element to 7x7 + _att = alphas[t].reshape(self.R.attention.number_of_splits,self.R.attention.number_of_splits) + + # Add a subplot accordly to the word in caption position + ax = fig.add_subplot(_caption_len//2, _caption_len//2, t+1) + + ax.set_title(f"{caption[t]}", fontsize=12) + + img = ax.imshow(image.cpu()) + + # Add attention layer + ax.imshow(_att, cmap='gray', alpha=0.7, extent=img.get_extent()) + plt.tight_layout() + plt.savefig(image_name) + plt.close() + + self.switch_mode("training") + + # Inspiration is taken from this example https://www.kaggle.com/mdteach/image-captioning-with-attention-pytorch + # Thanks ABISHEK BASHYAL :) + def eval(self, image: object, vocabulary: Vocabulary): + """Evaluate an image and retrieve the associated caption. 
+ + Args: + image (PIL.Image.Image or torch.Tensor): if tensor `(channels, height, width)` + The image for which it evaluate the caption. + + vocabulary (Vocabulary): + The vocabulary. + + Raises: + ValueError: If the image is not a tensor or an image. + """ + # enforcing evaluation mode + self.switch_mode("evaluation") + + if isinstance(image, Image.Image): + operations = transforms.Compose([ + transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Crops the given image at the center. + transforms.ToTensor(), + transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]) + ]) + image = operations(image) + + if not(isinstance(image,torch.Tensor)): + raise ValueError(f"Image is not the expected type, got: {type(image)}.") + + self.__generate_image_caption(image,vocabulary) + + self.switch_mode("training") + +# Example of usage +if __name__ == "__main__": + from torch.utils.data import DataLoader + from FactoryModels import * + ds = MyDataset("./dataset", percentage=1) + v = Vocabulary(ds,reload=True) + + # Load Encoder and Decoder models + decoder = FactoryDecoder(Decoder.RNetvI) + encoder = FactoryEncoder(Encoder.CResNet50Attention) + + dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) + df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=32, + shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=32, + shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + + net = CaRNet(encoder, decoder, "CaRNetvI",1596,512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") + #net.load("CaRNetvI") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModels/Dataset.py b/NeuralModels/Dataset.py new file mode 100644 index 0000000..56c9154 --- /dev/null +++ b/NeuralModels/Dataset.py @@ -0,0 +1,269 @@ +# Typing trick for avoid circular import dependencies valid for python > 3.9 +# from __future__ import annotations +# from typing import TYPE_CHECKING +# if TYPE_CHECKING: +# from .Vocabulary import Vocabulary + +import os +import pandas as pd +import torch +from torch.utils.data import Dataset +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms +from VARIABLE import MAX_CAPTION_LENGTH, IMAGES_SUBDIRECTORY_NAME, CAPTION_FILE_NAME +from typing import Tuple, List, Iterable + + +class MyDataset(Dataset): + """ + Wrapper of Dataset Pytorch Object. + For our scopes the dataset folder must follow this rule: + + 1) As a child of the directory, we must have a csv named `CAPTION_FILE_NAME` that follow this pattern:\n + `image_name| comment_number| comment`\n + Example: 1000092795.jpg| 0| Two young guys with shaggy hair look at their hands while hanging out in the yard . 
+ + 2) As brother of the csv file we must have the folder of the images, the directory name is a variable `IMAGES_SUBDIRECTORY_NAME` + + Assumption: + + 1) The dataset will pick only the caption less then the variable `MAX_CAPTION_LENGTH` + + """ + image_trasformation_parameter = { + "crop":{ + "size": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str , percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files or from a preprocessed dataset. + + Args: + directory_of_data (str, mandatory): + The directory tagged as root for the dataset. + + percentage (int, optional): Default is 100. + The percentage of row that we want store in our object. + + already_computed_dataframe (pd.DataFrame, Optional): Default is None. + If the dataset is computed outside put it there. + REMARK Please follow the rule: + | Index | image_name | Caption |\n + |:-----:|:----------:|:--------------------:|\n + | 0 | pippo.jpg | ["i","like","pizza"] |\n + + Raises: + ValueError: if the dataset directory is invalid (Not Exist, Not a directory). + """ + + # If the constructor receive a dataframe, we assume that it is already manipulated for doing our operation, no further op. needed. + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + # Input checking + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + self.directory_of_data = directory_of_data + + # Load the dataset + _temp_dataset: pd.DataFrame = pd.read_csv(f"{directory_of_data}/{CAPTION_FILE_NAME}", sep="|", skipinitialspace=True)[["image_name","comment"]] + + # Split every caption in its words. + _temp_dataset["comment"] = _temp_dataset["comment"].apply( lambda comment: re.findall("[\\w]+|\.|\,",str(comment).lower())) + + # Filter for retrieve only caption with a length less than MAX_CAPTION_LENGTH length + _temp_dataset = _temp_dataset[ _temp_dataset["comment"].map(len) <= MAX_CAPTION_LENGTH] + + # Pick only a given percentage of the row in the dataset + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + + def get_fraction_of_dataset(self, percentage: int, delete_transfered_from_source: bool = False): + """Get a fraction of the dataset + + Args: + percentage (int): + The percentage of row that we want store in our new object. + + delete_transfered_from_source (bool, optional): Defaults to False. + Tell if you want to delete the row in the source object that are transfered to the new object. + + Returns: + (MyDataset): + The new computed dataset object. + """ + # Retrieve the number of rows + _temp_df_moved: pd.DataFrame = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + + # Deep copy of the dataframe + _temp_df_copy = _temp_df_moved.copy() + + # If delete_transfered_from_source == True delete the rows in the source object. + if delete_transfered_from_source: + self._dataset: pd.DataFrame = self._dataset.drop(_temp_df_copy.index) + + # Return a fresh MyDataset object. 
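+        # The new dataset shares directory_of_data with the source but owns a copy of the
+        # selected rows, so later drops on either object do not affect the other.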
+ return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self) -> List[str]: + """Return all the words in each caption of the dataset as a big list of strings (No Repetition). + + Returns: + (List[str]): All the words in the dataset. + """ + words = [] + # Iterate over each sample in the dataset. + for idx,row in self._dataset.iterrows(): + for word in row["comment"]: + if word not in words: + words.append(word) + return words + + def __len__(self) -> int: + """Evaluate the length of the dataset. + The length is the number of rows in the dataset. + + Returns: + int: The legth of the dataset. + + """ + return self._dataset.shape[0] + + def __getitem__(self, idx: int) -> Tuple[Image.Image, List[str]]: + """Get the associated image and caption of a given index. + + Args: + idx (int): + The index associated univocally to a row of the dataset. + + Returns: + (Tuple[Image.Image, List[str]]): + Image and caption of the input index. + """ + image: Image.Image = Image.open(f"{self.directory_of_data}/{IMAGES_SUBDIRECTORY_NAME}/{self._dataset.iloc[idx]['image_name']}").convert('RGB') + caption: List[str] = self._dataset.iloc[idx]["comment"] + + return image, caption + + # For python > 3.9 -> def pack_minibatch_training(self, data: List[Tuple[Image.Image, List[str]]], vocabulary: Vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def pack_minibatch_training(self, data: List[Tuple[Image.Image, List[str]]], vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Custom method for packing a mini-batch for training. + + Args: + data (List[Tuple[image.Image, List[str]]]): + A list of tuples coming from the calls of the __getitem__ method. + + vocabulary (Vocabulary): + Vocabulary associated to the dataset. + + Returns: + (Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor + ]): [`(batch_dim, channels, height, width)`, `(batch_dim,min(MAX_CAPTION_LENGTH,captions[0]))`, `(batch_dim)`] + + Tuple[0]: The images of the mini-batch converted to Tensor. + Tuple[1]: The caption of each image the mini-batch, the dim 2 depends on the maximum caption length inside the batch. + Tuple[2]: The length of each caption +2 for and token. + """ + # Sort the data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + # Type annotation for zip extraction, no clear way to determine type with this kind of built-in method in a pythonic way. + images: List[Image.Image] = images + captions: List[List[str]] = captions + + # Trasnform the images from PIL.Image into a pytorch.Tensor + operations = transforms.Compose([ + transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"],MyDataset.image_trasformation_parameter["crop"]["size"])), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0.3), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) # Out: List[(channels, height, width)] + # Merge images (from list of 3D tensor to a tensor). 
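+        # With the 224x224 resize above, the stacked tensor has shape (batch_dim, 3, 224, 224).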
+        images = torch.stack(images, 0) # Out: (batch_dim, channels, height, width)
+
+        # Evaluate the captions.
+        # Q. Why +2?
+        # A. For the <SOS> and <EOS> tokens.
+        captions_length = torch.tensor([len(caption)+2 for caption in captions]) # Out: (batch_dim)
+
+        # From words to vocabulary ids, adding the <SOS> id at the beginning and the <EOS> id at the end.
+        captions = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        # Pad the captions with zeros, i.e. the <PAD> id.
+        captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) # Out: (batch_dim, min(MAX_CAPTION_LENGTH, captions[0]))
+
+        return images, captions.type(torch.LongTensor), captions_length.type(torch.int32)
+
+    # For python > 3.9 -> def pack_minibatch_evaluation(self, data: List[Tuple[Image.Image, List[str]]], vocabulary: Vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def pack_minibatch_evaluation(self, data: List[Tuple[Image.Image, List[str]]], vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Custom method for packing a mini-batch for evaluation.
+
+        Args:
+            data (List[Tuple[Image.Image, List[str]]]):
+                A list of tuples coming from the calls of the __getitem__ method.
+
+            vocabulary (Vocabulary):
+                Vocabulary associated to the dataset.
+
+        Returns:
+            (Tuple[
+                torch.Tensor,
+                torch.Tensor,
+                torch.Tensor
+            ]): [`(batch_dim, channels, height, width)`, `(batch_dim, min(MAX_CAPTION_LENGTH, captions[0]))`, `(batch_dim)`]
+
+                Tuple[0]: The images of the mini-batch converted to Tensor.
+                Tuple[1]: The caption of each image in the mini-batch; dim 2 depends on the maximum caption length inside the batch.
+                Tuple[2]: The length of each caption, +2 for the <SOS> and <EOS> tokens.
+        """
+        # Sort the data list by caption length (descending order).
+        data.sort(key=lambda x: len(x[1]), reverse=True)
+
+        images, captions = zip(*data)
+
+        # Type annotation for zip extraction, since there is no clear way to infer the type of this built-in in a pythonic way.
+        images: List[Image.Image] = images
+        captions: List[List[str]] = captions
+
+        # Transform the images from PIL.Image into torch.Tensor
+        operations = transforms.Compose([
+            transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Resize the given image.
+            transforms.ToTensor(),
+            transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"])
+        ])
+
+        images = list(map(lambda image: operations(image), list(images))) # Out: List[(channels, height, width)]
+        # Merge images (from list of 3D tensors to a single tensor).
+        images = torch.stack(images, 0) # Out: (batch_dim, channels, height, width)
+
+        # Evaluate the captions.
+        # Q. Why +2?
+        # A. For the <SOS> and <EOS> tokens.
+        captions_length = torch.tensor([len(caption)+2 for caption in captions]) # Out: (batch_dim)
+
+        # From words to vocabulary ids, adding the <SOS> id at the beginning and the <EOS> id at the end.
+        captions = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        # Pad the captions with zeros, i.e. the <PAD> id.
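+        # pad_sequence pads every caption up to the longest caption in this mini-batch;
+        # since the data is sorted by caption length, that is len(captions[0]) + 2.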
+ captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) # Out: (batch_dim,min(MAX_CAPTION_LENGTH,captions[0])) + + + return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) + \ No newline at end of file diff --git a/NeuralModels/Decoder/IDecoder.py b/NeuralModels/Decoder/IDecoder.py new file mode 100644 index 0000000..1ff83da --- /dev/null +++ b/NeuralModels/Decoder/IDecoder.py @@ -0,0 +1,79 @@ +##### INTERFACE CLASS DON'T USE IT (You at most use only as Type Hint), JUST READ IT. +################################################################ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class IDecoder(nn.Module): + """ + Class interface for a LSTM unit + Args are intended as suggested. + """ + + def __init__(self, *args): + """Define the interface of a generic constructor for the Decoder Net. + + Args (Suggested): + + hidden_dim (int): + The Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(IDecoder, self).__init__() + + def forward(self, *args) -> Tuple[torch.Tensor, List[int]]: + """Interface for the forward operation of the RNN. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + + caption_length (list(int)): + The length of each caption in the batch. + + Returns: `[(batch_size, max_captions_length, vocab_size), list(int)]` + + (torch.Tensor): The hidden state of each time step from t_1 to t_N. + + (list(int)): The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + pass + + def generate_caption(self, *args) -> torch.Tensor: + """ Interface for generate a caption + + Args (Suggested): + + images (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + pass \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvH.py b/NeuralModels/Decoder/RNetvH.py new file mode 100644 index 0000000..5724a79 --- /dev/null +++ b/NeuralModels/Decoder/RNetvH.py @@ -0,0 +1,154 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvH(nn.Module): + """ + Class implementing LSTM unit with Hidden state initialized with custom features vector and Cell state initialized with ZEROS. + """ + + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. 
+ + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvH, self).__init__() + + print(f"Construction of RNetvH:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + REMARK Each caption is in the full form: + .... + + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + # Check if encoder_dim and self.hidden_dim are equal, assert by construction + if images.shape[1] != self.hidden_dim: + raise ValueError("The dimensionality of the encoder output is not equal to the dimensionality of the hidden state.") + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, embedding_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: (batch_dim, max_captions_length, embedding_dim) -> Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = ( images, torch.zeros((captions.shape[0],self.hidden_dim)).to(self.device)) # In: ((batch_dim, hidden_dim),(batch_dim, hidden_dim)) -> Out ((batch_dim, hidden_dim), (batch_dim, hidden_dim)) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? 
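# --- Illustrative sketch, not from the repository: the unrolled LSTMCell pattern used in the loop below. ---
# Minimal standalone example with made-up sizes: at each time step the cell receives the embedding
# of the current word plus the previous (h, c), and a linear layer maps h to vocabulary-sized logits.
import torch
import torch.nn as nn

batch_dim, steps, embedding_dim, hidden_dim, vocab_size = 2, 5, 8, 16, 30
cell = nn.LSTMCell(embedding_dim, hidden_dim)
to_vocab = nn.Linear(hidden_dim, vocab_size)

inputs = torch.randn(batch_dim, steps, embedding_dim)   # embedded captions
h = torch.randn(batch_dim, hidden_dim)                  # e.g. the image features (RNetvH style)
c = torch.zeros(batch_dim, hidden_dim)
logits_per_step = []
for t in range(steps):
    h, c = cell(inputs[:, t, :], (h, c))                # one time step
    logits_per_step.append(to_vocab(h))                 # (batch_dim, vocab_size)
outputs = torch.stack(logits_per_step, dim=1)           # (batch_dim, steps, vocab_size)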
+ # For each time step t \in {0, N-1}, where N is the caption length + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embedding_dim) + with torch.no_grad(): + _h ,_c = ( image, torch.zeros((1,self.hidden_dim)).to(self.device)) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # Out : ((1, 1, hidden_dim) , (1, 1, hidden_dim)) + outputs = self.linear_1(_h) # Out: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # Out: (1, embeddings_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvHC.py b/NeuralModels/Decoder/RNetvHC.py new file mode 100644 index 0000000..449434a --- /dev/null +++ b/NeuralModels/Decoder/RNetvHC.py @@ -0,0 +1,152 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvHC(nn.Module): + """ + Class implementing LSTM unit with Cell and Hidden state initialized with custom features vector + """ + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvHC, self).__init__() + + print(f"Construction of RNetvH:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. 
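# --- Illustrative sketch, not from the repository: what padding_idx does in nn.Embedding. ---
# With padding_idx set, the embedding row reserved for the padding id stays at zero and
# receives no gradient, so padded positions contribute nothing to the loss.
import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=10, embedding_dim=4, padding_idx=0)
ids = torch.tensor([[1, 5, 0, 0]])   # a caption right-padded with id 0
vectors = emb(ids)                   # shape (1, 4, 4); vectors[0, 2:] are all zeros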
+ self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + # Check if encoder_dim and self.hidden_dim are equal, assert by construction + if images.shape[1] != self.hidden_dim: + raise ValueError("The dimensionality of the encoder output is not equal to the dimensionality of the hidden state.") + + # Retrieve batch size + batch_dim = images.shape[0] + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: (batch_dim, max_captions_length, embedding_dim) -> Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = (images, images) # In: ((batch_dim, hidden_dim),(batch_dim, hidden_dim)) -> Out ((batch_dim, hidden_dim), (batch_dim, hidden_dim)) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? 
+ # For each time step t \in {0, N-1}, where N is the caption length + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embedding_dim) + with torch.no_grad(): + _h ,_c = (image,image) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # Out : ((1, hidden_dim) , (1, hidden_dim)) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # Out: (1, embedding_dim) + input = input.to(torch.device(self.device)) # Out: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvHCAttention.py b/NeuralModels/Decoder/RNetvHCAttention.py new file mode 100644 index 0000000..f2786f9 --- /dev/null +++ b/NeuralModels/Decoder/RNetvHCAttention.py @@ -0,0 +1,200 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List +from ..Attention.IAttention import IAttention + + +class RNetvHCAttention(nn.Module): + """ + Class implementing LSTM unit with Attention model + """ + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu", attention: IAttention = None): + """Define the constructor for the RNN Net + + Args: + hidden_dim (int): + The Capacity of the LSTM Cell. + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + vocab_size (int)): + The size of the vocabulary associated to the dataset. + embedding_size (int): + The number of dimension associated to the input of the LSTM cell. + device (str, optional): Default "cpu" + The device on which the operations will be performed. 
+ """ + super(RNetvHCAttention, self).__init__() + + print(f"Construction of RNetvHCAttention:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Attention Dimension: {attention.attention_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + # Embedding layer that turns words into a vector of a specified size + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + self.attention = attention + + self.attention_dim = attention.attention_dim + + self.encoder_dim = attention.encoder_dim + + self.vocab_size = vocab_size + + self.hidden_dim = hidden_dim + + # The initial memory state and hidden state of the LSTM are predicted by an average of the annotation vectors fed through two separate MLPs (init,c and init,h): + self.h_0 = nn.Linear(self.encoder_dim, hidden_dim) + self.c_0 = nn.Linear(self.encoder_dim, hidden_dim) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(self.encoder_dim + embedding_dim, hidden_dim) + + + # The linear layer that maps the hidden state output dimension + # to the number of words we want as output, vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + # the soft attention model predicts a gating scalar β from previous hidden state ht_1 at each time step t + # Par. 4.2.1 + self.f_beta = nn.Linear(hidden_dim, self.encoder_dim) + self.sigmoid = nn.Sigmoid() + + def init_h_0_c_0(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Init hidden and cell state at t_0 + + Args: + images (torch.Tensor): `(batch_dim, H_portions * W_portions, encoder_dim)` + The images coming from the encoder. + + Returns: + (torch.Tensor, torch.Tensor): `[(batch_dim, hidden_dim), (batch_dim, hidden_dim)]` + Hiddent state and cell state ready for the 1st input + """ + images = images.mean(dim=1) # Dim=0 -> batch_dim, Dim=1 -> H_portions * W_portions, Dim=2 -> encoder_dim + return self.h_0(images), self.c_0(images) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int], torch.Tensor]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args: + images (torch.Tensor): `(batch_dim, H_portions, W_portions, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each element of the batch. + REMARK Each caption is in the full form: + .... + + REMARK The Tensor is padded with zeros + + caption_length ([int]): + The length of each caption in the batch. + + Returns: + (torch.Tensor): `(batch_dim, max_captions_length, vocab_size)` + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the inputs of the LSTM. + + (torch.Tensor): `(batch_dim, max_captions_length, alphas)` + All the alphas evaluated over timestep t (from t_0 to t_{N-1}), for each image in the batch. 
+ """ + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, H_portions, W_portions, encoder_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: Out: (batch_dim, captions length, embedding_dim) + + + # Initialize the hidden state and the cell state at time t_{-1} + images = images.reshape(batch_dim,-1, images.shape[3]) # Out: (batch_dim, H_portions * W_portions, encoder_dim) + _h, _c = self.init_h_0_c_0(images) # _h : (batch_dim, hidden_dim), _c : (batch_dim, hidden_dim) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size).unsqueeze(0) + start[0][1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Tensor for storing alphas at each timestep t, structure (batch_dim, MaxN, number_of_splits^2) -> number_of_splits intended for a single Measure like Heigth and assuming square images + alphas_t = torch.zeros((batch_dim,inputs.shape[1],self.attention.number_of_splits**2)).to(self.device) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + + for idx in range(0,inputs.shape[1]): + attention_encoding, alphas_t_i = self.attention(images, _h) # Out: attention_encoding->(batch_dim,encoder_dim), alphas_t_i->(batch_dim, number_of_splits) + gate = self.sigmoid(self.f_beta(_h)) # IN: (batch_dim, hidden_dim) -> Out: (batch_dim, encoder_dim) + attention_encoding = gate * attention_encoding # Gating z_t + alphas_t[:,idx,:] = alphas_t_i + _h, _c = self.lstm_unit(torch.cat([inputs[:,idx,:], attention_encoding], dim=1), (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch + + return outputs, list(map(lambda length: length-1, captions_length)),alphas_t + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, H_portions, W_portions, encoder_dim)` + The image. + + captions_length (int): + The length of the caption. + + Returns: + + (torch.Tensor): + The caption associated to the image given. + It includes at t_0 by default. 
+ + (torch.Tensor): + The alphas evaluated at each time t + + """ + + sampled_ids = [torch.Tensor([1]).type(torch.int64).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) + alphas = torch.zeros(captions_length, self.attention.number_of_splits **2) # Out: (MaxCaptionLength, number_of_splits) + with torch.no_grad(): + image = image.reshape(1,-1, image.shape[3]) # Out: (1, H_portions * W_portions, encoder_dim) + _h, _c = self.init_h_0_c_0(image) + for idx in range(captions_length-1): + attention_encoding, alphas[idx,:] = self.attention(image, _h) + gate = self.sigmoid(self.f_beta(_h)) # IN: (1, hidden_dim) -> Out: (1, encoder_dim) + attention_encoding = gate * attention_encoding # Gating z_t + _h, _c = self.lstm_unit(torch.cat([input,attention_encoding], dim=1), (_h ,_c)) # _h: (1, hidden_dim) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # In: (1, embedding_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids, alphas \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvI.py b/NeuralModels/Decoder/RNetvI.py new file mode 100644 index 0000000..786ae80 --- /dev/null +++ b/NeuralModels/Decoder/RNetvI.py @@ -0,0 +1,152 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvI(nn.Module): + """ + Class implementing LSTM unit with Cell and Hidden state initialized at ZEROS and features coming from external as 1st input + """ + + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvI, self).__init__() + + print(f"Construction of RNetvI:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. 
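# --- Illustrative sketch, not from the repository: the greedy decoding rule used in generate_caption. ---
# Toy logits; softmax followed by max(1) returns (probabilities, ids), and decoding stops as soon
# as the predicted id equals the end-of-string id (2 in this vocabulary).
import torch
import torch.nn.functional as F

logits = torch.tensor([[0.1, 0.2, 3.0, 0.05]])      # (1, vocab_size), toy values
probs, predicted = F.softmax(logits, dim=1).max(1)  # predicted -> tensor([2])
done = bool(predicted.item() == 2)                  # True: the end-of-string id was produced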
+ input of the LSTM cell for each time step: + t_{-1}: features vector + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, embedding_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = self.lstm_unit(images) # _h : (batch_dim, hidden_dim), _c : (batch_dim, hidden_dim) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. 
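# --- Illustrative note, not from the repository: calling an LSTMCell without a state. ---
# When no (h, c) pair is passed, PyTorch initialises both to zeros, which is why the "vI" variant
# can feed the image features as the very first input and still start from a zero state.
import torch
import torch.nn as nn

cell = nn.LSTMCell(16, 32)
features = torch.randn(4, 16)   # (batch_dim, encoder_dim == input size of the cell)
h, c = cell(features)           # equivalent to cell(features, (zeros, zeros))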
+ """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embeddings_dim) + with torch.no_grad(): + _h ,_c = self.lstm_unit(image) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_dim) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # inputs: (1, embedding_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids diff --git a/NeuralModels/Encoder/CResNet50.py b/NeuralModels/Encoder/CResNet50.py new file mode 100644 index 0000000..09ffdf9 --- /dev/null +++ b/NeuralModels/Encoder/CResNet50.py @@ -0,0 +1,55 @@ +import torch.nn as nn +import torch +import torchvision.models as models + +class CResNet50(nn.Module): + """ + Encoder Built with a resnet50 with the last layer removed. + """ + + def __init__(self, encoder_dim: int, device: str = "cpu"): + """Constructor of the Encoder + + Args: + encoder_dim (int): + The dimensionality of the features vector extracted from the image + + device (str, optional): Default "cpu". + The device on which the operations will be performed. + """ + super(CResNet50, self).__init__() + + self.encoder_dim = encoder_dim + self.device = torch.device(device) + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): # Freezing weights + param.requires_grad_(False) + + print(f"Construction of CResNet50:\n \ + Encoder dimension: {encoder_dim},\n \ + Device: {device}") + + modules = list(resnet.children())[:-1] # remove last fc layer, expose the GlobalAveragePooling + self.resnet = nn.Sequential(*modules) + + self.linear = nn.Linear(resnet.fc.in_features, encoder_dim) # define a last fc layer + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """Forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, channels, heigth, width)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, encoder_dim)` + Features Projection for each image in the batch. + + """ + + features = self.resnet(images) # Out: (batch_dim, 2048, 1, 1), 2048 is a Design choice of ResNet50 of last conv.layer. + + features = features.reshape(features.size(0), -1).to(self.device) + features = self.linear(features) # In: (batch_dim, 2048) + + return features \ No newline at end of file diff --git a/NeuralModels/Encoder/CResNet50Attention.py b/NeuralModels/Encoder/CResNet50Attention.py new file mode 100644 index 0000000..06949af --- /dev/null +++ b/NeuralModels/Encoder/CResNet50Attention.py @@ -0,0 +1,57 @@ +import torch.nn as nn +import torch +import torchvision.models as models + +class CResNet50Attention(nn.Module): + def __init__(self, encoder_dim: int, number_of_splits: int = 7, device: str = "cpu"): + """Constructor of the Encoder NN + + Args: + encoder_dim (int): Unused. Internally resnet with 2 layers removed represent each pixel in a vector of 2048 components. + + number_of_splits (int): + How many pieces do you want to split the images, for border. + Examples: + number_of_splits = 7 -> The images will be splitted into 49 pieces (7x7). + + device (str, optional): Default "cpu". 
+ The device on which the operations will be performed. + """ + super(CResNet50Attention, self).__init__() + + self.device = torch.device(device) + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): # Freezing weights + param.requires_grad_(False) + + modules = list(resnet.children())[:-2] # Expose the last convolutional layer. 2048 Filters of size 1x1. Output of the ConvLayer -> (H_in/32,W_in/32,2048) + + self.encoder_dim = 2048 + + self.number_of_splits = number_of_splits + + print(f"Construction of CResNet50Attention:\n \ + Encoder dimension: {self.encoder_dim},\n \ + Number of splits: {number_of_splits**2},\n \ + Device: {device}") + # Q. Why (H_in/32, W_in/32) + # A. Due to the resnet50 implementation, each convolutional layer will reduce the dimensionality of Heigth and Width by 2 times. + + self.resnet = nn.Sequential(*modules) + + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """Forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, Channels, Width, Height)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, H_splits, W_splits, encoder_dim)` + Features Projection Tensor + """ + + features = self.resnet(images) # Out: (batch_dim, 2048,Heigth/32, Width/32) + features = features.permute(0, 2, 3, 1) # (batch_dim, H_splits, W_splits, 2048) + return features \ No newline at end of file diff --git a/NeuralModels/Encoder/IEncoder.py b/NeuralModels/Encoder/IEncoder.py new file mode 100644 index 0000000..745af14 --- /dev/null +++ b/NeuralModels/Encoder/IEncoder.py @@ -0,0 +1,35 @@ +##### INTERFACE CLASS DON'T USE IT (You at most use only as Type Hint), JUST READ IT. +################################################################ + +import torch.nn as nn +import torch +import torchvision.models as models + +class IEncoder(nn.Module): + """ + Interface for a generic Encoder + """ + def __init__(self, *args): + """Constructor of the Encoder NN + + Args: + encoder_dim (int): + The dimensionality of the features vector extracted from the image + + device (str, optional): Default "cpu". + The device on which the operations will be performed. + """ + super(IEncoder, self).__init__() + + def forward(self, *args) -> torch.Tensor: + """Interface of forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, channels, heigth, width)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, encoder_dim)` + Features Projection for each image in the batch. + """ + pass \ No newline at end of file diff --git a/NeuralModels/FactoryModels.py b/NeuralModels/FactoryModels.py new file mode 100644 index 0000000..f39d9e0 --- /dev/null +++ b/NeuralModels/FactoryModels.py @@ -0,0 +1,183 @@ +from .Encoder.CResNet50 import CResNet50 +from .Encoder.CResNet50Attention import CResNet50Attention +from .Decoder.RNetvHC import RNetvHC +from .Decoder.RNetvI import RNetvI +from .Decoder.RNetvH import RNetvH +from .Decoder.RNetvHCAttention import RNetvHCAttention +from .CaRNet import CaRNet +from .Attention.SoftAttention import SoftAttention +from enum import Enum + +# Open source is a development methodology; free software is a social movement. +# - Richard Stallman + +####### How to continue implementation? +## Everyone is free of enrich this library, remember to follow the IInterface.py for each type of Elements +## At the end of your code session, add your element to the factory. +## Follow the rule: + +class Attention(Enum): + """ + Attention type list. 
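# --- Illustrative sketch, not from the repository: shapes after stripping ResNet-50 layers, as in the two encoders above. ---
# Dropping only the final fc layer keeps the global average pooling (one 2048-d vector per image),
# while dropping the last two layers exposes the final feature map (a 7x7 grid of 2048-d region
# vectors for a 224x224 input).
import torch
import torch.nn as nn
import torchvision.models as models

resnet = models.resnet50(pretrained=True)
pooled = nn.Sequential(*list(resnet.children())[:-1]).eval()   # as in CResNet50
grid = nn.Sequential(*list(resnet.children())[:-2]).eval()     # as in CResNet50Attention

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    print(pooled(x).shape)   # torch.Size([1, 2048, 1, 1])
    print(grid(x).shape)     # torch.Size([1, 2048, 7, 7])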
+ """ + Attention = "Attention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Attention[s] + except KeyError: + return s + +def FactoryAttention(attention: Attention): + """ Attention Factory + + Args: + attention (Attention): + The expected attention to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IAttetion): + A Class reference + """ + if attention == Attention.Attention: + return SoftAttention + raise NotImplementedError("This attention model is not implemented yet") + +#################################################################### + +class Encoder(Enum): + """ + Encoder type list. + """ + CResNet50 = "CResNet50" + CResNet50Attention = "CResNet50Attention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Encoder[s] + except KeyError: + return s + +def FactoryEncoder(encoder: Encoder): + """ Encoder Factory + + Args: + encoder (Encoder): + The expected encoder to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IEncoder): + A Class reference + """ + if encoder == Encoder.CResNet50: + return CResNet50 + if encoder == Encoder.CResNet50Attention: + return CResNet50Attention + raise NotImplementedError("This encoder is not implemented yet") + +#################################################################### + +class Decoder(Enum): + """ + Decoder type list. + """ + RNetvI = "RNetvI" + RNetvH = "RNetvH" + RNetvHC = "RNetvHC" + RNetvHCAttention = "RNetvHCAttention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Decoder[s] + except KeyError: + return s + +def FactoryDecoder(decoder: Decoder): + """ Decoder Factory + + Args: + decoder (Decoder): + The expected decoder to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IDecoder): + A Class reference + """ + if decoder == decoder.RNetvI: + return RNetvI + if decoder == decoder.RNetvH: + return RNetvH + if decoder == decoder.RNetvHC: + return RNetvHC + if decoder == decoder.RNetvHCAttention: + return RNetvHCAttention + raise NotImplementedError("This decoder is not implemented yet") + +##################################################################### + +class NeuralNet(Enum): + """ + NeuralNet type list. + """ + CaRNet = "CaRNet" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return NeuralNet[s] + except KeyError: + return s + +def FactoryNeuralNet(net: NeuralNet): + """ NeuralNet Factory + + Args: + net (NeuralNet): + The expected neural net to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. 
+ + Returns: + (NeuralNet): + A Class reference + """ + if net == NeuralNet.CaRNet: + return CaRNet + raise NotImplementedError("This neural net is not implemented yet") \ No newline at end of file diff --git a/NeuralModels/Metrics.py b/NeuralModels/Metrics.py new file mode 100644 index 0000000..32b7185 --- /dev/null +++ b/NeuralModels/Metrics.py @@ -0,0 +1,58 @@ +import pandas as pd + +class Result(): + """ + Class for storing result of the net + """ + def __init__(self): + """Constructor of the class + """ + self.train_results = pd.DataFrame([], columns=["Epoch", "IDBatch", "Loss", "Accuracy"]) + convert_dict = {'Epoch': int, + "IDBatch": int, + "Loss":float, + 'Accuracy': float + } + self.train_results = self.train_results.astype(convert_dict) + + convert_dict = {'Epoch': int, + 'Accuracy': float + } + self.validation_results = pd.DataFrame([], columns=["Epoch", "Accuracy"]) + self.validation_results = self.validation_results.astype(convert_dict) + + def add_train_info(self, epoch: int, batch_id: int, loss: float, accuracy: float): + """Add a row to the dataframe of the training set info + + Args: + epoch (int): + The epoch. + batch_id (int): + The id of the batch. + loss (float): + Loss of the given epoch-batch. + accuracy (float): + Accuracy of the given epoch-batch. + """ + self.train_results = self.train_results.append({"Epoch":epoch,"IDBatch": batch_id, "Loss": loss, "Accuracy":accuracy}, ignore_index=True) + + def add_validation_info(self, epoch: int, accuracy: float): + """Add a row to the dataframe of the validation set info + + Args: + epoch (int): + The epoch. + accuracy (float): + Accuracy of the given epoch-batch. + """ + self.validation_results = self.validation_results.append({"Epoch":epoch, "Accuracy":accuracy}, ignore_index=True) + + def flush(self, directory: str = "."): + """Flush the dataframes to non-volatile memory in a csv format + + Args: + directory (str, optional): Defaults to ".". + The directory to store the files as csv. + """ + self.train_results.to_csv(f'{directory}/train_results.csv', encoding='utf-8', index=False) + self.validation_results.to_csv(f'{directory}/validation_results.csv', encoding='utf-8', index=False) \ No newline at end of file diff --git a/NeuralModels/Vocabulary.py b/NeuralModels/Vocabulary.py new file mode 100644 index 0000000..df41b16 --- /dev/null +++ b/NeuralModels/Vocabulary.py @@ -0,0 +1,170 @@ +# Typing trick for avoid circular import dependencies valid for python > 3.9 +# from __future__ import annotations +# from typing import TYPE_CHECKING +# if TYPE_CHECKING: +# from .Dataset import MyDataset + +import torch +from typing import List +import os +import pickle + +class Vocabulary(): + """ + Implementation of the vocabulary. 
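# --- Illustrative note, not from the repository: DataFrame.append in the Result class above assumes an older pandas. ---
# pandas.DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; an equivalent,
# version-proof way to add one row is pd.concat with a single-row frame.
import pandas as pd

results = pd.DataFrame([], columns=["Epoch", "IDBatch", "Loss", "Accuracy"])
row = {"Epoch": 1, "IDBatch": 0, "Loss": 2.31, "Accuracy": 0.12}
results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)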
+ + Assumption: + + 1) The vocabulary is enriched with 4 special words:\n + : Padding ------> ID: 0\n + : Start Of String ------> ID: 1\n + : End Of String ------> ID: 2\n + : Out of vocabulary word ------> ID: 3\n + + Example: I Love Pizza -> Translate into ids -> 1 243 5343 645655 2 0 0 + """ + + + def __init__(self, source_dataset = None): # for python > 3.9 -> def __init__(self, source_dataset: MyDataset): + """Vocabulary constructor + + Args: + source_dataset (MyDataset): + The source Dataset, if None try to load a vocabulary from the hidden .saved folder + """ + + if source_dataset is None: + print("Try to load the vocabulary from file..") + + if not os.path.exists(".saved/word2id.pickle") or not os.path.exists(".saved/embeddings.pickle"): + raise FileNotFoundError("You request a loading from file but the file doesn't exist, first generate the vocabulary!") + + with open('.saved/word2id.pickle', 'rb') as word2id, open('.saved/embeddings.pickle', 'rb') as embeddings: + self.word2id = pickle.load(word2id) + self.embeddings = pickle.load(embeddings) + self.dictionary_length = len(self.word2id.keys()) + return + + # Load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + START + END + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + START + END + UNK) | DIM2: Dict Rows +4 flavored token (PAD + START + END + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + # Identiry matrix == 1-hot vector :) + self.embeddings = torch.eye(self.dictionary_length) + + with open('.saved/word2id.pickle', 'wb') as word2id, open('.saved/embeddings.pickle', 'wb') as embeddings: + pickle.dump(self.word2id, word2id, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(self.embeddings, embeddings, protocol=pickle.HIGHEST_PROTOCOL) + + def predefined_token_idx(self) -> dict: + """Return the predefined token indexes. + + Returns: + dict: The token dictionary + """ + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (list(str)): + The sequence of words to translate + + type (str, optional): Default is complete + The type of translation. + + Returns: + (torch.Tensor): `(1,caption_length)` + The caption in IDs form. + `if` complete: <1> + ...Caption... + <2> + `else`: <1> + ...Caption... + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... 
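# --- Illustrative sketch, not from the repository: 1-hot embeddings via an identity matrix, as used by this Vocabulary. ---
# Toy vocabulary of 6 ids (4 special tokens plus 2 words): row i of torch.eye is the 1-hot vector
# of word id i, so looking up an embedding is just indexing the identity matrix.
import torch

dictionary_length = 6
embeddings = torch.eye(dictionary_length)   # (6, 6)
word_id = 4                                 # first "real" word after the special tokens
one_hot = embeddings[word_id]               # tensor([0., 0., 0., 0., 1., 0.])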
+ + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + words_id (torch.Tensor): `(1,caption_length)` + The sequence of IDs. + Returns: + (List(str)): + The caption in words form. + """ + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + pippo = MyDataset(...) + v = Vocabulary(source_dataset=pippo) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + + + + + + + + + + + + + + \ No newline at end of file diff --git a/NeuralModel/__init__.py b/NeuralModels/__init__.py similarity index 100% rename from NeuralModel/__init__.py rename to NeuralModels/__init__.py diff --git a/VARIABLE.py b/VARIABLE.py new file mode 100644 index 0000000..6319683 --- /dev/null +++ b/VARIABLE.py @@ -0,0 +1,9 @@ +# Dataset Variable +MAX_CAPTION_LENGTH = 15 +IMAGES_SUBDIRECTORY_NAME = "flickr30k_images" +CAPTION_FILE_NAME = "results.csv" + +# Vocabulary +EMBEDDINGS_REPRESENTATION = "1-HOT" + + diff --git a/attention.png b/attention.png new file mode 100644 index 0000000..8bb154b Binary files /dev/null and b/attention.png differ diff --git a/caption.png b/caption.png new file mode 100644 index 0000000..b963742 Binary files /dev/null and b/caption.png differ diff --git a/img1.png b/img1.png new file mode 100644 index 0000000..e1c93dc Binary files /dev/null and b/img1.png differ diff --git a/main.py b/main.py index e69de29..148768e 100644 --- a/main.py +++ b/main.py @@ -0,0 +1,178 @@ +from torch.utils.data import DataLoader +from NeuralModels.FactoryModels import * +from NeuralModels.Dataset import MyDataset +from NeuralModels.Vocabulary import Vocabulary +import argparse +import sys, os +from PIL import Image + +def parse_command_line_arguments(): + + parser = argparse.ArgumentParser(description='CLI for C[aA]RNet, some static definition are placed in the VARIABLE.py file') + + parser.add_argument('decoder', type=Decoder.argparse, choices=list(Decoder), + help="What type of decoder do you want use?") + + parser.add_argument('mode', choices=['train', 'eval'], + help='train or evaluate C[aA]RNet.') + + parser.add_argument('encoder_dim', type=int, + help = 'Size of the encoder output. IF Attention is True, fixed at 2048. IF CaRNetvI as net, encoder_dim == |vocabulary|.') + + parser.add_argument('hidden_dim', type=int, + help = 'Capacity of the LSTM Cell.') + + parser.add_argument('--attention', default=False, type=bool, + help='Use attention model. IF True, vHCAttention decoder and CResNet50Attention encoder are mandatories. (default: False)') + + + parser.add_argument('--attention_dim', type=int, default=0, + help="The attention capacity. Valid only if attention is true. (default: 0)") + + parser.add_argument('--dataset_folder', type=str, default="./dataset", + help='Data set folder. 
Used only if mode = train (Default: "./dataset")') + + parser.add_argument('--image_path', type=str, default="", + help = "The absolute path of the image that we want to retrieve the caption. Used only if mode = eval (Default: ''") + + parser.add_argument('--splits', type=int, nargs="+", default=[60,30,10], + help='Fraction of data to be used in train set, val set and test set (default: 60 30,10)') + + parser.add_argument('--batch_size', type=int, default=32, + help='mini-batch size (default: 32)') + + parser.add_argument('--epochs', type=int, default=500, + help='number of training epochs (default: 500)') + + parser.add_argument('--lr', type=float, default=1e-3, + help='learning rate (Adam) (default: 1e-3)') + + parser.add_argument('--workers', type=int, default=4, + help='number of working units used to load the data (default: 4)') + + parser.add_argument('--device', default='cpu', type=str, + help='device to be used for computations (in {cpu, cuda:0, cuda:1, ...}, default: cpu)') + + parsed_arguments = parser.parse_args() + + return parsed_arguments + + + +if __name__ == "__main__": + print("Coded with love by christiandimaio aka gnekt :* \n ") + args = parse_command_line_arguments() + + for k, v in args.__dict__.items(): + print(k + '=' + str(v)) + + #################################### Define Encoder/Decoder + encoder = None + decoder = None + attention = None + if args.attention == True: + # Attention is true, encoder = CResNet50Attention, decoder = RNetvHCAttention + encoder = FactoryEncoder(Encoder.CResNet50Attention) + decoder = FactoryDecoder(Decoder.RNetvHCAttention) + attention = FactoryAttention(Attention.Attention) + args.net_name = "CARNetvHCAttention" + + if args.attention == False: + args.net_name = f"Ca{args.decoder}" + encoder = FactoryEncoder(Encoder.CResNet50) + decoder = FactoryDecoder(args.decoder) + #################################### + + #################################### Construct Data + print("Construct data..") + + if args.mode == "train": + print("Define dataset..") + dataset = MyDataset(args.dataset_folder, percentage=8) # Percentage is fixed cause the dataset is HUGE, 8% is enough for sperimental test. 
+ print("OK.") + + print("Define vocabulary..") + vocabulary = Vocabulary(dataset) + print("OK.") + + # Obtain train, validation and test set + print("Obtain train, validation and test set..") + train_set = dataset.get_fraction_of_dataset(percentage=args.splits[0], delete_transfered_from_source=True) + validation_set = dataset.get_fraction_of_dataset(percentage=args.splits[1], delete_transfered_from_source=True) + test_set = dataset.get_fraction_of_dataset(percentage=args.splits[2], delete_transfered_from_source=True) + print("OK.") + + # Define the associate dataloader + print("Define the associate dataloader") + dataloader_training = DataLoader(train_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_training(data,vocabulary)) + dataloader_validation = DataLoader(validation_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_evaluation(data,vocabulary)) + dataloader_test = DataLoader(test_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_evaluation(data,vocabulary)) + print("OK.") + + if args.mode == "eval": + print("Define vocabulary..") + vocabulary = Vocabulary() + print("Ok.") + + print("Load the image..") + if not os.path.exists(args.image_path) or os.path.isdir(args.image_path): + raise ValueError(f"Got {args.image_path} as file path, error!") + image: Image.Image = Image.open(args.image_path).convert('RGB') + print("Ok.") + #################################### + + #################################### Define Net + print("Create the net..") + net = FactoryNeuralNet(NeuralNet.CaRNet)( + encoder=encoder, + decoder=decoder, + attention=attention, # != None only if Attention is requested + attention_dim = args.attention_dim, # != 0 only if Attention is True + net_name=args.net_name, + encoder_dim = args.encoder_dim if args.decoder is not Decoder.RNetvI else vocabulary.embeddings.shape[1], # if Attention is True encoder_dim hasn't any meaning, cause it is 2048 internally by construction. + hidden_dim= args.hidden_dim, + padding_index= vocabulary.predefined_token_idx()[""], + vocab_size= len(vocabulary.word2id.keys()), + embedding_dim = vocabulary.embeddings.shape[1], + device=args.device + ) + print("OK.") + #################################### Load a previous trained net, if exist + + print("Check if it is present a previous version of the Net..") + try: + net.load("./.saved") + print("Found.") + except Exception as ex: + print("An exception has occurred.") + print(ex) + if args.mode == "eval": # If the mode is eval the script cannot continue + print("Since you want an evaluation, the script cannot continue, please retrain the network.") + sys.exit(0) + # In training it creates new files. 
+ print("Not Found.") + print("Since the selected mode is training, a new instance of the net will saved during the training activity.") + + #################################### Training or Evaluate + + if args.mode == "train": + print("Start training..") + net.train( + train_set=dataloader_training, + validation_set=dataloader_validation, + lr=args.lr, + epochs=args.epochs, + vocabulary=vocabulary + ) + # Evaluate Test set + print("Done") + print(f"Test set Accuracy: {net.eval_net(dataloader_test, vocabulary):.4f}") + + if args.mode == "eval": + print("Start evaluation..") + net.eval(image, vocabulary) + print("OK.") + #################################### \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..8be15c5 --- /dev/null +++ b/test.py @@ -0,0 +1,35 @@ +from piou import Cli, Option + +cli = Cli(description='A CLI tool') + + + + +sub_cmd = cli.add_sub_parser(cmd='sub', help='A sub command') +sub_cmd.add_option('--test', help='Test mode') + + +@sub_cmd.command(cmd='bar', help='Run bar command') +def sub_bar_main(**kwargs): + pass + + +@cli.command(cmd='foo', help='Run foo command') +def foo_main( + bar: int = Option(..., help='Bar positional argument (required)'), + baz: str = Option(..., '-b', '--baz', help='Baz keyword argument (required)'), + foo: str = Option(None, '--foo', help='Foo keyword argument'), +): + """ + A longer description on what the function is doing. + You can run it with: + ```bash + poetry run python -m piou.test.simple foo 1 -b baz + ``` + And you are good to go! + """ + pass + + +if __name__ == '__main__': + cli.run() \ No newline at end of file