diff --git a/.gitignore b/.gitignore index 8a4a0ca..6966b61 100644 --- a/.gitignore +++ b/.gitignore @@ -119,3 +119,4 @@ dmypy.json dataset/* .saved/* *.Identifier +*.zip diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..4107551 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "/home/christian/Documenti/GitHub/Image-Captioning/v1/NeuralNet.py", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/light_version/Dataset.py b/bck_old/Dataset.py similarity index 99% rename from light_version/Dataset.py rename to bck_old/Dataset.py index 19ce07d..3cd200c 100644 --- a/light_version/Dataset.py +++ b/bck_old/Dataset.py @@ -117,6 +117,7 @@ def pack_minibatch_evaluation(self, data): images = torch.stack(images, 0) caption_lengths = [len(caption) for caption in captions] + captions captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) return images,captions.type(torch.LongTensor),caption_lengths \ No newline at end of file diff --git a/light_version/NeuralNet.py b/bck_old/NeuralNet.py similarity index 90% rename from light_version/NeuralNet.py rename to bck_old/NeuralNet.py index 08256b4..a72c6b7 100644 --- a/light_version/NeuralNet.py +++ b/bck_old/NeuralNet.py @@ -9,7 +9,7 @@ device = "cuda:0" class EncoderCNN(nn.Module): - def __init__(self, embed_size): + def __init__(self, embedding_size): super(EncoderCNN, self).__init__() resnet = models.resnet50(pretrained=True) for param in resnet.parameters(): @@ -17,38 +17,29 @@ def __init__(self, embed_size): modules = list(resnet.children())[:-1] # remove last fc layer self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, 50) + self.linear = nn.Linear(resnet.fc.in_features, embedding_size) def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) + features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.)
features = self.linear(features) return features class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embeddings ): + def __init__(self, hidden_size, padding_index, vocab_size, embeddings, embedding_size): """Set the hyper-parameters and build the layers.""" super(DecoderRNN, self).__init__() - # Keep track of hidden_size for initialization of hidden state - self.hidden_size = hidden_size # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True, padding_idx = 0) + self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) # The LSTM takes embedded word vectors (of a specified size) as input # and outputs hidden states of size hidden_dim - self.lstm = nn.LSTM(input_size=50, \ - hidden_size=1024, # LSTM hidden units - num_layers=1, # number of LSTM layer - batch_first=True, # input & output will have batch size as 1st dimension - dropout=0, # Not applying dropout - bidirectional=False, # unidirectional LSTM - ) + self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) # The linear layer that maps the hidden state output dimension # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(1024, vocab_size) + self.linear_1 = nn.Linear(hidden_size, vocab_size) def init_hidden_state(self, encoder_out): """ diff --git a/light_version/Vocabulary.py b/bck_old/Vocabulary.py similarity index 100% rename from light_version/Vocabulary.py rename to bck_old/Vocabulary.py diff --git a/heavy_version/Models/Dataset.py b/heavy_version/Models/Dataset.py deleted file mode 100644 index b0cac76..0000000 --- a/heavy_version/Models/Dataset.py +++ /dev/null @@ -1,134 +0,0 @@ -from xml.dom import ValidationErr - -from Sample import Sample -import os -import pandas as pd -import torch -import numpy as np -from enum import Enum -from torch.utils.data import Dataset, DataLoader -import torch.nn as nn - -class DatasetState(Enum): - """A dataset could be in 3 possible, mutual exclusive, state: - - Raw -> Sample are raw, no preprocessing operation performed - - Training -> All Samples are pre-processed for training - - Evaluation -> All Samples are pre-processed for evaluation - - Args: - Enum (int): Raw or Training or Evaluation - """ - Raw = 0 - Training = 1 - Evaluation = 2 - -# TO-Do -# Aggiungere a README, la modalita` in cui si elabora il dataset e`: ho una cartella il cui contenuto e`: -# 1) un file result.csv che contiene i dati come il formato gia` definito -# 2) una cartella images nella quale ci sono tutte le immagini, tutte le sottocartelle di images non verranno considerate - -class MyDataset(Dataset): - # The dataset will have this shape - # | id_sample | sample | dirty | - # |-----------|--------|-------| - # | | | | - # | | | | - # | | | | - # - # id_sample is an unique identifier of the sample - # sample is the object associated - # dirty is boolean and it means: this sample was already taken from the method get_fraction_of_dataset, this implies that externally somebody already taken this sample. 
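For orientation, the bookkeeping described in the comment above is just a three-column table plus a dirty flag: once get_fraction_of_dataset hands a sample out, its row is marked so later calls skip it. A minimal sketch with illustrative stand-in data (plain file names instead of Sample objects):

```python
# Minimal sketch of the sample bookkeeping table described above (illustrative data,
# plain strings standing in for Sample objects).
import pandas as pd

df = pd.DataFrame({
    "id_sample": [0, 1, 2, 3],
    "sample": ["img_0.jpg", "img_1.jpg", "img_2.jpg", "img_3.jpg"],
    "dirty": [False, False, False, False],
})

# Take a fraction of the rows that were never handed out before...
clean = df[df["dirty"] == False]
taken = clean.head(2)

# ...and flag them as dirty so a later call does not return them again.
df.loc[taken.index, "dirty"] = True
print(df)
```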
- - def __init__(self, directory_of_data:str = None, percentage:int = 100,already_computed_dataframe: pd.DataFrame = None, state: DatasetState = DatasetState.Raw): - """Create a new dataset from source files - - Args: - directory_of_data (str): [description] - """ - self.state: DatasetState = state - if already_computed_dataframe is not None: - self.dataset = already_computed_dataframe - return - - if not os.path.exists(directory_of_data): - raise ValueError(f"{directory_of_data} not Exist!") - if not os.path.isdir(directory_of_data): - raise ValueError(f"{directory_of_data} is not a directory!") - - _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] - _temp_dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) - samples = _temp_dataset.apply(lambda row: Sample(int(row.name)+1,f"{directory_of_data}/images/{row.image_name}",row.comment),axis=1) - - self.dataset: pd.DataFrame = pd.DataFrame(list(zip([i for i in range(len(samples))],samples,[False for _ in range(len(samples))])), columns=["id_sample","sample","dirty"]) - - - def suffle_data_set(self): - self.dataset.apply(torch.randperm, axis=0) - - def get_fraction_of_dataset(self, percentage: int, also_dirty: bool = False): - if not also_dirty: - _temp_df = self.dataset[self.dataset["dirty"] == False] - _temp_df = _temp_df.apply(np.random.permutation, axis=0) - _temp_df_moved = _temp_df.head(int(len(_temp_df)*(percentage/100))) - _temp_df_copy = _temp_df_moved.copy() - self.dataset.loc[_temp_df_moved["id_sample"],"dirty"] = True - return MyDataset(already_computed_dataframe=_temp_df_copy) - - - def make_dirty(self) -> bool: - self.dataset["dirty"] = True - - def make_clean(self) -> bool: - self.dataset["dirty"] = False - - # torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods: - - # __len__ so that len(dataset) returns the size of the dataset. - # __getitem__ to support the indexing such that dataset[i] can be used to get i-ith sample. - - def __len__(self): - return self.dataset.shape[0] - - def __getitem__(self, idx): - - if self.state == DatasetState.Raw: - raise ValidationErr("The getitem built-in method cannot be executed when the dataset is in a RAW state.\n Please do some preprocessing on it before __getitem__ call.") - - - sample: Sample = self.dataset.iloc[idx]["sample"] - image, caption = sample.image, sample.caption - - return image,caption - - def pack_minibatch(self, data): - - # Sort a data list by caption length (descending order). - data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - # Merge images (from tuple of 3D tensor to 4D tensor). 
- images = torch.stack(images, 0) - - caption_lengths = [len(caption) for caption in captions] - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - return images,captions.type(torch.LongTensor),caption_lengths -#------------------------------- -# Usage - -if __name__ == "__main__": - from Vocabulary import Vocabulary - from PreProcess import PreProcess - ds = MyDataset("./dataset/flickr30k_images/flickr30k_images") - df = ds.get_fraction_of_dataset(percentage=10) - print("pippo") - - # use dataloader facilities which requires a preprocessed dataset - v = Vocabulary(verbose=True) - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - - dataloader = DataLoader(df, batch_size=4, - shuffle=False, num_workers=0, collate_fn=df.pack_minibatch) - - for i_batch,images,captions in enumerate(dataloader): - print(i_batch, captions) \ No newline at end of file diff --git a/heavy_version/Models/Interface/__init__.py b/heavy_version/Models/Interface/__init__.py deleted file mode 100644 index 16d05d6..0000000 --- a/heavy_version/Models/Interface/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Interfaces that allow to create multiple instances of an abstract class for different esigence. \ No newline at end of file diff --git a/heavy_version/Models/NeuralNet.py b/heavy_version/Models/NeuralNet.py deleted file mode 100644 index 5aec7b4..0000000 --- a/heavy_version/Models/NeuralNet.py +++ /dev/null @@ -1,220 +0,0 @@ -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -class EncoderCNN(nn.Module): - def __init__(self, embed_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.embed = nn.Linear(resnet.fc.in_features, embed_size) # attach a linear layer () - - def forward(self, images): - with torch.no_grad(): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) - features = self.embed(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embeddings ): - """Set the hyper-parameters and build the layers.""" - super(DecoderRNN, self).__init__() - # Keep track of hidden_size for initialization of hidden state - self.hidden_size = hidden_size - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True, padding_idx = 0) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm = nn.LSTM(input_size=50, \ - hidden_size=hidden_size, # LSTM hidden units - num_layers=1, # number of LSTM layer - batch_first=True, # input & output will have batch size as 1st dimension - dropout=0, # Not applying dropout - bidirectional=False, # unidirectional LSTM - ) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear = nn.Linear(hidden_size, vocab_size) - - def init_hidden(self, batch_size): - """ At the start of training, we need to initialize a hidden state; - there will be none because the hidden state is formed based on previously seen data. 
- So, this function defines a hidden state with all zeroes - The axes semantics are (num_layers, batch_size, hidden_dim) - """ - return (torch.zeros((1, batch_size, self.hidden_size)), \ - torch.zeros((1, batch_size, self.hidden_size))) - - - def forward(self, features, captions,caption_lengths): - """ Define the feedforward behavior of the model """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - self.hidden = self.init_hidden(batch_size) - - # Create embedded word vectors for each word in the captions - embeddings = self.word_embeddings(captions) # embeddings new shape : (batch_size, captions length -1, embed_size) - - # Stack the features and captions - embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1) # embeddings new shape : (batch_size, caption length, embed_size) - - packed = pack_padded_sequence(embeddings, caption_lengths, batch_first=True) - # Get the output and hidden state by passing the lstm over our word embeddings - # the lstm takes in our embeddings and hidden state - lstm_out, self.hidden = self.lstm(packed) # lstm_out shape : (batch_size, caption length, hidden_size) - - # Fully connected layer - outputs = self.linear(lstm_out[0]) # outputs shape : (batch_size, caption length, vocab_size) - - return outputs - - def sample(self, features, states=None): - """Generate captions for given image features using greedy search.""" - sampled_ids = [] - inputs = features.unsqueeze(1) - inputs = inputs.reshape((1,1,inputs.shape[0])) - self.init_hidden(1) - with torch.no_grad(): - for _ in range(30): - hiddens, states = self.lstm(inputs, states) # hiddens: (batch_size, 1, hidden_size) - outputs = self.linear(hiddens.squeeze(1)) # outputs: (batch_size, vocab_size) - _, predicted = outputs.max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - inputs = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - inputs = inputs.unsqueeze(1) # inputs: (batch_size, 1, embed_size) - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - -def save(self, file_name): - """Save the classifier.""" - - torch.save(self.net.state_dict(), file_name) - -def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.net.load_state_dict(torch.load(file_name, map_location=self.device)) - -def train(train_set, validation_set, lr, epochs, vocabulary): - device = torch.device("cuda:0") - criterion = nn.CrossEntropyLoss() - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - encoder = EncoderCNN(50) - decoder = DecoderRNN(2048,0,len(vocabulary.word2id.keys()),vocabulary.embeddings) - - encoder.to(device) - decoder.to(device) - - # ensuring the classifier is in 'train' mode (pytorch) - decoder.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(decoder.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images,captions,captions_length in train_set: - decoder.zero_grad() - encoder.zero_grad() - # zeroing the memory areas that were storing previously computed gradients - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - images = images.to(device) - captions_length = captions_length.to(device) - targets = targets.to(device) - - # computing the network output on the current mini-batch - features = encoder(images) - outputs = decoder(features, captions,captions_length) - - targets = pack_padded_sequence(captions, captions_length, batch_first=True)[0] - - # computing the loss function - try: - loss = criterion(outputs, targets) - except Exception as ex: - print(ex) - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - print(f"mini-batch:\tloss={loss.item():.4f}") - torch.save(decoder.state_dict(),".saved/decoder.pt") - features = encoder(images) - caption = decoder.sample(features[0]) - print(vocabulary.rev_translate(captions)) - print(vocabulary.rev_translate(caption)) - # computing the performance of the net on the current training mini-batch - # with torch.no_grad(): # keeping these operations out of those for which we will compute the gradient - # self.net.eval() # switching to eval mode - - # # computing performance - # batch_train_acc = self.__performance(outputs, y) - - # # accumulating performance measures to get a final estimate on the whole training set - # epoch_train_acc += batch_train_acc * batch_num_train_examples - - # # accumulating other stats - # epoch_train_loss += loss.item() * batch_num_train_examples - - # self.net.train() # going back to train mode - - # # printing (mini-batch related) stats on screen - # print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - # val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - # if val_acc > best_val_acc: - # best_val_acc = val_acc - # best_epoch = e + 1 - # self.save("classifier.pth") - - # epoch_train_loss /= epoch_num_train_examples - - # # printing (epoch related) stats on screen - # print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - # + (", BEST!" 
if best_epoch == e + 1 else "")) - # .format(e + 1, epochs, epoch_train_loss, - # epoch_train_acc / epoch_num_train_examples, val_acc)) - -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from PreProcess import PreProcess - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset", percentage=2) - df = ds.get_fraction_of_dataset(percentage=100) - - # use dataloader facilities which requires a preprocessed dataset - v = Vocabulary(verbose=True) - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - - dataloader = DataLoader(df, batch_size=10, - shuffle=True, num_workers=0, collate_fn=df.pack_minibatch) - - train(dataloader, dataloader, 1e-2, 10, v_enriched) \ No newline at end of file diff --git a/heavy_version/Models/PreProcess.py b/heavy_version/Models/PreProcess.py deleted file mode 100644 index 3a04af7..0000000 --- a/heavy_version/Models/PreProcess.py +++ /dev/null @@ -1,191 +0,0 @@ -from abc import ABC, abstractmethod -from enum import Enum -from PIL import Image -from torchvision import transforms -import torch -from Dataset import Dataset, DatasetState -from Sample import Sample -from Vocabulary import Vocabulary -import re -from typing import Tuple - -class ABCPreProcess(ABC): - """Class which implements preprocessing methods for a given object - """ - - @abstractmethod - def process(object_i, **parameters): - pass - - -class PreProcessImageForTraining(ABCPreProcess): - - @staticmethod - def process(object_i: Image, parameters) -> torch.FloatTensor: - """ - Function that pre-process an image for training. - - Args: - object_i (Image): [description], - parameters:{ - crop:{ - "size": (int), Expected output size of the crop, for each edge. - "scale: Tuple(float,float), :ower and upper bounds for the random area of the crop, before resizing. - "ratio": Tuple(float,float), Lower and upper bounds for the random aspect ratio of the crop, before resizing. - } - "mean": (float), - "std_dev": (float) - } - - Returns: - torch.FloatTensor: torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - """ - operations = transforms.Compose([ - transforms.RandomResizedCrop(parameters["crop"]["size"], scale=parameters["crop"]["scale"], ratio=parameters["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. - transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. - transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - transforms.Normalize(mean=parameters["mean"], std=parameters["std_dev"]), - ]) - return operations(object_i) - - - -class PreProcessImageForEvaluation(ABCPreProcess): - - @staticmethod - def process(object_i: Image, **parameters) -> torch.FloatTensor: - """Function that pre-process an image for evaluation. - Args: - object_i (Image): [description], - parameters:{ - crop:{ - "size": (int), Desired output size of the crop, for each edge. - "center: (int) Desired output size of the crop after centering - } - "mean": (float), - "std_dev": (float) - } - - Returns: - torch.FloatTensor: torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - """ - operations = transforms.Compose([ - transforms.Resize(parameters["crop"]["size"]), - transforms.CenterCrop(parameters["crop"]["center"]), # Crops the given image at the center. 
- transforms.ToTensor(), - transforms.Normalize(mean=parameters["mean"], std=parameters["std_dev"]), - ]) - return operations(object_i) - -class PreProcessCaption(ABCPreProcess): - - @staticmethod - def process(caption: str, **parameters) -> list[str]: - """Process a caption for being used in the network - - Args: - caption (str): The caption to be processed. - - Returns: - torch.tensor: A tensor - """ - tokenized_caption = re.findall("[\\w]+|\.|\,", caption.lower()) - return tokenized_caption - - -class PreProcessDatasetForTraining(ABCPreProcess): - - image_trasformation_parameter = { - "crop":{ - "size": 224, - "scale": (0.08,1.0), - "ratio": (3. / 4., 4. / 3.), - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - @staticmethod - def process(dataset: Dataset, vocabulary: Vocabulary) -> Tuple[Dataset,Vocabulary]: - - # Control block - if dataset.state == DatasetState.Training: - torch.warnings.warn("The Dataset is already prepared for Training, another pre-process training could lead to some inconsistence.") - - if dataset.state == DatasetState.Evaluation: - torch.warnings.warn("The Dataset is already prepared for Evaluation, pre-process for training could lead to some inconsistence.") - - # PreProcess block - for sample in dataset.dataset["sample"]: - sample.alter_caption(PreProcess.Caption.process(sample.caption)) - sample.alter_image(PreProcess.ImageForTraining.process(sample.image, PreProcessDatasetForTraining.image_trasformation_parameter)) - - # Enrich the vocabulary - vocabulary.make_enrich = True - vocabulary.bulk_enrich([sample.caption for sample in dataset.dataset["sample"][:]]) - vocabulary.make_enrich = False - - # Do the In Place Translation for the caption for each sample in the dataset - dataset.dataset.apply(lambda record: record["sample"].alter_caption(vocabulary.translate(record["sample"].caption)), axis=1) - - dataset.state = DatasetState.Training - return dataset, vocabulary - - -class PreProcessDatasetForEvaluation(ABCPreProcess): - - image_trasformation_parameter = { - "crop":{ - "size": 256, - "center": 224 - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - @staticmethod - def process(dataset: Dataset, vocabulary: Vocabulary) -> Tuple[Dataset,Vocabulary]: - - # Control block - if dataset.state == DatasetState.Training: - torch.warnings.warn("The Dataset is already prepared for Training, another pre-process training could lead to some inconsistence.") - - if dataset.state == DatasetState.Evaluation: - torch.warnings.warn("The Dataset is already prepared for Evaluation, pre-process for training could lead to some inconsistence.") - - # PreProcess block - for sample in dataset.dataset["sample"]: - sample.alter_caption(PreProcess.Caption.process(sample.caption)) - sample.alter_image(PreProcess.ImageForTraining.process(sample.image, PreProcessDatasetForTraining.image_trasformation_parameter)) - - - # Enrich the vocabulary - vocabulary.make_enrich = True - vocabulary.bulk_enrich([sample.caption for sample in dataset.dataset["sample"][:]]) - vocabulary.make_enrich = False - - # Do the In Place Translation for the caption for each sample in the dataset - dataset.dataset.apply(lambda record: 
record["sample"].alter_caption(vocabulary.translate(record["sample"].caption)), axis=1) - - dataset.state = DatasetState.Evaluation - return dataset, vocabulary - -class PreProcess(): - ImageForTraining = PreProcessImageForTraining - ImageForEvaluation = PreProcessImageForEvaluation - Caption = PreProcessCaption - DatasetForTraining = PreProcessDatasetForTraining - DatasetForEvaluation = PreProcessDatasetForTraining - -# ---------------------------------------------------------------- -# How to use - -if __name__ == '__main__': - - ds = Dataset("./dataset") - df = ds.get_fraction_of_dataset(percentage=10) - - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - print(df_pre_processed) \ No newline at end of file diff --git a/heavy_version/Models/Sample.py b/heavy_version/Models/Sample.py deleted file mode 100644 index 22a07ab..0000000 --- a/heavy_version/Models/Sample.py +++ /dev/null @@ -1,90 +0,0 @@ -from PIL import Image -import os - -class Sample(): - """Model class for managing a sigle sample of the dataset. - """ - - def __init__(self, id: int, image_path_file: str, caption: str, verbose: bool = False): - """Constructor of a sample of the dataset. - - Args: - id (int): The id associated with this sample. - [Constraint: The id must be unique (Caller responsibility).] - [Constraint: The id must be greater than 0.] - image_path_file (str): The image path, in relative path format. (Assumed the main.py as the entry point) - caption (str): Raw caption associated with this sample. - [Constraint: Caption has to be a string with length greater than 1 characters.] - verbose (bool, optional): The class will be verbose if True. Defaults to False. - Raises: - FileNotFoundError: The given relative path to the image is invalid. - ValueError: The caption is invalid. - ValueError: The id is invalid. - """ - - # Validation of constructor parameters - if not os.path.isfile(image_path_file): - raise FileNotFoundError("The given path_file resemble a non-existing file.") - - if len(caption.strip()) <= 1: - raise ValueError(f"The caption has a length of {len(caption.strip())} characters, which is not supported.") - - if id <= 0: - raise ValueError(f"The id must be greater than 0. \n Given {id}.") - - if verbose: - print(f"Image path: {image_path_file}") - print(f"Loading..") - - # Loading the image - self._image = None - try: - self._image = Image.open(image_path_file).convert('RGB') # Load and convert to RGB - except Exception as e: - raise e - if self._image is None: - raise Exception("Could not load image.") - if verbose: - print("Ok.") - - # Loading the caption - self.caption = caption - - # Loading the id - self.id = id - - # Set verbosity - self._verbose = verbose - - # Tell externally if this sample is altered (pre-processed, or other..) if False, otherwise the data is inside are raw if True - self.is_raw = True - - @property - def image(self) -> Image: - """Getter of the image property - - Returns: - Image: The image object. 
- """ - return self._image - - def alter_image(self, altered_image): - """Alter the sample image by place a new one (could be the same but modified or a tensorial form of the image) - - Args: - image (object): The new image with possible differente representation - """ - self._image = altered_image - self.is_raw = False - - def alter_caption(self, altered_caption): - """Alter the caption, now could be a string as before or a list of string, ready for being processed by the NN. - - Args: - image (Image): The new image - """ - self.caption = altered_caption - self.is_raw = False - - - \ No newline at end of file diff --git a/heavy_version/Models/Vocabulary.py b/heavy_version/Models/Vocabulary.py deleted file mode 100644 index 49f56ae..0000000 --- a/heavy_version/Models/Vocabulary.py +++ /dev/null @@ -1,185 +0,0 @@ -import os -import torch -import warnings - -class Vocabulary(): - # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d - # each word is represented by a record in a dataframe with this structure - - - def __init__(self, verbose: bool = False): - - self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True - self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True - # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists - if os.path.exists(".saved/rich_embeddings.pt") and os.path.exists(".saved/rich_word2id.pt"): - self.embeddings = torch.load(".saved/rich_embeddings.pt") - self.word2id = torch.load(".saved/rich_word2id.pt") - self.enriched = True - return - - # Check if the base vocabulary(glove + PAD + SOS + EOS + UNK) already exists - if os.path.exists(".saved/base_embeddings.pt") and os.path.exists(".saved/base_word2id.pt"): - self.embeddings = torch.load(".saved/base_embeddings.pt") - self.word2id = torch.load(".saved/base_word2id.pt") - return - - # Since the constructor arrived here, we need to load for the 1st time the glove word embeddings - - self.word2id = {} - self.embeddings = torch.zeros((400004, 50)) # DIM1: Glove50 rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Embedding Size 50d - - # Initialize the token: - # , , , - self.word2id[""] = 0 - self.word2id[""] = 1 - self.word2id[""] = 2 - self.word2id[""] = 3 - - self.embeddings[self.word2id[""]] = torch.zeros(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - - counter = 4 - with open('.saved/glove.6B.50d.txt', 'r', encoding='utf-8') as _vocabulary_file: - for line in _vocabulary_file: - line = line.strip().split() - self.word2id[line[0]] = counter - self.embeddings[counter] = torch.tensor([float(dimension) for dimension in line[1:]], dtype=torch.float32) - counter += 1 - torch.save(self.embeddings,".saved/base_embeddings.pt") - torch.save(self.word2id,".saved/base_word2id.pt") - print("break") - - def predefined_token_idx(self) -> dict: - return { - "":0, - "":1, - "":2, - "":3 - } - - def translate(self, word_sequence : list[str]) -> torch.tensor: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. 
- if not self.enriched: - warnings.warn("The vocabulary is not enriched with dataset words that could be not in glove, pay attention to what you want to do with this representation.") - - # Initialize the translator - _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # +2 because of and token - _sequence[0] = self.word2id[""] - _sequence[-1] = self.word2id[""] - - counter = 1 # SKIP THE TOKEN - for word in word_sequence: - if word.lower() in self.word2id.keys(): - _sequence[counter] = self.word2id[word.lower()] - else: - _sequence[counter] = self.word2id[""] - counter += 1 - return _sequence - - def rev_translate(self, words_id : torch.tensor) -> list[str]: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. - return [list(self.word2id.keys())[idx] for idx in words_id[0,:].tolist()] # word_id (1,caption_length) - - - @property - def make_enrich(self): - return self._make_enrich - - @make_enrich.setter - def make_enrich(self, value: bool): - if not isinstance(value,bool): - raise TypeError("The value that you want to put on make_enrich is not a boolean. Pay attention!") - - if value is False: - if self.make_enrich and not self.enriched: # If before the setter call make_enrich was True, probably the vocabulary was enriched by somebody, so the vocabulary now is in the state Enriched - self.enriched = True - # The enriched version of the vocabulary need to be dumped in memory - torch.save(self.embeddings,".saved/rich_embeddings.pt") - torch.save(self.word2id,".saved/rich_word2id.pt") - self._make_enrich = False - else: - self._make_enrich = value - - def enrich(self, words: list[str]) -> bool: - - if not self.make_enrich: - raise ValueError(f"The vocabulary is not set to be enriched, before the enrichment set the flag 'make_enrich' to True.") - - _new_word = [] - for word in words: - if word.lower() in self.word2id.keys(): - continue - _new_word.append(word) - - if len(_new_word) == 0: - return False - - _enrichment_of_embedding = torch.zeros((len(_new_word),50)) - - _id_carry = len(self.word2id.keys()) # The new ids start from len(.) 
cause the ids start from 0 and not from 1 - - for number,word in enumerate(_new_word): - self.word2id[word.lower()] = _id_carry - _enrichment_of_embedding[number] = torch.rand(50, dtype=torch.float32) - _id_carry += 1 - - # Append the enrichment at the end of the embeddings_matrix - self.embeddings = torch.cat([self.embeddings,_enrichment_of_embedding], dim=0) - return True - - def bulk_enrich(self, sequences: list[list[str]]) -> bool: - _words_flatten = [word for sequence in sequences for word in sequence] # flatten a list of list, credits to wjandrea on stackoverflow <3 - return self.enrich(_words_flatten) - - - def __len__(self): - """The total number of words in this Vocabulary.""" - - return len(self.word2id.keys()) - - -# ---------------------------------------------------------------- -# Usage example - -if __name__ == '__main__': - #Load the vocabulary - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - # Enrich the vocabulary - v.make_enrich = True - dataset = ["I","Like","PLay","PIPPOplutopaperino"] - v.enrich(dataset) - v.make_enrich = False - # Enrich the vocabulary with a bulk insert - v.make_enrich = True - dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] - v.bulk_enrich(dataset) - v.make_enrich = False - - - - - - - - - - - - - - \ No newline at end of file diff --git a/heavy_version/Models/__init__.py b/heavy_version/Models/__init__.py deleted file mode 100644 index 0ae43be..0000000 --- a/heavy_version/Models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# All the Model which represents a specific entity in the framework of this project. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5e0b8da --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +certifi==2021.10.8 +charset-normalizer==2.0.10 +dataclasses==0.8 +idna==3.3 +kaggle==1.5.12 +numpy==1.19.5 +pandas==1.1.5 +Pillow==8.4.0 +pkg_resources==0.0.0 +python-dateutil==2.8.2 +python-slugify==5.0.2 +pytz==2021.3 +requests==2.27.1 +six==1.16.0 +text-unidecode==1.3 +torch==1.3.0+cu100 +torchvision==0.4.1+cu100 +tqdm==4.62.3 +typing_extensions==4.0.1 +urllib3==1.26.8 diff --git a/v1 copy/Dataset.py b/v1 copy/Dataset.py new file mode 100644 index 0000000..c2a0ed8 --- /dev/null +++ b/v1 copy/Dataset.py @@ -0,0 +1,135 @@ +import os +import pandas as pd +import torch +import numpy as np +from enum import Enum +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms + +class MyDataset(Dataset): + + training_image_trasformation_parameter = { + "crop":{ + "size": 224, + "scale": (0.08,1.0), + "ratio": (3. / 4., 4. 
/ 3.), + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + evaluation_image_trasformation_parameter = { + "crop":{ + "size": 256, + "center": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files + + Args: + directory_of_data (str): [description] + """ + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + self.directory_of_data = directory_of_data + + def get_fraction_of_dataset(self, percentage: int): + _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + _temp_df_copy = _temp_df_moved.copy() + return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self): + words = [] + for idx,row in self._dataset.iterrows(): + for word in re.findall("[\\w]+|\.|\,", row["comment"].lower()): + if word not in words: + words.append(word) + return words + + def __len__(self): + return self._dataset.shape[0] + + def __getitem__(self, idx): + + image, caption = Image.open(f"{self.directory_of_data}/flickr30k_images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ + re.findall("[\\w]+|\.|\,", self._dataset.iloc[idx]["comment"].lower()) + + return image, caption + + def pack_minibatch_training(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.RandomResizedCrop(MyDataset.training_image_trasformation_parameter["crop"]["size"], scale=MyDataset.training_image_trasformation_parameter["crop"]["scale"], ratio=MyDataset.training_image_trasformation_parameter["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.training_image_trasformation_parameter["mean"], std=MyDataset.training_image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). 
+ images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) + + captions_length = [len(caption) for caption in captions] # (Batch Size,) + + captions_training_ids = [vocabulary.translate(caption,"uncomplete")for caption in captions] # (Batch Size, Caption) + + captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions] + + captions_training_ids = nn.utils.rnn.pad_sequence(captions_training_ids, padding_value=0, batch_first=True) + + captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True) + + return images,captions_training_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor) + + def pack_minibatch_evaluation(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.Resize(MyDataset.evaluation_image_trasformation_parameter["crop"]["size"]), + transforms.CenterCrop(MyDataset.evaluation_image_trasformation_parameter["crop"]["center"]), # Crops the given image at the center. + transforms.ToTensor(), + transforms.Normalize(mean=MyDataset.evaluation_image_trasformation_parameter["mean"], std=MyDataset.evaluation_image_trasformation_parameter["std_dev"]) + ]) + + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). + images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) + + + captions_evaluation_ids = [vocabulary.translate(caption,"uncomplete")for caption in captions] # (Batch Size, Caption) + + captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions] + + captions_evaluation_ids = nn.utils.rnn.pad_sequence(captions_evaluation_ids, padding_value=0, batch_first=True) + + captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True) + + return images,captions_evaluation_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor) + \ No newline at end of file diff --git a/v1 copy/NeuralNet.py b/v1 copy/NeuralNet.py new file mode 100644 index 0000000..9ac00c9 --- /dev/null +++ b/v1 copy/NeuralNet.py @@ -0,0 +1,345 @@ +##################################################### +## DISCLAIMER: THE CODE IS ESSENTIALLY HARDCODED AND FOR TESTING ONLY; IT DOES NOT FOLLOW THE USUAL SGD CONVENTIONS, I AM JUST TRYING TO SEE WHETHER IT WORKS! +# PLEASE DON'T JUDGE IT, I KNOW IT SHOULD NOT BE DONE THIS WAY, I WILL FIX IT LATER :) +## +## +## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html + + +import torch +import torch.nn as nn +import torchvision.models as models +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F + +device="cpu" +class EncoderCNN(nn.Module): + def __init__(self, projection_size): + super(EncoderCNN, self).__init__() + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): + param.requires_grad_(False) + + modules = list(resnet.children())[:-1] # remove last fc layer + self.resnet = nn.Sequential(*modules) + self.linear = nn.Linear(resnet.fc.in_features, projection_size) + + def forward(self, images): + features = self.resnet(images) + features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.)
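As an aside on the EncoderCNN being added here: it follows the common pattern of a frozen ResNet-50 backbone with the classification head removed, flattened and projected by a trainable linear layer. A minimal standalone sketch of that flow (the batch size, image size, and 512-dimensional projection are illustrative, not values taken from this diff):

```python
# Standalone sketch of the encoder pattern above: frozen ResNet-50 backbone,
# classification head removed, features flattened and projected by a trainable
# linear layer. Sizes here (batch of 4, 512-d projection) are illustrative.
import torch
import torch.nn as nn
import torchvision.models as models

backbone = models.resnet50(pretrained=True)
for p in backbone.parameters():
    p.requires_grad_(False)                                   # freeze the CNN
backbone = nn.Sequential(*list(backbone.children())[:-1])     # drop the final fc layer

projection = nn.Linear(2048, 512)                             # resnet50 fc.in_features == 2048

images = torch.randn(4, 3, 224, 224)                          # (batch, channels, height, width)
with torch.no_grad():
    features = backbone(images)                               # (4, 2048, 1, 1)
features = features.reshape(features.size(0), -1)             # (4, 2048)
embeddings = projection(features)                             # (4, 512)
print(embeddings.shape)                                       # torch.Size([4, 512])
```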
+ features = self.linear(features) + return features + +class DecoderRNN(nn.Module): + def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): + """[summary] + + Args: + hidden_size ([type]): [description] + padding_index ([type]): [description] + vocab_size ([type]): [description] + embedding_size ([type]): [description] + """ + super(DecoderRNN, self).__init__() + + # Embedding layer that turns words into a vector of a specified size + self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) + + # The linear layer that maps the hidden state output dimension + # to the number of words we want as output, vocab_size + self.linear_1 = nn.Linear(hidden_size, vocab_size) + + + def forward(self, features, captions): + """[summary] + + Args: + features (torch.tensor(batch_size, hidden_size)): [description] + captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] + + Returns: + [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] + """ + + # Initialize the hidden state + batch_size = features.shape[0] # features is of shape (batch_size, embed_size) + + # Create embedded word vector for each word in the captions + inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) + + # Feed LSTMCell with image features and retrieve the state + + _h, _c = self.lstm_unit(features) # _h : (Batch size, Hidden size) + + # Deterministict Output as first word of the caption :) + start = torch.zeros(self.word_embeddings.num_embeddings) + start[1] = 1 + outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch + + + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
+ # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) + _outputs = self.linear_1(_h) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) + + return outputs # (Batch Size, N, |Vocabulary|) + + def generate_caption(self, features, max_caption_length): + """Generate captions for given image features using greedy search.""" + + sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded + input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) + with torch.no_grad(): + _h ,_c = self.lstm_unit(features.unsqueeze(0)) + for _ in range(max_caption_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) + sampled_ids.append(predicted) + input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) + input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) + return sampled_ids + + +class CaRNet1(nn.Module): + + def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): + """[summary] + + Args: + hidden_size ([type]): [description] + padding_index ([type]): [description] + vocab_size ([type]): [description] + embedding_size ([type]): [description] + """ + super(CaRNet1, self).__init__() + self.padding_index = padding_index + self.device = torch.device(device) + self.C = EncoderCNN(embedding_size) + self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) + + self.C.to(self.device) + self.R.to(self.device) + + def save(self, file_name): + """Save the classifier.""" + torch.save(self.C.state_dict(), f".saved/v1/{file_name}_C.pth") + torch.save(self.R.state_dict(), f".saved/v1/{file_name}_R.pth") + + def load(self, file_name): + """Load the classifier.""" + + # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) + self.C.load_state_dict(torch.load(f".saved/v1/{file_name}_C.pth", map_location=self.device)) + self.R.load_state_dict(torch.load(f".saved/v1/{file_name}_R.pth", map_location=self.device)) + + def forward(self,images,captions): + features = self.C(images) + return self.R(features, captions) + + def __accuracy(self, outputs, labels): + """[summary] + + Args: + outputs ([type]): [description] + labels ([type]): [description] + + Returns: + [type]: [description] + """ + # Assume outputs and labels have same shape and already padded + # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! + # With this technique we evaluate all major case: + # 1) Output caption is longer than expected : Output.ID - .ID != 0 + # 2) Output is less longer than expect : .ID - Target.ID != 0 + # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, + # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token + # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) + # computing the accuracy + + right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) + acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL + return acc + + # TO DO: Devo usare la confusion matrix????????? + + def train(self, train_set, validation_set, lr, epochs, vocabulary): + + criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ + else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") + + # initializing some elements + best_val_acc = -1. # the best accuracy computed on the validation data + best_epoch = -1 # the epoch in which the best accuracy above was computed + + + + # ensuring the classifier is in 'train' mode (pytorch) + self.C.train() + self.R.train() + + # creating the optimizer + optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) + + # loop on epochs! + for e in range(0, epochs): + + # epoch-level stats (computed by accumulating mini-batch stats) + epoch_train_acc = 0. + epoch_train_loss = 0. + epoch_num_train_examples = 0 + + for images,captions_training_ids,captions_target_ids in train_set: + optimizer.zero_grad() + + batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') + epoch_num_train_examples += batch_num_train_examples + + + images = images.to(self.device) + captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) + captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token + + # computing the network output on the current mini-batch + features = self.C(images) + outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); + + # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) + loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) + + # computing gradients and updating the network weights + loss.backward() # computing gradients + optimizer.step() # updating weights + + # with torch.no_grad(): + # self.C.eval() + # self.R.eval() + # features = self.C(images) + # import random + # numb = random.randint(0,2) + # caption = self.R.generate_caption(features[numb],30) + # print(vocabulary.rev_translate(captions_target_ids[numb])) + # print(vocabulary.rev_translate(caption[0])) + # self.C.train() + # self.R.train() + + with torch.no_grad(): + self.C.eval() + self.R.eval() + + # Compute captions as ids for all the training images + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
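To make the accuracy idea in the comments above concrete (element-wise comparison of padded id tensors, skipping the deterministic first token), here is a tiny worked example with made-up ids:

```python
# Tiny worked example of the exact-match accuracy described above: compare the
# padded id tensors element-wise, skipping position 0 (the deterministic <SOS>).
# The ids are made up: <PAD>=0, <SOS>=1, <EOS>=2.
import torch

predicted = torch.tensor([[1, 7, 9, 2, 0],
                          [1, 7, 5, 4, 2]])
target    = torch.tensor([[1, 7, 9, 2, 0],
                          [1, 7, 6, 4, 2]])

matches = torch.eq(predicted[:, 1:], target[:, 1:])           # (batch, length-1) booleans
per_caption = matches.float().sum(dim=1) / matches.shape[1]   # fraction of matching tokens
accuracy = per_caption.mean().item()
print(accuracy)  # 0.875 -> first caption fully matched, second has 3 of 4 tokens right
```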
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + self.C.train() + self.R.train() + + # printing (mini-batch related) stats on screen + print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) + + val_acc = self.eval_classifier(validation_set) + + # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("CaRNetv1") + + epoch_train_loss /= epoch_num_train_examples + + # printing (epoch related) stats on screen + print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" + + (", BEST!" if best_epoch == e + 1 else "")) + .format(e + 1, epochs, epoch_train_loss, + epoch_train_acc / epoch_num_train_examples, val_acc)) + + def eval_classifier(self, data_set): + """Evaluate the classifier on the given data set.""" + + # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) + training_mode_originally_on = self.C.training and self.R.training + if training_mode_originally_on: + self.C.eval() + self.R.eval() # enforcing evaluation mode + + + + with torch.no_grad(): # keeping off the autograd engine + + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,_,captions_validation_target_ids in data_set: + images = images.to(self.device) + + captions_validation_target_ids = captions_validation_target_ids.to(self.device) + + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
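For context, generate_caption (used in the loop above and during training) is plain greedy decoding with an LSTMCell: the image projection primes the recurrent state, then the argmax word is fed back step by step until an end token or a length cap. A compact standalone sketch of that loop, where the sizes and the <SOS>=1 / <EOS>=2 ids are illustrative assumptions:

```python
# Compact sketch of greedy decoding with an LSTMCell, as in generate_caption above:
# the image projection primes the state, then the argmax word is fed back each step.
# Vocabulary size, dimensions and the <SOS>=1 / <EOS>=2 ids are illustrative.
import torch
import torch.nn as nn

vocab_size, embed_size, hidden_size = 20, 8, 16
embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
cell = nn.LSTMCell(embed_size, hidden_size)
to_vocab = nn.Linear(hidden_size, vocab_size)

image_feature = torch.randn(1, embed_size)        # encoder projection for one image
caption = [1]                                     # start from <SOS>
with torch.no_grad():
    h, c = cell(image_feature)                    # prime the recurrent state with the image
    token = torch.tensor([1])
    for _ in range(10):                           # cap the generated length
        h, c = cell(embed(token), (h, c))
        token = to_vocab(h).argmax(dim=1)         # greedy choice of the next word id
        caption.append(token.item())
        if token.item() == 2:                     # stop once <EOS> is produced
            break
print(caption)
```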
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) + + if training_mode_originally_on: + self.C.train() # restoring the training state, if needed + self.R.train() + return acc +# Example of usage +if __name__ == "__main__": + from Vocabulary import Vocabulary + from Dataset import MyDataset + from torch.utils.data import DataLoader + ds = MyDataset("./dataset/flickr30k_images/", percentage=8) + v = Vocabulary(ds,reload=True) + dc = ds.get_fraction_of_dataset(percentage=70) + df = ds.get_fraction_of_dataset(percentage=30) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=100, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=50, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") + net.load("CaRNetv1") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/v1 copy/Vocabulary.py b/v1 copy/Vocabulary.py new file mode 100644 index 0000000..725945a --- /dev/null +++ b/v1 copy/Vocabulary.py @@ -0,0 +1,131 @@ +import os +import torch +import warnings +from Dataset import MyDataset +from typing import List + +class Vocabulary(): + # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d + # each word is represented by a record in a dataframe with this structure + + + def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): + + self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True + self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True + # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists + if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: + self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") + self.word2id = torch.load(".saved/rich_word2id_v1.pt") + self.enriched = True + return + + # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + self.embeddings = torch.eye(self.dictionary_length) + + def predefined_token_idx(self) -> dict: + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. 
+ + Args: + word_sequence (str): [description] + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (str): [description] + """ + # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total number of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + v = Vocabulary(verbose=True) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + # Enrich the vocabulary + v.make_enrich = True + dataset = ["I","Like","PLay","PIPPOplutopaperino"] + v.enrich(dataset) + v.make_enrich = False + # Enrich the vocabulary with a bulk insert + v.make_enrich = True + dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] + v.bulk_enrich(dataset) + v.make_enrich = False + + + + + + + + + + + + + + \ No newline at end of file diff --git a/light_version/__init__.py b/v1 copy/__init__.py similarity index 100% rename from light_version/__init__.py rename to v1 copy/__init__.py diff --git a/v1/Dataset.py b/v1/Dataset.py new file mode 100644 index 0000000..c2a0ed8 --- /dev/null +++ b/v1/Dataset.py @@ -0,0 +1,135 @@ +import os +import pandas as pd +import torch +import numpy as np +from enum import Enum +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms + +class MyDataset(Dataset): + + training_image_trasformation_parameter = { + "crop":{ + "size": 224, + "scale": (0.08,1.0), + "ratio": (3. / 4., 4. 
/ 3.), + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + evaluation_image_trasformation_parameter = { + "crop":{ + "size": 256, + "center": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files + + Args: + directory_of_data (str): [description] + """ + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + self.directory_of_data = directory_of_data + + def get_fraction_of_dataset(self, percentage: int): + _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + _temp_df_copy = _temp_df_moved.copy() + return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self): + words = [] + for idx,row in self._dataset.iterrows(): + for word in re.findall("[\\w]+|\.|\,", row["comment"].lower()): + if word not in words: + words.append(word) + return words + + def __len__(self): + return self._dataset.shape[0] + + def __getitem__(self, idx): + + image, caption = Image.open(f"{self.directory_of_data}/flickr30k_images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ + re.findall("[\\w]+|\.|\,", self._dataset.iloc[idx]["comment"].lower()) + + return image, caption + + def pack_minibatch_training(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.RandomResizedCrop(MyDataset.training_image_trasformation_parameter["crop"]["size"], scale=MyDataset.training_image_trasformation_parameter["crop"]["scale"], ratio=MyDataset.training_image_trasformation_parameter["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.training_image_trasformation_parameter["mean"], std=MyDataset.training_image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). 
+        images = torch.stack(images, 0) # (Batch Size, Color, Height, Width)
+
+        captions_length = [len(caption) for caption in captions]  # (Batch Size,)
+
+        captions_training_ids = [vocabulary.translate(caption,"uncomplete") for caption in captions]  # (Batch Size, Caption)
+
+        captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        captions_training_ids = nn.utils.rnn.pad_sequence(captions_training_ids, padding_value=0, batch_first=True)
+
+        captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True)
+
+        return images,captions_training_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor)
+
+    def pack_minibatch_evaluation(self, data, vocabulary):
+
+        # Sort a data list by caption length (descending order).
+        data.sort(key=lambda x: len(x[1]), reverse=True)
+
+        images, captions = zip(*data)
+
+        operations = transforms.Compose([
+            transforms.Resize(MyDataset.evaluation_image_trasformation_parameter["crop"]["size"]),
+            transforms.CenterCrop(MyDataset.evaluation_image_trasformation_parameter["crop"]["center"]), # Crops the given image at the center.
+            transforms.ToTensor(),
+            transforms.Normalize(mean=MyDataset.evaluation_image_trasformation_parameter["mean"], std=MyDataset.evaluation_image_trasformation_parameter["std_dev"])
+        ])
+
+        images = list(map(lambda image: operations(image),list(images)))
+
+        # Merge images (from tuple of 3D tensor to 4D tensor).
+        images = torch.stack(images, 0) # (Batch Size, Color, Height, Width)
+
+        captions_evaluation_ids = [vocabulary.translate(caption,"uncomplete") for caption in captions]  # (Batch Size, Caption)
+
+        captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        captions_evaluation_ids = nn.utils.rnn.pad_sequence(captions_evaluation_ids, padding_value=0, batch_first=True)
+
+        captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True)
+
+        return images,captions_evaluation_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor)
+
\ No newline at end of file
diff --git a/v1/NeuralNet.py b/v1/NeuralNet.py
new file mode 100644
index 0000000..9ac00c9
--- /dev/null
+++ b/v1/NeuralNet.py
@@ -0,0 +1,345 @@
+#####################################################
+## DISCLAIMER: THE CODE IS ESSENTIALLY HARDCODED AND FOR TESTING ONLY; IT DOES NOT FOLLOW PROPER SGD PRACTICE, I AM ONLY TRYING TO SEE WHETHER IT WORKS!
+# PLEASE DO NOT JUDGE IT, I KNOW IT SHOULD NOT BE DONE THIS WAY; I WILL FIX IT LATER :)
+##
+##
+## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+
+
+import torch
+import torch.nn as nn
+import torchvision.models as models
+from torch.nn.utils.rnn import pack_padded_sequence
+import torch.nn.functional as F
+
+device="cpu"
+class EncoderCNN(nn.Module):
+    def __init__(self, projection_size):
+        super(EncoderCNN, self).__init__()
+        resnet = models.resnet50(pretrained=True)
+        for param in resnet.parameters():
+            param.requires_grad_(False)
+
+        modules = list(resnet.children())[:-1] # remove last fc layer
+        self.resnet = nn.Sequential(*modules)
+        self.linear = nn.Linear(resnet.fc.in_features, projection_size)
+
+    def forward(self, images):
+        features = self.resnet(images)
+        features = features.reshape(features.size(0), -1)  # (Batch Size, Embedding Dim.)
+        features = self.linear(features)
+        return features
+
+class DecoderRNN(nn.Module):
+    def __init__(self, hidden_size, padding_index, vocab_size, embedding_size):
+        """Set up the decoder layers.
+
+        Args:
+            hidden_size (int): size of the LSTM hidden state
+            padding_index (int): id of the padding token in the vocabulary
+            vocab_size (int): number of words in the vocabulary
+            embedding_size (int): size of the word embeddings
+        """
+        super(DecoderRNN, self).__init__()
+
+        # Embedding layer that turns words into a vector of a specified size
+        self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index)
+
+        # The LSTM cell takes embedded word vectors (of a specified size) as input
+        # and outputs hidden states of size hidden_size
+        self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size)
+
+        # The linear layer that maps the hidden state output dimension
+        # to the number of words we want as output, vocab_size
+        self.linear_1 = nn.Linear(hidden_size, vocab_size)
+
+
+    def forward(self, features, captions):
+        """Compute the logits over the vocabulary for each time step (teacher forcing).
+
+        Args:
+            features (torch.tensor(batch_size, embedding_size)): image features projected by the encoder
+            captions (torch.tensor(batch_size, max_captions_length)): caption ids, without the end token
+
+        Returns:
+            torch.tensor(batch_size, max_captions_length + 1, vocab_size): logits at each time step (the first step is deterministic)
+        """
+
+        # Initialize the hidden state
+        batch_size = features.shape[0]  # features is of shape (batch_size, embed_size)
+
+        # Create an embedded word vector for each word in the captions
+        inputs = self.word_embeddings(captions)  # In: (batch_size, captions_length) -> Out: (batch_size, captions_length, embed_size)
+
+        # Feed the LSTMCell with the image features and retrieve the state
+
+        _h, _c = self.lstm_unit(features)  # _h : (Batch size, Hidden size)
+
+        # Deterministic output as the first word of the caption :)
+        start = torch.zeros(self.word_embeddings.num_embeddings)
+        start[1] = 1
+        outputs = start.repeat(batch_size,1,1).to(torch.device(device))  # replicate the one-hot start-token output for every element of the batch
+
+
+        # How does the loop work?
+        # For each time step t \in {0, N-1}, where N is the caption length.
+
+        # Since the sequences are padded, how is the forward pass performed, given that the <PAD> tokens do not need to be fed as input?
+        # The assumption is that the captions are of length N-1, i.e. the captions provided externally as input do not contain the <EOS> token.
+
+        for idx in range(0,inputs.shape[1]):
+            _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c))
+            _outputs = self.linear_1(_h)
+            outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1)
+
+        return outputs  # (Batch Size, N, |Vocabulary|)
+
+    def generate_caption(self, features, max_caption_length):
+        """Generate captions for given image features using greedy search."""
+
+        sampled_ids = [torch.tensor([1]).to(torch.device(device))]  # Hardcoded: 1 is the id of the start token
+        input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1))
+        with torch.no_grad():
+            _h ,_c = self.lstm_unit(features.unsqueeze(0))
+            for _ in range(max_caption_length-1):
+                _h, _c = self.lstm_unit(input, (_h ,_c))  # _h: (1, hidden_size)
+                outputs = self.linear_1(_h)  # outputs: (1, vocab_size)
+                _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1)  # predicted: (1,)
+                sampled_ids.append(predicted)
+                input = self.word_embeddings(predicted)  # input: (1, embed_size)
+                input = input.to(torch.device(device))
+                if predicted == 2:  # end token generated, stop
+                    break
+        sampled_ids = torch.stack(sampled_ids, 1)  # sampled_ids: (1, caption_length)
+        return sampled_ids
+
+
+class CaRNet1(nn.Module):
+
+    def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"):
+        """Build the encoder-decoder captioning network.
+
+        Args:
+            hidden_size (int): size of the LSTM hidden state
+            padding_index (int): id of the padding token in the vocabulary
+            vocab_size (int): number of words in the vocabulary
+            embedding_size (int): size of the word embeddings (and of the image projection)
+            device (str): device on which the network is placed
+        """
+        super(CaRNet1, self).__init__()
+        self.padding_index = padding_index
+        self.device = torch.device(device)
+        self.C = EncoderCNN(embedding_size)
+        self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size)
+
+        self.C.to(self.device)
+        self.R.to(self.device)
+
+    def save(self, file_name):
+        """Save the classifier."""
+        torch.save(self.C.state_dict(), f".saved/v1/{file_name}_C.pth")
+        torch.save(self.R.state_dict(), f".saved/v1/{file_name}_R.pth")
+
+    def load(self, file_name):
+        """Load the classifier."""
+
+        # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device)
+        self.C.load_state_dict(torch.load(f".saved/v1/{file_name}_C.pth", map_location=self.device))
+        self.R.load_state_dict(torch.load(f".saved/v1/{file_name}_R.pth", map_location=self.device))
+
+    def forward(self,images,captions):
+        features = self.C(images)
+        return self.R(features, captions)
+
+    def __accuracy(self, outputs, labels):
+        """Compute the average caption accuracy over a mini-batch.
+
+        Args:
+            outputs (torch.tensor(batch_size, max_captions_length)): generated caption ids, padded
+            labels (torch.tensor(batch_size, max_captions_length)): target caption ids, padded
+
+        Returns:
+            float: mean fraction of matching ids per caption
+        """
+        # Assume outputs and labels have the same shape and are already padded.
+        # We compare the output ids with the target ids element-wise: every position where they differ is a mismatch!
+        # This technique covers all the major cases:
+        # 1) The output caption is longer than expected:  Output.ID != <PAD>.ID
+        # 2) The output caption is shorter than expected: <PAD>.ID  != Target.ID
+        # 3) Same length but a different word:            Output.ID != Target.ID
+        # Hp. 1 : Output.ID == Target.ID == <PAD> has to be considered a good match, because it means that both output and target end before this position.
+        # Hp. 2 : The first word is dropped from both outputs and targets, because it is produced in a deterministic fashion :)
+        # computing the accuracy
+
+        right_predictions = torch.eq(outputs[:,1:], labels[:,1:])
+        acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item()  # Accuracy = TP+TN / ALL
+        return acc
+
+    # TO DO: Should I use a confusion matrix?
+
+    def train(self, train_set, validation_set, lr, epochs, vocabulary):
+
+        criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \
+                        else nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum")
+
+        # initializing some elements
+        best_val_acc = -1.  # the best accuracy computed on the validation data
+        best_epoch = -1  # the epoch in which the best accuracy above was computed
+
+        # ensuring the classifier is in 'train' mode (pytorch)
+        self.C.train()
+        self.R.train()
+
+        # creating the optimizer
+        optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr)
+
+        # loop on epochs!
+        for e in range(0, epochs):
+
+            # epoch-level stats (computed by accumulating mini-batch stats)
+            epoch_train_acc = 0.
+            epoch_train_loss = 0.
+            epoch_num_train_examples = 0
+
+            for images,captions_training_ids,captions_target_ids in train_set:
+                optimizer.zero_grad()
+
+                batch_num_train_examples = images.shape[0]  # mini-batch size (it might be different from 'batch_size')
+                epoch_num_train_examples += batch_num_train_examples
+
+                images = images.to(self.device)
+                captions_training_ids = captions_training_ids.to(self.device)  # (B, L): start token + caption, without end token
+                captions_target_ids = captions_target_ids.to(self.device)      # (B, L+1): start token + caption + end token
+
+                # computing the network output on the current mini-batch
+                features = self.C(images)
+                outputs = self.R(features, captions_training_ids)  # outputs > (B, L+1, |V|)
+
+                # (B, L+1, |V|) -> (B * (L+1), |V|) and captions > (B * (L+1))
+                loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1))
+
+                # computing gradients and updating the network weights
+                loss.backward()  # computing gradients
+                optimizer.step()  # updating weights
+
+                # with torch.no_grad():
+                #     self.C.eval()
+                #     self.R.eval()
+                #     features = self.C(images)
+                #     import random
+                #     numb = random.randint(0,2)
+                #     caption = self.R.generate_caption(features[numb],30)
+                #     print(vocabulary.rev_translate(captions_target_ids[numb]))
+                #     print(vocabulary.rev_translate(caption[0]))
+                #     self.C.train()
+                #     self.R.train()
+
+                with torch.no_grad():
+                    self.C.eval()
+                    self.R.eval()
+
+                    # Compute captions as ids for all the training images
+                    projections = self.C(images)
+
+                    captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device))
+
+                    for idx,projection in enumerate(range(projections.shape[0])):
+                        _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1])
+                        captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad
+                        # Fill the remaining portion of the generated caption with zeros, if needed.
+                        # Accuracy is not altered: if the generated caption is shorter than the (padded) captions_target_ids, filling it with the padding id is valid.
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + self.C.train() + self.R.train() + + # printing (mini-batch related) stats on screen + print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) + + val_acc = self.eval_classifier(validation_set) + + # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("CaRNetv1") + + epoch_train_loss /= epoch_num_train_examples + + # printing (epoch related) stats on screen + print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" + + (", BEST!" if best_epoch == e + 1 else "")) + .format(e + 1, epochs, epoch_train_loss, + epoch_train_acc / epoch_num_train_examples, val_acc)) + + def eval_classifier(self, data_set): + """Evaluate the classifier on the given data set.""" + + # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) + training_mode_originally_on = self.C.training and self.R.training + if training_mode_originally_on: + self.C.eval() + self.R.eval() # enforcing evaluation mode + + + + with torch.no_grad(): # keeping off the autograd engine + + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,_,captions_validation_target_ids in data_set: + images = images.to(self.device) + + captions_validation_target_ids = captions_validation_target_ids.to(self.device) + + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) + + if training_mode_originally_on: + self.C.train() # restoring the training state, if needed + self.R.train() + return acc +# Example of usage +if __name__ == "__main__": + from Vocabulary import Vocabulary + from Dataset import MyDataset + from torch.utils.data import DataLoader + ds = MyDataset("./dataset/flickr30k_images/", percentage=8) + v = Vocabulary(ds,reload=True) + dc = ds.get_fraction_of_dataset(percentage=70) + df = ds.get_fraction_of_dataset(percentage=30) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=100, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=50, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") + net.load("CaRNetv1") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/v1/Vocabulary.py b/v1/Vocabulary.py new file mode 100644 index 0000000..725945a --- /dev/null +++ b/v1/Vocabulary.py @@ -0,0 +1,131 @@ +import os +import torch +import warnings +from Dataset import MyDataset +from typing import List + +class Vocabulary(): + # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d + # each word is represented by a record in a dataframe with this structure + + + def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): + + self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True + self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True + # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists + if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: + self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") + self.word2id = torch.load(".saved/rich_word2id_v1.pt") + self.enriched = True + return + + # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + self.embeddings = torch.eye(self.dictionary_length) + + def predefined_token_idx(self) -> dict: + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. 
+ + Args: + word_sequence (str): [description] + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (str): [description] + """ + # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total number of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + v = Vocabulary(verbose=True) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + # Enrich the vocabulary + v.make_enrich = True + dataset = ["I","Like","PLay","PIPPOplutopaperino"] + v.enrich(dataset) + v.make_enrich = False + # Enrich the vocabulary with a bulk insert + v.make_enrich = True + dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] + v.bulk_enrich(dataset) + v.make_enrich = False + + + + + + + + + + + + + + \ No newline at end of file diff --git a/v1/__init__.py b/v1/__init__.py new file mode 100644 index 0000000..e69de29
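
Editor's note (not part of the patch above): the comments in CaRNet1.__accuracy describe accuracy as an element-wise comparison of padded caption-id tensors, with the first column dropped because it is produced deterministically. The following is a minimal, standalone Python sketch of that check, useful for verifying the described behaviour; the function name and the toy id tensors are made up here, and the ids only follow the convention used in the diff (0 = padding, 1 = start, 2 = end).

import torch

def caption_accuracy(outputs: torch.Tensor, targets: torch.Tensor) -> float:
    """Mean per-caption fraction of matching ids, ignoring the first (deterministic) column."""
    # Element-wise comparison of padded id tensors; positions where one caption is
    # shorter than the other are compared against the padding id (0), so length
    # mismatches are automatically counted as errors.
    right_predictions = torch.eq(outputs[:, 1:], targets[:, 1:])
    return torch.mean(right_predictions.to(torch.float32).sum(dim=1) / right_predictions.shape[1]).item()

if __name__ == "__main__":
    # Two generated captions vs. their padded targets (batch of 2, max length 6).
    outputs = torch.tensor([[1, 5, 7, 2, 0, 0],
                            [1, 4, 4, 9, 2, 0]])
    targets = torch.tensor([[1, 5, 8, 2, 0, 0],
                            [1, 4, 4, 2, 0, 0]])
    print(caption_accuracy(outputs, targets))  # (4/5 + 3/5) / 2 = 0.7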