import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

device = "cuda:0"
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad_(False)  # freeze the pretrained backbone

        modules = list(resnet.children())[:-1]  # remove the last fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)

    def forward(self, images):
        features = self.resnet(images)                      # (batch_size, 2048, 1, 1)
        features = features.reshape(features.size(0), -1)   # (batch_size, 2048)
        features = self.linear(features)                    # (batch_size, embed_size)
        return features
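
# A minimal sanity check for the encoder: not part of the original pipeline,
# just a hedged sketch assuming 224x224 RGB inputs (the standard ResNet size).
def _demo_encoder():
    encoder = EncoderCNN(embed_size=50)
    images = torch.randn(2, 3, 224, 224)  # dummy mini-batch of 2 images
    features = encoder(images)
    assert features.shape == (2, 50)
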
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, padding_index, vocab_size, embeddings):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        # Keep track of hidden_size for initialization of the hidden state
        self.hidden_size = hidden_size

        # Embedding layer that turns word ids into pretrained, frozen word vectors
        self.word_embeddings = nn.Embedding.from_pretrained(embeddings,
                                                            freeze=True,
                                                            padding_idx=padding_index)

        # The LSTM takes embedded word vectors as input and
        # outputs hidden states of size hidden_size
        self.lstm = nn.LSTM(input_size=embeddings.shape[1],  # embedding dimension
                            hidden_size=hidden_size,         # LSTM hidden units
                            num_layers=1,                    # number of LSTM layers
                            batch_first=True,                # batch size is the 1st dimension
                            dropout=0,                       # no dropout
                            bidirectional=False)             # unidirectional LSTM

        # The linear layer that maps the hidden state output dimension
        # to the number of words we want as output, vocab_size
        self.linear_1 = nn.Linear(hidden_size, vocab_size)
    def init_hidden_state(self, encoder_out):
        """
        Create the initial hidden and cell states for the decoder's LSTM from the
        encoded images. Note: this is currently unused by forward(), which lets
        the LSTM default to zero states, and it only produces valid states when
        the encoder output size equals the LSTM hidden size.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, encoder_dim)
        :return: hidden state and cell state, each of dimension (1, batch_size, encoder_dim)
        """
        h = encoder_out.reshape((1, encoder_out.shape[0], encoder_out.shape[1]))
        c = encoder_out.reshape((1, encoder_out.shape[0], encoder_out.shape[1]))
        return h, c
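
    # Hedged shape sketch (hypothetical sizes, assuming encoder_dim == hidden_size):
    #   decoder = DecoderRNN(1024, 0, vocab_size, embeddings)
    #   h, c = decoder.init_hidden_state(torch.randn(4, 1024))  # each (1, 4, 1024)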

    def forward(self, features, captions, caption_lengths):
        """Define the feedforward behavior of the model (teacher forcing)."""

        # features is of shape (batch_size, embed_size)
        batch_size = features.shape[0]

        # Create embedded word vectors for each word in the captions
        inputs = self.word_embeddings(captions)  # (batch_size, caption_length, embed_size)

        # Prepend the image features as the first "input word" of each sequence,
        # then run the LSTM over the full sequence; (h_0, c_0) defaults to zeros
        # when not provided.
        # h, c = self.init_hidden_state(features)
        inputs = torch.cat((features.unsqueeze(1), inputs), dim=1)
        lstm_out, self.hidden = self.lstm(inputs)  # (batch_size, caption_length + 1, hidden_size)

        # Drop the output of the image-feature step so outputs align with the captions
        lstm_out = lstm_out[:, 1:, :]

        # Fully connected layer
        outputs = self.linear_1(lstm_out)  # (batch_size, caption_length, vocab_size)

        return outputs

    def sample(self, features):
        """Generate a caption for given image features using greedy search."""

        sampled_ids = []
        # Token id 1 is assumed to be the <start> token, id 2 the <end> token
        inputs = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1, 1, -1))
        with torch.no_grad():
            # Feed the image features first to condition the LSTM state
            _, state = self.lstm(features.reshape(1, 1, -1))
            for _ in range(15):  # maximum caption length
                hiddens, state = self.lstm(inputs, state)        # hiddens: (batch_size, 1, hidden_size)
                outputs = self.linear_1(hiddens.squeeze(1))      # outputs: (batch_size, vocab_size)
                _, predicted = F.softmax(outputs, dim=1).max(1)  # predicted: (batch_size)
                sampled_ids.append(predicted)
                inputs = self.word_embeddings(predicted)               # (batch_size, embed_size)
                inputs = inputs.unsqueeze(1).to(torch.device(device))  # (batch_size, 1, embed_size)
                if predicted == 2:  # stop at the <end> token
                    break
        sampled_ids = torch.stack(sampled_ids, 1)  # sampled_ids: (batch_size, seq_length)
        return sampled_ids
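
    # Hedged usage sketch (assumes a trained encoder/decoder living on `device`
    # and the 1/2 ids for <start>/<end> above):
    #   features = encoder(images.to(device))  # (batch_size, embed_size)
    #   ids = decoder.sample(features[0])      # (1, seq_length) word ids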

    def save(self, file_name):
        """Save the decoder."""

        torch.save(self.state_dict(), file_name)

    def load(self, file_name):
        """Load the decoder."""

        # since the decoder is a nn.Module, we can load it using pytorch
        # facilities (mapping it to the right device)
        self.load_state_dict(torch.load(file_name, map_location=torch.device(device)))
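
# A CPU-only sanity check for the decoder's teacher-forcing forward pass: a
# hedged sketch with hypothetical sizes, not part of the original pipeline.
def _demo_decoder_forward():
    vocab_size, embed_dim = 100, 50
    embeddings = torch.randn(vocab_size, embed_dim)  # dummy pretrained embeddings
    decoder = DecoderRNN(hidden_size=1024, padding_index=0,
                         vocab_size=vocab_size, embeddings=embeddings)
    features = torch.randn(2, embed_dim)             # dummy encoder output
    captions = torch.randint(0, vocab_size, (2, 7))  # dummy captions of length 7
    outputs = decoder(features, captions, caption_lengths=[7, 7])
    assert outputs.shape == (2, 7, vocab_size)
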
def train(train_set, validation_set, lr, epochs, vocabulary):
    device_t = torch.device(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction="sum").to(device_t)

    # initializing some elements
    best_val_acc = -1.  # the best accuracy computed on the validation data
    best_epoch = -1     # the epoch in which the best accuracy above was computed

    encoder = EncoderCNN(50)
    decoder = DecoderRNN(1024, 0, len(vocabulary.word2id.keys()), vocabulary.embeddings)

    encoder.to(device_t)
    decoder.to(device_t)

    # ensuring the decoder is in 'train' mode (pytorch)
    decoder.train()

    # creating the optimizer: the frozen ResNet is excluded, so only the decoder
    # and the encoder's projection layer are updated
    optimizer = torch.optim.Adam(list(decoder.parameters()) + list(encoder.linear.parameters()), lr)

    # loop on epochs!
    for e in range(0, epochs):

        # epoch-level stats (computed by accumulating mini-batch stats)
        epoch_train_acc = 0.
        epoch_train_loss = 0.
        epoch_num_train_examples = 0

        for images, captions, captions_length, captions_training in train_set:
            # zeroing the memory areas that were storing previously computed gradients
            optimizer.zero_grad()

            batch_num_train_examples = images.shape[0]  # mini-batch size (it might differ from 'batch_size')
            epoch_num_train_examples += batch_num_train_examples

            lengths = torch.tensor(captions_length, dtype=torch.long)

            lengths = lengths.to(device_t)
            images = images.to(device_t)
            captions = captions.to(device_t)                    # captions > (B, L)
            captions_training = captions_training.to(device_t)  # captions > (B, |L|-1) without end token

            # computing the network output on the current mini-batch
            features = encoder(images)
            outputs = decoder(features, captions, lengths)  # outputs > (B, L, |V|)

            # (B, L, |V|) -> (B * L, |V|) and captions > (B * L)
            loss = criterion(outputs.reshape((-1, outputs.shape[2])), captions.reshape(-1))

            # computing gradients and updating the network weights
            loss.backward()   # computing gradients
            optimizer.step()  # updating weights

            print(f"mini-batch:\tloss={loss.item():.4f}")

            # qualitative check: caption a random image of the current mini-batch
            with torch.no_grad():
                decoder.eval()
                encoder.eval()
                features = encoder(images)
                numb = random.randint(0, 2)
                caption = decoder.sample(features[numb])
                print(vocabulary.rev_translate(captions[numb]))
                print(vocabulary.rev_translate(caption[0]))
                decoder.train()
                encoder.train()
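
# Hedged mini-example of the loss computation used in train() (hypothetical
# sizes): CrossEntropyLoss expects (N, C) logits and (N,) targets, so the
# (B, L, |V|) outputs and the (B, L) captions are flattened first.
def _demo_loss_reshape():
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction="sum")
    outputs = torch.randn(2, 5, 100)          # (B, L, |V|) logits
    captions = torch.randint(0, 100, (2, 5))  # (B, L) target word ids
    return criterion(outputs.reshape((-1, outputs.shape[2])), captions.reshape(-1))
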
# Example of usage
if __name__ == "__main__":
    from Vocabulary import Vocabulary
    from Dataset import MyDataset
    from torch.utils.data import DataLoader

    ds = MyDataset("./dataset", percentage=1)
    ds = ds.get_fraction_of_dataset(percentage=100)
    # use dataloader facilities, which require a preprocessed dataset
    v = Vocabulary(ds, reload=True)

    dataloader = DataLoader(ds, batch_size=30, shuffle=True, num_workers=4,
                            collate_fn=lambda data: ds.pack_minibatch_training(data, v))

    train(dataloader, dataloader, 1e-3, 400, v)