diff --git a/.gitignore b/.gitignore index 8a4a0ca..6966b61 100644 --- a/.gitignore +++ b/.gitignore @@ -119,3 +119,4 @@ dmypy.json dataset/* .saved/* *.Identifier +*.zip diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..4107551 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "/home/christian/Documenti/GitHub/Image-Captioning/v1/NeuralNet.py", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/light_version/Dataset.py b/bck_old/Dataset.py similarity index 99% rename from light_version/Dataset.py rename to bck_old/Dataset.py index 19ce07d..3cd200c 100644 --- a/light_version/Dataset.py +++ b/bck_old/Dataset.py @@ -117,6 +117,7 @@ def pack_minibatch_evaluation(self, data): images = torch.stack(images, 0) caption_lengths = [len(caption) for caption in captions] + captions captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) return images,captions.type(torch.LongTensor),caption_lengths \ No newline at end of file diff --git a/light_version/NeuralNet.py b/bck_old/NeuralNet.py similarity index 90% rename from light_version/NeuralNet.py rename to bck_old/NeuralNet.py index 08256b4..a72c6b7 100644 --- a/light_version/NeuralNet.py +++ b/bck_old/NeuralNet.py @@ -9,7 +9,7 @@ device = "cuda:0" class EncoderCNN(nn.Module): - def __init__(self, embed_size): + def __init__(self, embedding_size): super(EncoderCNN, self).__init__() resnet = models.resnet50(pretrained=True) for param in resnet.parameters(): @@ -17,38 +17,29 @@ def __init__(self, embed_size): modules = list(resnet.children())[:-1] # remove last fc layer self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, 50) + self.linear = nn.Linear(resnet.fc.in_features, embedding_size) def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) + features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.)
features = self.linear(features) return features class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embeddings ): + def __init__(self, hidden_size, padding_index, vocab_size, embeddings, embedding_size): """Set the hyper-parameters and build the layers.""" super(DecoderRNN, self).__init__() - # Keep track of hidden_size for initialization of hidden state - self.hidden_size = hidden_size # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True, padding_idx = 0) + self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) # The LSTM takes embedded word vectors (of a specified size) as input # and outputs hidden states of size hidden_dim - self.lstm = nn.LSTM(input_size=50, \ - hidden_size=1024, # LSTM hidden units - num_layers=1, # number of LSTM layer - batch_first=True, # input & output will have batch size as 1st dimension - dropout=0, # Not applying dropout - bidirectional=False, # unidirectional LSTM - ) + self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) # The linear layer that maps the hidden state output dimension # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(1024, vocab_size) + self.linear_1 = nn.Linear(hidden_size, vocab_size) def init_hidden_state(self, encoder_out): """ diff --git a/light_version/Vocabulary.py b/bck_old/Vocabulary.py similarity index 100% rename from light_version/Vocabulary.py rename to bck_old/Vocabulary.py diff --git a/heavy_version/Models/Dataset.py b/heavy_version/Models/Dataset.py deleted file mode 100644 index b0cac76..0000000 --- a/heavy_version/Models/Dataset.py +++ /dev/null @@ -1,134 +0,0 @@ -from xml.dom import ValidationErr - -from Sample import Sample -import os -import pandas as pd -import torch -import numpy as np -from enum import Enum -from torch.utils.data import Dataset, DataLoader -import torch.nn as nn - -class DatasetState(Enum): - """A dataset could be in 3 possible, mutual exclusive, state: - - Raw -> Sample are raw, no preprocessing operation performed - - Training -> All Samples are pre-processed for training - - Evaluation -> All Samples are pre-processed for evaluation - - Args: - Enum (int): Raw or Training or Evaluation - """ - Raw = 0 - Training = 1 - Evaluation = 2 - -# TO-Do -# Aggiungere a README, la modalita` in cui si elabora il dataset e`: ho una cartella il cui contenuto e`: -# 1) un file result.csv che contiene i dati come il formato gia` definito -# 2) una cartella images nella quale ci sono tutte le immagini, tutte le sottocartelle di images non verranno considerate - -class MyDataset(Dataset): - # The dataset will have this shape - # | id_sample | sample | dirty | - # |-----------|--------|-------| - # | | | | - # | | | | - # | | | | - # - # id_sample is an unique identifier of the sample - # sample is the object associated - # dirty is boolean and it means: this sample was already taken from the method get_fraction_of_dataset, this implies that externally somebody already taken this sample. 
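For orientation, the bookkeeping described in the comment above is just a three-column table plus a dirty flag: once get_fraction_of_dataset hands a sample out, its row is marked so later calls skip it. A minimal sketch with illustrative stand-in data (plain file names instead of Sample objects):

```python
# Minimal sketch of the sample bookkeeping table described above (illustrative data,
# plain strings standing in for Sample objects).
import pandas as pd

df = pd.DataFrame({
    "id_sample": [0, 1, 2, 3],
    "sample": ["img_0.jpg", "img_1.jpg", "img_2.jpg", "img_3.jpg"],
    "dirty": [False, False, False, False],
})

# Take a fraction of the rows that were never handed out before...
clean = df[df["dirty"] == False]
taken = clean.head(2)

# ...and flag them as dirty so a later call does not return them again.
df.loc[taken.index, "dirty"] = True
print(df)
```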
- - def __init__(self, directory_of_data:str = None, percentage:int = 100,already_computed_dataframe: pd.DataFrame = None, state: DatasetState = DatasetState.Raw): - """Create a new dataset from source files - - Args: - directory_of_data (str): [description] - """ - self.state: DatasetState = state - if already_computed_dataframe is not None: - self.dataset = already_computed_dataframe - return - - if not os.path.exists(directory_of_data): - raise ValueError(f"{directory_of_data} not Exist!") - if not os.path.isdir(directory_of_data): - raise ValueError(f"{directory_of_data} is not a directory!") - - _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] - _temp_dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) - samples = _temp_dataset.apply(lambda row: Sample(int(row.name)+1,f"{directory_of_data}/images/{row.image_name}",row.comment),axis=1) - - self.dataset: pd.DataFrame = pd.DataFrame(list(zip([i for i in range(len(samples))],samples,[False for _ in range(len(samples))])), columns=["id_sample","sample","dirty"]) - - - def suffle_data_set(self): - self.dataset.apply(torch.randperm, axis=0) - - def get_fraction_of_dataset(self, percentage: int, also_dirty: bool = False): - if not also_dirty: - _temp_df = self.dataset[self.dataset["dirty"] == False] - _temp_df = _temp_df.apply(np.random.permutation, axis=0) - _temp_df_moved = _temp_df.head(int(len(_temp_df)*(percentage/100))) - _temp_df_copy = _temp_df_moved.copy() - self.dataset.loc[_temp_df_moved["id_sample"],"dirty"] = True - return MyDataset(already_computed_dataframe=_temp_df_copy) - - - def make_dirty(self) -> bool: - self.dataset["dirty"] = True - - def make_clean(self) -> bool: - self.dataset["dirty"] = False - - # torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods: - - # __len__ so that len(dataset) returns the size of the dataset. - # __getitem__ to support the indexing such that dataset[i] can be used to get i-ith sample. - - def __len__(self): - return self.dataset.shape[0] - - def __getitem__(self, idx): - - if self.state == DatasetState.Raw: - raise ValidationErr("The getitem built-in method cannot be executed when the dataset is in a RAW state.\n Please do some preprocessing on it before __getitem__ call.") - - - sample: Sample = self.dataset.iloc[idx]["sample"] - image, caption = sample.image, sample.caption - - return image,caption - - def pack_minibatch(self, data): - - # Sort a data list by caption length (descending order). - data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - # Merge images (from tuple of 3D tensor to 4D tensor). 
- images = torch.stack(images, 0) - - caption_lengths = [len(caption) for caption in captions] - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - return images,captions.type(torch.LongTensor),caption_lengths -#------------------------------- -# Usage - -if __name__ == "__main__": - from Vocabulary import Vocabulary - from PreProcess import PreProcess - ds = MyDataset("./dataset/flickr30k_images/flickr30k_images") - df = ds.get_fraction_of_dataset(percentage=10) - print("pippo") - - # use dataloader facilities which requires a preprocessed dataset - v = Vocabulary(verbose=True) - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - - dataloader = DataLoader(df, batch_size=4, - shuffle=False, num_workers=0, collate_fn=df.pack_minibatch) - - for i_batch,images,captions in enumerate(dataloader): - print(i_batch, captions) \ No newline at end of file diff --git a/heavy_version/Models/Interface/__init__.py b/heavy_version/Models/Interface/__init__.py deleted file mode 100644 index 16d05d6..0000000 --- a/heavy_version/Models/Interface/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Interfaces that allow to create multiple instances of an abstract class for different esigence. \ No newline at end of file diff --git a/heavy_version/Models/NeuralNet.py b/heavy_version/Models/NeuralNet.py deleted file mode 100644 index 5aec7b4..0000000 --- a/heavy_version/Models/NeuralNet.py +++ /dev/null @@ -1,220 +0,0 @@ -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -class EncoderCNN(nn.Module): - def __init__(self, embed_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.embed = nn.Linear(resnet.fc.in_features, embed_size) # attach a linear layer () - - def forward(self, images): - with torch.no_grad(): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) - features = self.embed(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embeddings ): - """Set the hyper-parameters and build the layers.""" - super(DecoderRNN, self).__init__() - # Keep track of hidden_size for initialization of hidden state - self.hidden_size = hidden_size - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True, padding_idx = 0) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm = nn.LSTM(input_size=50, \ - hidden_size=hidden_size, # LSTM hidden units - num_layers=1, # number of LSTM layer - batch_first=True, # input & output will have batch size as 1st dimension - dropout=0, # Not applying dropout - bidirectional=False, # unidirectional LSTM - ) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear = nn.Linear(hidden_size, vocab_size) - - def init_hidden(self, batch_size): - """ At the start of training, we need to initialize a hidden state; - there will be none because the hidden state is formed based on previously seen data. 
- So, this function defines a hidden state with all zeroes - The axes semantics are (num_layers, batch_size, hidden_dim) - """ - return (torch.zeros((1, batch_size, self.hidden_size)), \ - torch.zeros((1, batch_size, self.hidden_size))) - - - def forward(self, features, captions,caption_lengths): - """ Define the feedforward behavior of the model """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - self.hidden = self.init_hidden(batch_size) - - # Create embedded word vectors for each word in the captions - embeddings = self.word_embeddings(captions) # embeddings new shape : (batch_size, captions length -1, embed_size) - - # Stack the features and captions - embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1) # embeddings new shape : (batch_size, caption length, embed_size) - - packed = pack_padded_sequence(embeddings, caption_lengths, batch_first=True) - # Get the output and hidden state by passing the lstm over our word embeddings - # the lstm takes in our embeddings and hidden state - lstm_out, self.hidden = self.lstm(packed) # lstm_out shape : (batch_size, caption length, hidden_size) - - # Fully connected layer - outputs = self.linear(lstm_out[0]) # outputs shape : (batch_size, caption length, vocab_size) - - return outputs - - def sample(self, features, states=None): - """Generate captions for given image features using greedy search.""" - sampled_ids = [] - inputs = features.unsqueeze(1) - inputs = inputs.reshape((1,1,inputs.shape[0])) - self.init_hidden(1) - with torch.no_grad(): - for _ in range(30): - hiddens, states = self.lstm(inputs, states) # hiddens: (batch_size, 1, hidden_size) - outputs = self.linear(hiddens.squeeze(1)) # outputs: (batch_size, vocab_size) - _, predicted = outputs.max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - inputs = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - inputs = inputs.unsqueeze(1) # inputs: (batch_size, 1, embed_size) - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - -def save(self, file_name): - """Save the classifier.""" - - torch.save(self.net.state_dict(), file_name) - -def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.net.load_state_dict(torch.load(file_name, map_location=self.device)) - -def train(train_set, validation_set, lr, epochs, vocabulary): - device = torch.device("cuda:0") - criterion = nn.CrossEntropyLoss() - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - encoder = EncoderCNN(50) - decoder = DecoderRNN(2048,0,len(vocabulary.word2id.keys()),vocabulary.embeddings) - - encoder.to(device) - decoder.to(device) - - # ensuring the classifier is in 'train' mode (pytorch) - decoder.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(decoder.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images,captions,captions_length in train_set: - decoder.zero_grad() - encoder.zero_grad() - # zeroing the memory areas that were storing previously computed gradients - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - images = images.to(device) - captions_length = captions_length.to(device) - targets = targets.to(device) - - # computing the network output on the current mini-batch - features = encoder(images) - outputs = decoder(features, captions,captions_length) - - targets = pack_padded_sequence(captions, captions_length, batch_first=True)[0] - - # computing the loss function - try: - loss = criterion(outputs, targets) - except Exception as ex: - print(ex) - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - print(f"mini-batch:\tloss={loss.item():.4f}") - torch.save(decoder.state_dict(),".saved/decoder.pt") - features = encoder(images) - caption = decoder.sample(features[0]) - print(vocabulary.rev_translate(captions)) - print(vocabulary.rev_translate(caption)) - # computing the performance of the net on the current training mini-batch - # with torch.no_grad(): # keeping these operations out of those for which we will compute the gradient - # self.net.eval() # switching to eval mode - - # # computing performance - # batch_train_acc = self.__performance(outputs, y) - - # # accumulating performance measures to get a final estimate on the whole training set - # epoch_train_acc += batch_train_acc * batch_num_train_examples - - # # accumulating other stats - # epoch_train_loss += loss.item() * batch_num_train_examples - - # self.net.train() # going back to train mode - - # # printing (mini-batch related) stats on screen - # print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - # val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - # if val_acc > best_val_acc: - # best_val_acc = val_acc - # best_epoch = e + 1 - # self.save("classifier.pth") - - # epoch_train_loss /= epoch_num_train_examples - - # # printing (epoch related) stats on screen - # print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - # + (", BEST!" 
if best_epoch == e + 1 else "")) - # .format(e + 1, epochs, epoch_train_loss, - # epoch_train_acc / epoch_num_train_examples, val_acc)) - -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from PreProcess import PreProcess - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset", percentage=2) - df = ds.get_fraction_of_dataset(percentage=100) - - # use dataloader facilities which requires a preprocessed dataset - v = Vocabulary(verbose=True) - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - - dataloader = DataLoader(df, batch_size=10, - shuffle=True, num_workers=0, collate_fn=df.pack_minibatch) - - train(dataloader, dataloader, 1e-2, 10, v_enriched) \ No newline at end of file diff --git a/heavy_version/Models/PreProcess.py b/heavy_version/Models/PreProcess.py deleted file mode 100644 index 3a04af7..0000000 --- a/heavy_version/Models/PreProcess.py +++ /dev/null @@ -1,191 +0,0 @@ -from abc import ABC, abstractmethod -from enum import Enum -from PIL import Image -from torchvision import transforms -import torch -from Dataset import Dataset, DatasetState -from Sample import Sample -from Vocabulary import Vocabulary -import re -from typing import Tuple - -class ABCPreProcess(ABC): - """Class which implements preprocessing methods for a given object - """ - - @abstractmethod - def process(object_i, **parameters): - pass - - -class PreProcessImageForTraining(ABCPreProcess): - - @staticmethod - def process(object_i: Image, parameters) -> torch.FloatTensor: - """ - Function that pre-process an image for training. - - Args: - object_i (Image): [description], - parameters:{ - crop:{ - "size": (int), Expected output size of the crop, for each edge. - "scale: Tuple(float,float), :ower and upper bounds for the random area of the crop, before resizing. - "ratio": Tuple(float,float), Lower and upper bounds for the random aspect ratio of the crop, before resizing. - } - "mean": (float), - "std_dev": (float) - } - - Returns: - torch.FloatTensor: torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - """ - operations = transforms.Compose([ - transforms.RandomResizedCrop(parameters["crop"]["size"], scale=parameters["crop"]["scale"], ratio=parameters["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. - transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. - transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - transforms.Normalize(mean=parameters["mean"], std=parameters["std_dev"]), - ]) - return operations(object_i) - - - -class PreProcessImageForEvaluation(ABCPreProcess): - - @staticmethod - def process(object_i: Image, **parameters) -> torch.FloatTensor: - """Function that pre-process an image for evaluation. - Args: - object_i (Image): [description], - parameters:{ - crop:{ - "size": (int), Desired output size of the crop, for each edge. - "center: (int) Desired output size of the crop after centering - } - "mean": (float), - "std_dev": (float) - } - - Returns: - torch.FloatTensor: torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - """ - operations = transforms.Compose([ - transforms.Resize(parameters["crop"]["size"]), - transforms.CenterCrop(parameters["crop"]["center"]), # Crops the given image at the center. 
- transforms.ToTensor(), - transforms.Normalize(mean=parameters["mean"], std=parameters["std_dev"]), - ]) - return operations(object_i) - -class PreProcessCaption(ABCPreProcess): - - @staticmethod - def process(caption: str, **parameters) -> list[str]: - """Process a caption for being used in the network - - Args: - caption (str): The caption to be processed. - - Returns: - torch.tensor: A tensor - """ - tokenized_caption = re.findall("[\\w]+|\.|\,", caption.lower()) - return tokenized_caption - - -class PreProcessDatasetForTraining(ABCPreProcess): - - image_trasformation_parameter = { - "crop":{ - "size": 224, - "scale": (0.08,1.0), - "ratio": (3. / 4., 4. / 3.), - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - @staticmethod - def process(dataset: Dataset, vocabulary: Vocabulary) -> Tuple[Dataset,Vocabulary]: - - # Control block - if dataset.state == DatasetState.Training: - torch.warnings.warn("The Dataset is already prepared for Training, another pre-process training could lead to some inconsistence.") - - if dataset.state == DatasetState.Evaluation: - torch.warnings.warn("The Dataset is already prepared for Evaluation, pre-process for training could lead to some inconsistence.") - - # PreProcess block - for sample in dataset.dataset["sample"]: - sample.alter_caption(PreProcess.Caption.process(sample.caption)) - sample.alter_image(PreProcess.ImageForTraining.process(sample.image, PreProcessDatasetForTraining.image_trasformation_parameter)) - - # Enrich the vocabulary - vocabulary.make_enrich = True - vocabulary.bulk_enrich([sample.caption for sample in dataset.dataset["sample"][:]]) - vocabulary.make_enrich = False - - # Do the In Place Translation for the caption for each sample in the dataset - dataset.dataset.apply(lambda record: record["sample"].alter_caption(vocabulary.translate(record["sample"].caption)), axis=1) - - dataset.state = DatasetState.Training - return dataset, vocabulary - - -class PreProcessDatasetForEvaluation(ABCPreProcess): - - image_trasformation_parameter = { - "crop":{ - "size": 256, - "center": 224 - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - @staticmethod - def process(dataset: Dataset, vocabulary: Vocabulary) -> Tuple[Dataset,Vocabulary]: - - # Control block - if dataset.state == DatasetState.Training: - torch.warnings.warn("The Dataset is already prepared for Training, another pre-process training could lead to some inconsistence.") - - if dataset.state == DatasetState.Evaluation: - torch.warnings.warn("The Dataset is already prepared for Evaluation, pre-process for training could lead to some inconsistence.") - - # PreProcess block - for sample in dataset.dataset["sample"]: - sample.alter_caption(PreProcess.Caption.process(sample.caption)) - sample.alter_image(PreProcess.ImageForTraining.process(sample.image, PreProcessDatasetForTraining.image_trasformation_parameter)) - - - # Enrich the vocabulary - vocabulary.make_enrich = True - vocabulary.bulk_enrich([sample.caption for sample in dataset.dataset["sample"][:]]) - vocabulary.make_enrich = False - - # Do the In Place Translation for the caption for each sample in the dataset - dataset.dataset.apply(lambda record: 
record["sample"].alter_caption(vocabulary.translate(record["sample"].caption)), axis=1) - - dataset.state = DatasetState.Evaluation - return dataset, vocabulary - -class PreProcess(): - ImageForTraining = PreProcessImageForTraining - ImageForEvaluation = PreProcessImageForEvaluation - Caption = PreProcessCaption - DatasetForTraining = PreProcessDatasetForTraining - DatasetForEvaluation = PreProcessDatasetForTraining - -# ---------------------------------------------------------------- -# How to use - -if __name__ == '__main__': - - ds = Dataset("./dataset") - df = ds.get_fraction_of_dataset(percentage=10) - - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - - df_pre_processed,v_enriched = PreProcess.DatasetForTraining.process(dataset=df,vocabulary=v) - print(df_pre_processed) \ No newline at end of file diff --git a/heavy_version/Models/Sample.py b/heavy_version/Models/Sample.py deleted file mode 100644 index 22a07ab..0000000 --- a/heavy_version/Models/Sample.py +++ /dev/null @@ -1,90 +0,0 @@ -from PIL import Image -import os - -class Sample(): - """Model class for managing a sigle sample of the dataset. - """ - - def __init__(self, id: int, image_path_file: str, caption: str, verbose: bool = False): - """Constructor of a sample of the dataset. - - Args: - id (int): The id associated with this sample. - [Constraint: The id must be unique (Caller responsibility).] - [Constraint: The id must be greater than 0.] - image_path_file (str): The image path, in relative path format. (Assumed the main.py as the entry point) - caption (str): Raw caption associated with this sample. - [Constraint: Caption has to be a string with length greater than 1 characters.] - verbose (bool, optional): The class will be verbose if True. Defaults to False. - Raises: - FileNotFoundError: The given relative path to the image is invalid. - ValueError: The caption is invalid. - ValueError: The id is invalid. - """ - - # Validation of constructor parameters - if not os.path.isfile(image_path_file): - raise FileNotFoundError("The given path_file resemble a non-existing file.") - - if len(caption.strip()) <= 1: - raise ValueError(f"The caption has a length of {len(caption.strip())} characters, which is not supported.") - - if id <= 0: - raise ValueError(f"The id must be greater than 0. \n Given {id}.") - - if verbose: - print(f"Image path: {image_path_file}") - print(f"Loading..") - - # Loading the image - self._image = None - try: - self._image = Image.open(image_path_file).convert('RGB') # Load and convert to RGB - except Exception as e: - raise e - if self._image is None: - raise Exception("Could not load image.") - if verbose: - print("Ok.") - - # Loading the caption - self.caption = caption - - # Loading the id - self.id = id - - # Set verbosity - self._verbose = verbose - - # Tell externally if this sample is altered (pre-processed, or other..) if False, otherwise the data is inside are raw if True - self.is_raw = True - - @property - def image(self) -> Image: - """Getter of the image property - - Returns: - Image: The image object. 
- """ - return self._image - - def alter_image(self, altered_image): - """Alter the sample image by place a new one (could be the same but modified or a tensorial form of the image) - - Args: - image (object): The new image with possible differente representation - """ - self._image = altered_image - self.is_raw = False - - def alter_caption(self, altered_caption): - """Alter the caption, now could be a string as before or a list of string, ready for being processed by the NN. - - Args: - image (Image): The new image - """ - self.caption = altered_caption - self.is_raw = False - - - \ No newline at end of file diff --git a/heavy_version/Models/Vocabulary.py b/heavy_version/Models/Vocabulary.py deleted file mode 100644 index 49f56ae..0000000 --- a/heavy_version/Models/Vocabulary.py +++ /dev/null @@ -1,185 +0,0 @@ -import os -import torch -import warnings - -class Vocabulary(): - # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d - # each word is represented by a record in a dataframe with this structure - - - def __init__(self, verbose: bool = False): - - self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True - self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True - # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists - if os.path.exists(".saved/rich_embeddings.pt") and os.path.exists(".saved/rich_word2id.pt"): - self.embeddings = torch.load(".saved/rich_embeddings.pt") - self.word2id = torch.load(".saved/rich_word2id.pt") - self.enriched = True - return - - # Check if the base vocabulary(glove + PAD + SOS + EOS + UNK) already exists - if os.path.exists(".saved/base_embeddings.pt") and os.path.exists(".saved/base_word2id.pt"): - self.embeddings = torch.load(".saved/base_embeddings.pt") - self.word2id = torch.load(".saved/base_word2id.pt") - return - - # Since the constructor arrived here, we need to load for the 1st time the glove word embeddings - - self.word2id = {} - self.embeddings = torch.zeros((400004, 50)) # DIM1: Glove50 rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Embedding Size 50d - - # Initialize the token: - # , , , - self.word2id[""] = 0 - self.word2id[""] = 1 - self.word2id[""] = 2 - self.word2id[""] = 3 - - self.embeddings[self.word2id[""]] = torch.zeros(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - self.embeddings[self.word2id[""]] = torch.rand(50, dtype=torch.float32) - - counter = 4 - with open('.saved/glove.6B.50d.txt', 'r', encoding='utf-8') as _vocabulary_file: - for line in _vocabulary_file: - line = line.strip().split() - self.word2id[line[0]] = counter - self.embeddings[counter] = torch.tensor([float(dimension) for dimension in line[1:]], dtype=torch.float32) - counter += 1 - torch.save(self.embeddings,".saved/base_embeddings.pt") - torch.save(self.word2id,".saved/base_word2id.pt") - print("break") - - def predefined_token_idx(self) -> dict: - return { - "":0, - "":1, - "":2, - "":3 - } - - def translate(self, word_sequence : list[str]) -> torch.tensor: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. 
- if not self.enriched: - warnings.warn("The vocabulary is not enriched with dataset words that could be not in glove, pay attention to what you want to do with this representation.") - - # Initialize the translator - _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # +2 because of and token - _sequence[0] = self.word2id[""] - _sequence[-1] = self.word2id[""] - - counter = 1 # SKIP THE TOKEN - for word in word_sequence: - if word.lower() in self.word2id.keys(): - _sequence[counter] = self.word2id[word.lower()] - else: - _sequence[counter] = self.word2id[""] - counter += 1 - return _sequence - - def rev_translate(self, words_id : torch.tensor) -> list[str]: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. - return [list(self.word2id.keys())[idx] for idx in words_id[0,:].tolist()] # word_id (1,caption_length) - - - @property - def make_enrich(self): - return self._make_enrich - - @make_enrich.setter - def make_enrich(self, value: bool): - if not isinstance(value,bool): - raise TypeError("The value that you want to put on make_enrich is not a boolean. Pay attention!") - - if value is False: - if self.make_enrich and not self.enriched: # If before the setter call make_enrich was True, probably the vocabulary was enriched by somebody, so the vocabulary now is in the state Enriched - self.enriched = True - # The enriched version of the vocabulary need to be dumped in memory - torch.save(self.embeddings,".saved/rich_embeddings.pt") - torch.save(self.word2id,".saved/rich_word2id.pt") - self._make_enrich = False - else: - self._make_enrich = value - - def enrich(self, words: list[str]) -> bool: - - if not self.make_enrich: - raise ValueError(f"The vocabulary is not set to be enriched, before the enrichment set the flag 'make_enrich' to True.") - - _new_word = [] - for word in words: - if word.lower() in self.word2id.keys(): - continue - _new_word.append(word) - - if len(_new_word) == 0: - return False - - _enrichment_of_embedding = torch.zeros((len(_new_word),50)) - - _id_carry = len(self.word2id.keys()) # The new ids start from len(.) 
cause the ids start from 0 and not from 1 - - for number,word in enumerate(_new_word): - self.word2id[word.lower()] = _id_carry - _enrichment_of_embedding[number] = torch.rand(50, dtype=torch.float32) - _id_carry += 1 - - # Append the enrichment at the end of the embeddings_matrix - self.embeddings = torch.cat([self.embeddings,_enrichment_of_embedding], dim=0) - return True - - def bulk_enrich(self, sequences: list[list[str]]) -> bool: - _words_flatten = [word for sequence in sequences for word in sequence] # flatten a list of list, credits to wjandrea on stackoverflow <3 - return self.enrich(_words_flatten) - - - def __len__(self): - """The total number of words in this Vocabulary.""" - - return len(self.word2id.keys()) - - -# ---------------------------------------------------------------- -# Usage example - -if __name__ == '__main__': - #Load the vocabulary - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - # Enrich the vocabulary - v.make_enrich = True - dataset = ["I","Like","PLay","PIPPOplutopaperino"] - v.enrich(dataset) - v.make_enrich = False - # Enrich the vocabulary with a bulk insert - v.make_enrich = True - dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] - v.bulk_enrich(dataset) - v.make_enrich = False - - - - - - - - - - - - - - \ No newline at end of file diff --git a/heavy_version/Models/__init__.py b/heavy_version/Models/__init__.py deleted file mode 100644 index 0ae43be..0000000 --- a/heavy_version/Models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# All the Model which represents a specific entity in the framework of this project. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5e0b8da --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +certifi==2021.10.8 +charset-normalizer==2.0.10 +dataclasses==0.8 +idna==3.3 +kaggle==1.5.12 +numpy==1.19.5 +pandas==1.1.5 +Pillow==8.4.0 +pkg_resources==0.0.0 +python-dateutil==2.8.2 +python-slugify==5.0.2 +pytz==2021.3 +requests==2.27.1 +six==1.16.0 +text-unidecode==1.3 +torch==1.3.0+cu100 +torchvision==0.4.1+cu100 +tqdm==4.62.3 +typing_extensions==4.0.1 +urllib3==1.26.8 diff --git a/v1 copy/Dataset.py b/v1 copy/Dataset.py new file mode 100644 index 0000000..c2a0ed8 --- /dev/null +++ b/v1 copy/Dataset.py @@ -0,0 +1,135 @@ +import os +import pandas as pd +import torch +import numpy as np +from enum import Enum +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms + +class MyDataset(Dataset): + + training_image_trasformation_parameter = { + "crop":{ + "size": 224, + "scale": (0.08,1.0), + "ratio": (3. / 4., 4. 
/ 3.), + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + evaluation_image_trasformation_parameter = { + "crop":{ + "size": 256, + "center": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files + + Args: + directory_of_data (str): [description] + """ + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + self.directory_of_data = directory_of_data + + def get_fraction_of_dataset(self, percentage: int): + _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + _temp_df_copy = _temp_df_moved.copy() + return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self): + words = [] + for idx,row in self._dataset.iterrows(): + for word in re.findall("[\\w]+|\.|\,", row["comment"].lower()): + if word not in words: + words.append(word) + return words + + def __len__(self): + return self._dataset.shape[0] + + def __getitem__(self, idx): + + image, caption = Image.open(f"{self.directory_of_data}/flickr30k_images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ + re.findall("[\\w]+|\.|\,", self._dataset.iloc[idx]["comment"].lower()) + + return image, caption + + def pack_minibatch_training(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.RandomResizedCrop(MyDataset.training_image_trasformation_parameter["crop"]["size"], scale=MyDataset.training_image_trasformation_parameter["crop"]["scale"], ratio=MyDataset.training_image_trasformation_parameter["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.training_image_trasformation_parameter["mean"], std=MyDataset.training_image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). 
+ images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) + + captions_length = [len(caption) for caption in captions] # (Batch Size,) + + captions_training_ids = [vocabulary.translate(caption,"uncomplete")for caption in captions] # (Batch Size, Caption) + + captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions] + + captions_training_ids = nn.utils.rnn.pad_sequence(captions_training_ids, padding_value=0, batch_first=True) + + captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True) + + return images,captions_training_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor) + + def pack_minibatch_evaluation(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.Resize(MyDataset.evaluation_image_trasformation_parameter["crop"]["size"]), + transforms.CenterCrop(MyDataset.evaluation_image_trasformation_parameter["crop"]["center"]), # Crops the given image at the center. + transforms.ToTensor(), + transforms.Normalize(mean=MyDataset.evaluation_image_trasformation_parameter["mean"], std=MyDataset.evaluation_image_trasformation_parameter["std_dev"]) + ]) + + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). + images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) + + + captions_evaluation_ids = [vocabulary.translate(caption,"uncomplete")for caption in captions] # (Batch Size, Caption) + + captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions] + + captions_evaluation_ids = nn.utils.rnn.pad_sequence(captions_evaluation_ids, padding_value=0, batch_first=True) + + captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True) + + return images,captions_evaluation_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor) + \ No newline at end of file diff --git a/v1 copy/NeuralNet.py b/v1 copy/NeuralNet.py new file mode 100644 index 0000000..9ac00c9 --- /dev/null +++ b/v1 copy/NeuralNet.py @@ -0,0 +1,345 @@ +##################################################### +## DISCLAIMER: THE CODE IS ESSENTIALLY HARDCODED AND FOR TESTING ONLY; IT DOES NOT FOLLOW THE USUAL SGD CONVENTIONS, I AM JUST TRYING TO SEE WHETHER IT WORKS! +# PLEASE DON'T JUDGE IT, I KNOW IT SHOULD NOT BE DONE THIS WAY, I WILL FIX IT LATER :) +## +## +## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html + + +import torch +import torch.nn as nn +import torchvision.models as models +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F + +device="cpu" +class EncoderCNN(nn.Module): + def __init__(self, projection_size): + super(EncoderCNN, self).__init__() + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): + param.requires_grad_(False) + + modules = list(resnet.children())[:-1] # remove last fc layer + self.resnet = nn.Sequential(*modules) + self.linear = nn.Linear(resnet.fc.in_features, projection_size) + + def forward(self, images): + features = self.resnet(images) + features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.)
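As an aside on the EncoderCNN being added here: it follows the common pattern of a frozen ResNet-50 backbone with the classification head removed, flattened and projected by a trainable linear layer. A minimal standalone sketch of that flow (the batch size, image size, and 512-dimensional projection are illustrative, not values taken from this diff):

```python
# Standalone sketch of the encoder pattern above: frozen ResNet-50 backbone,
# classification head removed, features flattened and projected by a trainable
# linear layer. Sizes here (batch of 4, 512-d projection) are illustrative.
import torch
import torch.nn as nn
import torchvision.models as models

backbone = models.resnet50(pretrained=True)
for p in backbone.parameters():
    p.requires_grad_(False)                                   # freeze the CNN
backbone = nn.Sequential(*list(backbone.children())[:-1])     # drop the final fc layer

projection = nn.Linear(2048, 512)                             # resnet50 fc.in_features == 2048

images = torch.randn(4, 3, 224, 224)                          # (batch, channels, height, width)
with torch.no_grad():
    features = backbone(images)                               # (4, 2048, 1, 1)
features = features.reshape(features.size(0), -1)             # (4, 2048)
embeddings = projection(features)                             # (4, 512)
print(embeddings.shape)                                       # torch.Size([4, 512])
```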
+ features = self.linear(features) + return features + +class DecoderRNN(nn.Module): + def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): + """[summary] + + Args: + hidden_size ([type]): [description] + padding_index ([type]): [description] + vocab_size ([type]): [description] + embedding_size ([type]): [description] + """ + super(DecoderRNN, self).__init__() + + # Embedding layer that turns words into a vector of a specified size + self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) + + # The linear layer that maps the hidden state output dimension + # to the number of words we want as output, vocab_size + self.linear_1 = nn.Linear(hidden_size, vocab_size) + + + def forward(self, features, captions): + """[summary] + + Args: + features (torch.tensor(batch_size, hidden_size)): [description] + captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] + + Returns: + [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] + """ + + # Initialize the hidden state + batch_size = features.shape[0] # features is of shape (batch_size, embed_size) + + # Create embedded word vector for each word in the captions + inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) + + # Feed LSTMCell with image features and retrieve the state + + _h, _c = self.lstm_unit(features) # _h : (Batch size, Hidden size) + + # Deterministict Output as first word of the caption :) + start = torch.zeros(self.word_embeddings.num_embeddings) + start[1] = 1 + outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch + + + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
+ # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) + _outputs = self.linear_1(_h) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) + + return outputs # (Batch Size, N, |Vocabulary|) + + def generate_caption(self, features, max_caption_length): + """Generate captions for given image features using greedy search.""" + + sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded + input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) + with torch.no_grad(): + _h ,_c = self.lstm_unit(features.unsqueeze(0)) + for _ in range(max_caption_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) + sampled_ids.append(predicted) + input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) + input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) + return sampled_ids + + +class CaRNet1(nn.Module): + + def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): + """[summary] + + Args: + hidden_size ([type]): [description] + padding_index ([type]): [description] + vocab_size ([type]): [description] + embedding_size ([type]): [description] + """ + super(CaRNet1, self).__init__() + self.padding_index = padding_index + self.device = torch.device(device) + self.C = EncoderCNN(embedding_size) + self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) + + self.C.to(self.device) + self.R.to(self.device) + + def save(self, file_name): + """Save the classifier.""" + torch.save(self.C.state_dict(), f".saved/v1/{file_name}_C.pth") + torch.save(self.R.state_dict(), f".saved/v1/{file_name}_R.pth") + + def load(self, file_name): + """Load the classifier.""" + + # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) + self.C.load_state_dict(torch.load(f".saved/v1/{file_name}_C.pth", map_location=self.device)) + self.R.load_state_dict(torch.load(f".saved/v1/{file_name}_R.pth", map_location=self.device)) + + def forward(self,images,captions): + features = self.C(images) + return self.R(features, captions) + + def __accuracy(self, outputs, labels): + """[summary] + + Args: + outputs ([type]): [description] + labels ([type]): [description] + + Returns: + [type]: [description] + """ + # Assume outputs and labels have same shape and already padded + # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! + # With this technique we evaluate all major case: + # 1) Output caption is longer than expected : Output.ID - .ID != 0 + # 2) Output is less longer than expect : .ID - Target.ID != 0 + # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, + # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token + # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) + # computing the accuracy + + right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) + acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL + return acc + + # TO DO: Devo usare la confusion matrix????????? + + def train(self, train_set, validation_set, lr, epochs, vocabulary): + + criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ + else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") + + # initializing some elements + best_val_acc = -1. # the best accuracy computed on the validation data + best_epoch = -1 # the epoch in which the best accuracy above was computed + + + + # ensuring the classifier is in 'train' mode (pytorch) + self.C.train() + self.R.train() + + # creating the optimizer + optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) + + # loop on epochs! + for e in range(0, epochs): + + # epoch-level stats (computed by accumulating mini-batch stats) + epoch_train_acc = 0. + epoch_train_loss = 0. + epoch_num_train_examples = 0 + + for images,captions_training_ids,captions_target_ids in train_set: + optimizer.zero_grad() + + batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') + epoch_num_train_examples += batch_num_train_examples + + + images = images.to(self.device) + captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) + captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token + + # computing the network output on the current mini-batch + features = self.C(images) + outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); + + # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) + loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) + + # computing gradients and updating the network weights + loss.backward() # computing gradients + optimizer.step() # updating weights + + # with torch.no_grad(): + # self.C.eval() + # self.R.eval() + # features = self.C(images) + # import random + # numb = random.randint(0,2) + # caption = self.R.generate_caption(features[numb],30) + # print(vocabulary.rev_translate(captions_target_ids[numb])) + # print(vocabulary.rev_translate(caption[0])) + # self.C.train() + # self.R.train() + + with torch.no_grad(): + self.C.eval() + self.R.eval() + + # Compute captions as ids for all the training images + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
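To make the accuracy idea in the comments above concrete (element-wise comparison of padded id tensors, skipping the deterministic first token), here is a tiny worked example with made-up ids:

```python
# Tiny worked example of the exact-match accuracy described above: compare the
# padded id tensors element-wise, skipping position 0 (the deterministic <SOS>).
# The ids are made up: <PAD>=0, <SOS>=1, <EOS>=2.
import torch

predicted = torch.tensor([[1, 7, 9, 2, 0],
                          [1, 7, 5, 4, 2]])
target    = torch.tensor([[1, 7, 9, 2, 0],
                          [1, 7, 6, 4, 2]])

matches = torch.eq(predicted[:, 1:], target[:, 1:])           # (batch, length-1) booleans
per_caption = matches.float().sum(dim=1) / matches.shape[1]   # fraction of matching tokens
accuracy = per_caption.mean().item()
print(accuracy)  # 0.875 -> first caption fully matched, second has 3 of 4 tokens right
```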
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + self.C.train() + self.R.train() + + # printing (mini-batch related) stats on screen + print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) + + val_acc = self.eval_classifier(validation_set) + + # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("CaRNetv1") + + epoch_train_loss /= epoch_num_train_examples + + # printing (epoch related) stats on screen + print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" + + (", BEST!" if best_epoch == e + 1 else "")) + .format(e + 1, epochs, epoch_train_loss, + epoch_train_acc / epoch_num_train_examples, val_acc)) + + def eval_classifier(self, data_set): + """Evaluate the classifier on the given data set.""" + + # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) + training_mode_originally_on = self.C.training and self.R.training + if training_mode_originally_on: + self.C.eval() + self.R.eval() # enforcing evaluation mode + + + + with torch.no_grad(): # keeping off the autograd engine + + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,_,captions_validation_target_ids in data_set: + images = images.to(self.device) + + captions_validation_target_ids = captions_validation_target_ids.to(self.device) + + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
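For context, generate_caption (used in the loop above and during training) is plain greedy decoding with an LSTMCell: the image projection primes the recurrent state, then the argmax word is fed back step by step until an end token or a length cap. A compact standalone sketch of that loop, where the sizes and the <SOS>=1 / <EOS>=2 ids are illustrative assumptions:

```python
# Compact sketch of greedy decoding with an LSTMCell, as in generate_caption above:
# the image projection primes the state, then the argmax word is fed back each step.
# Vocabulary size, dimensions and the <SOS>=1 / <EOS>=2 ids are illustrative.
import torch
import torch.nn as nn

vocab_size, embed_size, hidden_size = 20, 8, 16
embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
cell = nn.LSTMCell(embed_size, hidden_size)
to_vocab = nn.Linear(hidden_size, vocab_size)

image_feature = torch.randn(1, embed_size)        # encoder projection for one image
caption = [1]                                     # start from <SOS>
with torch.no_grad():
    h, c = cell(image_feature)                    # prime the recurrent state with the image
    token = torch.tensor([1])
    for _ in range(10):                           # cap the generated length
        h, c = cell(embed(token), (h, c))
        token = to_vocab(h).argmax(dim=1)         # greedy choice of the next word id
        caption.append(token.item())
        if token.item() == 2:                     # stop once <EOS> is produced
            break
print(caption)
```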
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) + + if training_mode_originally_on: + self.C.train() # restoring the training state, if needed + self.R.train() + return acc +# Example of usage +if __name__ == "__main__": + from Vocabulary import Vocabulary + from Dataset import MyDataset + from torch.utils.data import DataLoader + ds = MyDataset("./dataset/flickr30k_images/", percentage=8) + v = Vocabulary(ds,reload=True) + dc = ds.get_fraction_of_dataset(percentage=70) + df = ds.get_fraction_of_dataset(percentage=30) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=100, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=50, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") + net.load("CaRNetv1") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/v1 copy/Vocabulary.py b/v1 copy/Vocabulary.py new file mode 100644 index 0000000..725945a --- /dev/null +++ b/v1 copy/Vocabulary.py @@ -0,0 +1,131 @@ +import os +import torch +import warnings +from Dataset import MyDataset +from typing import List + +class Vocabulary(): + # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d + # each word is represented by a record in a dataframe with this structure + + + def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): + + self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True + self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True + # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists + if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: + self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") + self.word2id = torch.load(".saved/rich_word2id_v1.pt") + self.enriched = True + return + + # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + self.embeddings = torch.eye(self.dictionary_length) + + def predefined_token_idx(self) -> dict: + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. 
+ + Args: + word_sequence (str): [description] + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (str): [description] + """ + # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total number of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + v = Vocabulary(verbose=True) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + # Enrich the vocabulary + v.make_enrich = True + dataset = ["I","Like","PLay","PIPPOplutopaperino"] + v.enrich(dataset) + v.make_enrich = False + # Enrich the vocabulary with a bulk insert + v.make_enrich = True + dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] + v.bulk_enrich(dataset) + v.make_enrich = False + + + + + + + + + + + + + + \ No newline at end of file diff --git a/light_version/__init__.py b/v1 copy/__init__.py similarity index 100% rename from light_version/__init__.py rename to v1 copy/__init__.py diff --git a/v1/Dataset.py b/v1/Dataset.py new file mode 100644 index 0000000..c2a0ed8 --- /dev/null +++ b/v1/Dataset.py @@ -0,0 +1,135 @@ +import os +import pandas as pd +import torch +import numpy as np +from enum import Enum +from torch.utils.data import Dataset, DataLoader +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms + +class MyDataset(Dataset): + + training_image_trasformation_parameter = { + "crop":{ + "size": 224, + "scale": (0.08,1.0), + "ratio": (3. / 4., 4. 
/ 3.), + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + evaluation_image_trasformation_parameter = { + "crop":{ + "size": 256, + "center": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files + + Args: + directory_of_data (str): [description] + """ + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + self.directory_of_data = directory_of_data + + def get_fraction_of_dataset(self, percentage: int): + _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + _temp_df_copy = _temp_df_moved.copy() + return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self): + words = [] + for idx,row in self._dataset.iterrows(): + for word in re.findall("[\\w]+|\.|\,", row["comment"].lower()): + if word not in words: + words.append(word) + return words + + def __len__(self): + return self._dataset.shape[0] + + def __getitem__(self, idx): + + image, caption = Image.open(f"{self.directory_of_data}/flickr30k_images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ + re.findall("[\\w]+|\.|\,", self._dataset.iloc[idx]["comment"].lower()) + + return image, caption + + def pack_minibatch_training(self, data, vocabulary): + + # Sort a data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + operations = transforms.Compose([ + transforms.RandomResizedCrop(MyDataset.training_image_trasformation_parameter["crop"]["size"], scale=MyDataset.training_image_trasformation_parameter["crop"]["scale"], ratio=MyDataset.training_image_trasformation_parameter["crop"]["ratio"]), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.training_image_trasformation_parameter["mean"], std=MyDataset.training_image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) + + # Merge images (from tuple of 3D tensor to 4D tensor). 
+        images = torch.stack(images, 0) # (Batch Size, Color, Height, Width)
+
+        captions_length = [len(caption) for caption in captions]  # (Batch Size,)
+
+        captions_training_ids = [vocabulary.translate(caption,"uncomplete") for caption in captions]  # (Batch Size, Caption)
+
+        captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        captions_training_ids = nn.utils.rnn.pad_sequence(captions_training_ids, padding_value=0, batch_first=True)
+
+        captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True)
+
+        return images,captions_training_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor)
+
+    def pack_minibatch_evaluation(self, data, vocabulary):
+
+        # Sort a data list by caption length (descending order).
+        data.sort(key=lambda x: len(x[1]), reverse=True)
+
+        images, captions = zip(*data)
+
+        operations = transforms.Compose([
+            transforms.Resize(MyDataset.evaluation_image_trasformation_parameter["crop"]["size"]),
+            transforms.CenterCrop(MyDataset.evaluation_image_trasformation_parameter["crop"]["center"]), # Crops the given image at the center.
+            transforms.ToTensor(),
+            transforms.Normalize(mean=MyDataset.evaluation_image_trasformation_parameter["mean"], std=MyDataset.evaluation_image_trasformation_parameter["std_dev"])
+        ])
+
+        images = list(map(lambda image: operations(image),list(images)))
+
+        # Merge images (from tuple of 3D tensor to 4D tensor).
+        images = torch.stack(images, 0) # (Batch Size, Color, Height, Width)
+
+        captions_evaluation_ids = [vocabulary.translate(caption,"uncomplete") for caption in captions]  # (Batch Size, Caption)
+
+        captions_target_ids = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        captions_evaluation_ids = nn.utils.rnn.pad_sequence(captions_evaluation_ids, padding_value=0, batch_first=True)
+
+        captions_target_ids = nn.utils.rnn.pad_sequence(captions_target_ids, padding_value=0, batch_first=True)
+
+        return images,captions_evaluation_ids.type(torch.LongTensor),captions_target_ids.type(torch.LongTensor)
+
\ No newline at end of file
diff --git a/v1/NeuralNet.py b/v1/NeuralNet.py
new file mode 100644
index 0000000..9ac00c9
--- /dev/null
+++ b/v1/NeuralNet.py
@@ -0,0 +1,345 @@
+#####################################################
+## DISCLAIMER: THE CODE IS ESSENTIALLY HARDCODED AND FOR TESTING ONLY; IT DOES NOT FOLLOW PROPER SGD PRACTICE, I AM ONLY TRYING TO SEE WHETHER IT WORKS!
+# PLEASE DO NOT JUDGE IT, I KNOW IT SHOULD NOT BE DONE THIS WAY; I WILL FIX IT LATER :)
+##
+##
+## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+
+
+import torch
+import torch.nn as nn
+import torchvision.models as models
+from torch.nn.utils.rnn import pack_padded_sequence
+import torch.nn.functional as F
+
+device="cpu"
+class EncoderCNN(nn.Module):
+    def __init__(self, projection_size):
+        super(EncoderCNN, self).__init__()
+        resnet = models.resnet50(pretrained=True)
+        for param in resnet.parameters():
+            param.requires_grad_(False)
+
+        modules = list(resnet.children())[:-1] # remove last fc layer
+        self.resnet = nn.Sequential(*modules)
+        self.linear = nn.Linear(resnet.fc.in_features, projection_size)
+
+    def forward(self, images):
+        features = self.resnet(images)
+        features = features.reshape(features.size(0), -1)  # (Batch Size, Embedding Dim.)
+        features = self.linear(features)
+        return features
+
+class DecoderRNN(nn.Module):
+    def __init__(self, hidden_size, padding_index, vocab_size, embedding_size):
+        """Set up the decoder layers.
+
+        Args:
+            hidden_size (int): size of the LSTM hidden state
+            padding_index (int): id of the padding token in the vocabulary
+            vocab_size (int): number of words in the vocabulary
+            embedding_size (int): size of the word embeddings
+        """
+        super(DecoderRNN, self).__init__()
+
+        # Embedding layer that turns words into a vector of a specified size
+        self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index)
+
+        # The LSTM cell takes embedded word vectors (of a specified size) as input
+        # and outputs hidden states of size hidden_size
+        self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size)
+
+        # The linear layer that maps the hidden state output dimension
+        # to the number of words we want as output, vocab_size
+        self.linear_1 = nn.Linear(hidden_size, vocab_size)
+
+
+    def forward(self, features, captions):
+        """Compute the logits over the vocabulary for each time step (teacher forcing).
+
+        Args:
+            features (torch.tensor(batch_size, embedding_size)): image features projected by the encoder
+            captions (torch.tensor(batch_size, max_captions_length)): caption ids, without the end token
+
+        Returns:
+            torch.tensor(batch_size, max_captions_length + 1, vocab_size): logits at each time step (the first step is deterministic)
+        """
+
+        # Initialize the hidden state
+        batch_size = features.shape[0]  # features is of shape (batch_size, embed_size)
+
+        # Create an embedded word vector for each word in the captions
+        inputs = self.word_embeddings(captions)  # In: (batch_size, captions_length) -> Out: (batch_size, captions_length, embed_size)
+
+        # Feed the LSTMCell with the image features and retrieve the state
+
+        _h, _c = self.lstm_unit(features)  # _h : (Batch size, Hidden size)
+
+        # Deterministic output as the first word of the caption :)
+        start = torch.zeros(self.word_embeddings.num_embeddings)
+        start[1] = 1
+        outputs = start.repeat(batch_size,1,1).to(torch.device(device))  # replicate the one-hot start-token output for every element of the batch
+
+
+        # How does the loop work?
+        # For each time step t \in {0, N-1}, where N is the caption length.
+
+        # Since the sequences are padded, how is the forward pass performed, given that the <PAD> tokens do not need to be fed as input?
+        # The assumption is that the captions are of length N-1, i.e. the captions provided externally as input do not contain the <EOS> token.
+
+        for idx in range(0,inputs.shape[1]):
+            _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c))
+            _outputs = self.linear_1(_h)
+            outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1)
+
+        return outputs  # (Batch Size, N, |Vocabulary|)
+
+    def generate_caption(self, features, max_caption_length):
+        """Generate captions for given image features using greedy search."""
+
+        sampled_ids = [torch.tensor([1]).to(torch.device(device))]  # Hardcoded: 1 is the id of the start token
+        input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1))
+        with torch.no_grad():
+            _h ,_c = self.lstm_unit(features.unsqueeze(0))
+            for _ in range(max_caption_length-1):
+                _h, _c = self.lstm_unit(input, (_h ,_c))  # _h: (1, hidden_size)
+                outputs = self.linear_1(_h)  # outputs: (1, vocab_size)
+                _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1)  # predicted: (1,)
+                sampled_ids.append(predicted)
+                input = self.word_embeddings(predicted)  # input: (1, embed_size)
+                input = input.to(torch.device(device))
+                if predicted == 2:  # end token generated, stop
+                    break
+        sampled_ids = torch.stack(sampled_ids, 1)  # sampled_ids: (1, caption_length)
+        return sampled_ids
+
+
+class CaRNet1(nn.Module):
+
+    def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"):
+        """Build the encoder-decoder captioning network.
+
+        Args:
+            hidden_size (int): size of the LSTM hidden state
+            padding_index (int): id of the padding token in the vocabulary
+            vocab_size (int): number of words in the vocabulary
+            embedding_size (int): size of the word embeddings (and of the image projection)
+            device (str): device on which the network is placed
+        """
+        super(CaRNet1, self).__init__()
+        self.padding_index = padding_index
+        self.device = torch.device(device)
+        self.C = EncoderCNN(embedding_size)
+        self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size)
+
+        self.C.to(self.device)
+        self.R.to(self.device)
+
+    def save(self, file_name):
+        """Save the classifier."""
+        torch.save(self.C.state_dict(), f".saved/v1/{file_name}_C.pth")
+        torch.save(self.R.state_dict(), f".saved/v1/{file_name}_R.pth")
+
+    def load(self, file_name):
+        """Load the classifier."""
+
+        # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device)
+        self.C.load_state_dict(torch.load(f".saved/v1/{file_name}_C.pth", map_location=self.device))
+        self.R.load_state_dict(torch.load(f".saved/v1/{file_name}_R.pth", map_location=self.device))
+
+    def forward(self,images,captions):
+        features = self.C(images)
+        return self.R(features, captions)
+
+    def __accuracy(self, outputs, labels):
+        """Compute the average caption accuracy over a mini-batch.
+
+        Args:
+            outputs (torch.tensor(batch_size, max_captions_length)): generated caption ids, padded
+            labels (torch.tensor(batch_size, max_captions_length)): target caption ids, padded
+
+        Returns:
+            float: mean fraction of matching ids per caption
+        """
+        # Assume outputs and labels have the same shape and are already padded.
+        # We compare the output ids with the target ids element-wise: every position where they differ is a mismatch!
+        # This technique covers all the major cases:
+        # 1) The output caption is longer than expected:  Output.ID != <PAD>.ID
+        # 2) The output caption is shorter than expected: <PAD>.ID  != Target.ID
+        # 3) Same length but a different word:            Output.ID != Target.ID
+        # Hp. 1 : Output.ID == Target.ID == <PAD> has to be considered a good match, because it means that both output and target end before this position.
+        # Hp. 2 : The first word is dropped from both outputs and targets, because it is produced in a deterministic fashion :)
+        # computing the accuracy
+
+        right_predictions = torch.eq(outputs[:,1:], labels[:,1:])
+        acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item()  # Accuracy = TP+TN / ALL
+        return acc
+
+    # TO DO: Should I use a confusion matrix?
+
+    def train(self, train_set, validation_set, lr, epochs, vocabulary):
+
+        criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \
+                        else nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum")
+
+        # initializing some elements
+        best_val_acc = -1.  # the best accuracy computed on the validation data
+        best_epoch = -1  # the epoch in which the best accuracy above was computed
+
+        # ensuring the classifier is in 'train' mode (pytorch)
+        self.C.train()
+        self.R.train()
+
+        # creating the optimizer
+        optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr)
+
+        # loop on epochs!
+        for e in range(0, epochs):
+
+            # epoch-level stats (computed by accumulating mini-batch stats)
+            epoch_train_acc = 0.
+            epoch_train_loss = 0.
+            epoch_num_train_examples = 0
+
+            for images,captions_training_ids,captions_target_ids in train_set:
+                optimizer.zero_grad()
+
+                batch_num_train_examples = images.shape[0]  # mini-batch size (it might be different from 'batch_size')
+                epoch_num_train_examples += batch_num_train_examples
+
+                images = images.to(self.device)
+                captions_training_ids = captions_training_ids.to(self.device)  # (B, L): start token + caption, without end token
+                captions_target_ids = captions_target_ids.to(self.device)      # (B, L+1): start token + caption + end token
+
+                # computing the network output on the current mini-batch
+                features = self.C(images)
+                outputs = self.R(features, captions_training_ids)  # outputs > (B, L+1, |V|)
+
+                # (B, L+1, |V|) -> (B * (L+1), |V|) and captions > (B * (L+1))
+                loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1))
+
+                # computing gradients and updating the network weights
+                loss.backward()  # computing gradients
+                optimizer.step()  # updating weights
+
+                # with torch.no_grad():
+                #     self.C.eval()
+                #     self.R.eval()
+                #     features = self.C(images)
+                #     import random
+                #     numb = random.randint(0,2)
+                #     caption = self.R.generate_caption(features[numb],30)
+                #     print(vocabulary.rev_translate(captions_target_ids[numb]))
+                #     print(vocabulary.rev_translate(caption[0]))
+                #     self.C.train()
+                #     self.R.train()
+
+                with torch.no_grad():
+                    self.C.eval()
+                    self.R.eval()
+
+                    # Compute captions as ids for all the training images
+                    projections = self.C(images)
+
+                    captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device))
+
+                    for idx,projection in enumerate(range(projections.shape[0])):
+                        _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1])
+                        captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad
+                        # Fill the remaining portion of the generated caption with zeros, if needed.
+                        # Accuracy is not altered: if the generated caption is shorter than the (padded) captions_target_ids, filling it with the padding id is valid.
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + self.C.train() + self.R.train() + + # printing (mini-batch related) stats on screen + print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) + + val_acc = self.eval_classifier(validation_set) + + # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("CaRNetv1") + + epoch_train_loss /= epoch_num_train_examples + + # printing (epoch related) stats on screen + print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" + + (", BEST!" if best_epoch == e + 1 else "")) + .format(e + 1, epochs, epoch_train_loss, + epoch_train_acc / epoch_num_train_examples, val_acc)) + + def eval_classifier(self, data_set): + """Evaluate the classifier on the given data set.""" + + # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) + training_mode_originally_on = self.C.training and self.R.training + if training_mode_originally_on: + self.C.eval() + self.R.eval() # enforcing evaluation mode + + + + with torch.no_grad(): # keeping off the autograd engine + + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,_,captions_validation_target_ids in data_set: + images = images.to(self.device) + + captions_validation_target_ids = captions_validation_target_ids.to(self.device) + + projections = self.C(images) + + captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) + + for idx,projection in enumerate(range(projections.shape[0])): + _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + # Fill the remaining portion of caption eventually with zeros + # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
+ + captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) + + if training_mode_originally_on: + self.C.train() # restoring the training state, if needed + self.R.train() + return acc +# Example of usage +if __name__ == "__main__": + from Vocabulary import Vocabulary + from Dataset import MyDataset + from torch.utils.data import DataLoader + ds = MyDataset("./dataset/flickr30k_images/", percentage=8) + v = Vocabulary(ds,reload=True) + dc = ds.get_fraction_of_dataset(percentage=70) + df = ds.get_fraction_of_dataset(percentage=30) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=100, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=50, + shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") + net.load("CaRNetv1") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/v1/Vocabulary.py b/v1/Vocabulary.py new file mode 100644 index 0000000..725945a --- /dev/null +++ b/v1/Vocabulary.py @@ -0,0 +1,131 @@ +import os +import torch +import warnings +from Dataset import MyDataset +from typing import List + +class Vocabulary(): + # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d + # each word is represented by a record in a dataframe with this structure + + + def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): + + self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True + self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True + # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists + if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: + self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") + self.word2id = torch.load(".saved/rich_word2id_v1.pt") + self.enriched = True + return + + # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + self.embeddings = torch.eye(self.dictionary_length) + + def predefined_token_idx(self) -> dict: + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. 
+ + Args: + word_sequence (str): [description] + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (str): [description] + """ + # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total number of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + v = Vocabulary(verbose=True) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + # Enrich the vocabulary + v.make_enrich = True + dataset = ["I","Like","PLay","PIPPOplutopaperino"] + v.enrich(dataset) + v.make_enrich = False + # Enrich the vocabulary with a bulk insert + v.make_enrich = True + dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] + v.bulk_enrich(dataset) + v.make_enrich = False + + + + + + + + + + + + + + \ No newline at end of file diff --git a/v1/__init__.py b/v1/__init__.py new file mode 100644 index 0000000..e69de29
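
Editor's note (not part of the patch above): the comments in CaRNet1.__accuracy describe accuracy as an element-wise comparison of padded caption-id tensors, with the first column dropped because it is produced deterministically. The following is a minimal, standalone Python sketch of that check, useful for verifying the described behaviour; the function name and the toy id tensors are made up here, and the ids only follow the convention used in the diff (0 = padding, 1 = start, 2 = end).

import torch

def caption_accuracy(outputs: torch.Tensor, targets: torch.Tensor) -> float:
    """Mean per-caption fraction of matching ids, ignoring the first (deterministic) column."""
    # Element-wise comparison of padded id tensors; positions where one caption is
    # shorter than the other are compared against the padding id (0), so length
    # mismatches are automatically counted as errors.
    right_predictions = torch.eq(outputs[:, 1:], targets[:, 1:])
    return torch.mean(right_predictions.to(torch.float32).sum(dim=1) / right_predictions.shape[1]).item()

if __name__ == "__main__":
    # Two generated captions vs. their padded targets (batch of 2, max length 6).
    outputs = torch.tensor([[1, 5, 7, 2, 0, 0],
                            [1, 4, 4, 9, 2, 0]])
    targets = torch.tensor([[1, 5, 8, 2, 0, 0],
                            [1, 4, 4, 2, 0, 0]])
    print(caption_accuracy(outputs, targets))  # (4/5 + 3/5) / 2 = 0.7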