diff --git a/.vscode/launch.json b/.vscode/launch.json index 4107551..8670901 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,15 +1,6 @@ { - // Usare IntelliSense per informazioni sui possibili attributi. - // Al passaggio del mouse vengono visualizzate le descrizioni degli attributi esistenti. - // Per altre informazioni, visitare: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", "configurations": [ - { - "name": "Python: File corrente", - "type": "python", - "request": "launch", - "program": "/home/christian/Documenti/GitHub/Image-Captioning/v1/NeuralNet.py", - "console": "integratedTerminal" - } + {"name":"Main","type":"python","request":"launch","program":"${workspaceFolder}/main.py","console":"integratedTerminal", + "args": ["RNetvI", "train", "1024", "1024", "--attention", "True", "--attention_dim", "1024", "--dataset_folder", "./dataset/flickr30k_images", "--device", "cuda:0", "--splits", "1", "1", "1", "--epochs", "2"]} ] } \ No newline at end of file diff --git a/CaRNetvHC.py b/CaRNetvHC.py deleted file mode 100644 index 3221730..0000000 --- a/CaRNetvHC.py +++ /dev/null @@ -1,347 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -device="cpu" -class EncoderCNN(nn.Module): - def __init__(self, projection_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, projection_size) - - def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.) 
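        # Shape walk-through at this point (illustrative note, assuming the default
        # torchvision ResNet-50; not part of the original file): self.resnet(images)
        # ends with the average-pool layer and returns (Batch Size, 2048, 1, 1);
        # the reshape above flattens it to (Batch Size, 2048), and the self.linear
        # call on the next line maps resnet.fc.in_features = 2048 down to
        # (Batch Size, projection_size).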
- features = self.linear(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(DecoderRNN, self).__init__() - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - self.hidden_size = hidden_size - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features, captions): - """[summary] - - Args: - features (torch.tensor(batch_size, hidden_size)): [description] - captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] - - Returns: - [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] - """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Feed LSTMCell with image features and retrieve the state - - _h, _c = tuple( features, features) # _h : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption :) - start = torch.zeros(self.word_embeddings.num_embeddings) - start[1] = 1 - outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch - - - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) - - return outputs # (Batch Size, N, |Vocabulary|) - - def generate_caption(self, features, max_caption_length): - """Generate captions for given image features using greedy search.""" - - sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) - with torch.no_grad(): - _h, _c = tuple( features.unsqueeze(0), features.unsqueeze(0)) - for _ in range(max_caption_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - - -class CaRNet1(nn.Module): - - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(CaRNet1, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - self.C = EncoderCNN(embedding_size) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_name): - """Save the classifier.""" - torch.save(self.C.state_dict(), f".saved/vHC/{file_name}_C.pth") - torch.save(self.R.state_dict(), f".saved/vHC/{file_name}_R.pth") - - def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f".saved/vHC/{file_name}_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f".saved/vHC/{file_name}_R.pth", map_location=self.device)) - - def forward(self,images,captions): - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs, labels): - """[summary] - - Args: - outputs ([type]): [description] - labels ([type]): [description] - - Returns: - [type]: [description] - """ - # Assume outputs and labels have same shape and already padded - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - # With this technique we evaluate all major case: - # 1) Output caption is longer than expected : Output.ID - .ID != 0 - # 2) Output is less longer than expect : .ID - Target.ID != 0 - # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, - # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token - # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) - # computing the accuracy - - right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) - acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set, validation_set, lr, epochs, vocabulary): - - criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. - epoch_num_train_examples = 0 - - for images,captions_training_ids,captions_target_ids in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) - captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token - - # computing the network output on the current mini-batch - features = self.C(images) - outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); - - # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) - loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - # with torch.no_grad(): - # self.C.eval() - # self.R.eval() - # features = self.C(images) - # import random - # numb = random.randint(0,2) - # caption = self.R.generate_caption(features[numb],30) - # print(vocabulary.rev_translate(captions_target_ids[numb])) - # print(vocabulary.rev_translate(caption[0])) - # self.C.train() - # self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
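        # Worked example of the comparison used here (illustrative ids, not taken
        # from the dataset, assuming <start>=1, <end>=2, <pad>=0 as used elsewhere
        # in this file): target row [1, 7, 9, 2, 0, 0] vs. a shorter generated row
        # written into the zero tensor as [1, 7, 4, 2, 0, 0]; __accuracy drops
        # position 0 (the deterministic <start>) and torch.eq on the remaining five
        # ids matches 7, 2, 0, 0, giving 4/5 = 0.8 for this row.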
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - self.save("CaRNetvHC") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,_,captions_validation_target_ids in data_set: - images = images.to(self.device) - - captions_validation_target_ids = captions_validation_target_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/flickr30k_images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70) - df = ds.get_fraction_of_dataset(percentage=30) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=100, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=50, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") - net.load("CaRNetvHC") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/Images Documentation/ResNet-50.png b/Images Documentation/ResNet-50.png new file mode 100644 index 0000000..a7ba321 Binary files /dev/null and b/Images Documentation/ResNet-50.png differ diff --git a/LICENSE b/LICENSE index 3ad989d..6fe794f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 christianSistemiPos +Copyright (c) 2022 christiandimaio Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/NeuralModel/CaRNetvH.py b/NeuralModel/CaRNetvH.py deleted file mode 100644 index 906ba13..0000000 --- a/NeuralModel/CaRNetvH.py +++ /dev/null @@ -1,347 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F - -device="cpu" -class EncoderCNN(nn.Module): - def __init__(self, projection_size): - super(EncoderCNN, self).__init__() - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - self.linear = nn.Linear(resnet.fc.in_features, projection_size) - - def forward(self, images): - features = self.resnet(images) - features = features.reshape(features.size(0), -1) # (Batch Size, Embedding Dim.) 
- features = self.linear(features) - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(DecoderRNN, self).__init__() - - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - self.hidden_size = hidden_size - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features, captions): - """[summary] - - Args: - features (torch.tensor(batch_size, hidden_size)): [description] - captions (torch.tensor(batch_size, max_captions_length, word_embedding)): [description] - - Returns: - [torch.tensor(batch_size, max_captions_length, vocab_size)]: [description] - """ - - # Initialize the hidden state - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Feed LSTMCell with image features and retrieve the state - - _h, _c = tuple( features, torch.zeros((captions.shape[0],self.hidden_size))) # _h : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption :) - start = torch.zeros(self.word_embeddings.num_embeddings) - start[1] = 1 - outputs = start.repeat(batch_size,1,1).to(torch.device(device)) # Bulk insert of embeddings to all the elements of the batch - - - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the captions are of lenght N-1, so the captions provide by external as input are without token - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) - - return outputs # (Batch Size, N, |Vocabulary|) - - def generate_caption(self, features, max_caption_length): - """Generate captions for given image features using greedy search.""" - - sampled_ids = [torch.tensor([1]).to(torch.device(device))] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(device))).reshape((1,-1)) - with torch.no_grad(): - _h, _c = tuple( features.unsqueeze(0), torch.zeros((1,self.hidden_size))) - for _ in range(max_caption_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if device == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: (batch_size) - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (batch_size, max_seq_length) - return sampled_ids - - -class CaRNet1(nn.Module): - - def __init__(self, hidden_size, padding_index, vocab_size, embedding_size, device = "cpu"): - """[summary] - - Args: - hidden_size ([type]): [description] - padding_index ([type]): [description] - vocab_size ([type]): [description] - embedding_size ([type]): [description] - """ - super(CaRNet1, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - self.C = EncoderCNN(embedding_size) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_name): - """Save the classifier.""" - torch.save(self.C.state_dict(), f".saved/vH/{file_name}_C.pth") - torch.save(self.R.state_dict(), f".saved/vH/{file_name}_R.pth") - - def load(self, file_name): - """Load the classifier.""" - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f".saved/vH/{file_name}_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f".saved/vH/{file_name}_R.pth", map_location=self.device)) - - def forward(self,images,captions): - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs, labels): - """[summary] - - Args: - outputs ([type]): [description] - labels ([type]): [description] - - Returns: - [type]: [description] - """ - # Assume outputs and labels have same shape and already padded - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - # With this technique we evaluate all major case: - # 1) Output caption is longer than expected : Output.ID - .ID != 0 - # 2) Output is less longer than expect : .ID - Target.ID != 0 - # 3) Output has equal dimension but different label : Output.ID - Target.ID != 0, - # Hp. 1 : Output.ID - Target.ID = 0 need to be considered as good match because it means that both output and target end before this token - # Hp. 
2 : Both Outputs and Target need to be dropped on the first word because is evaluated in a deterministic fashion :) - # computing the accuracy - - right_predictions = torch.eq(outputs[:,1:], labels[:,1:]) - acc = torch.mean(right_predictions.to(torch.float32).sum(axis=1) / right_predictions.shape[1] ).item() # Accuracy = TP+TN / ALL - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set, validation_set, lr, epochs, vocabulary): - - criterion = nn.CrossEntropyLoss(ignore_index=self.padding_index,reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=0,reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. - epoch_num_train_examples = 0 - - for images,captions_training_ids,captions_target_ids in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_training_ids = captions_training_ids.to(self.device) # captions > (B, L) - captions_target_ids = captions_target_ids.to(self.device) # captions > (B, |L|-1) without end token - - # computing the network output on the current mini-batch - features = self.C(images) - outputs = self.R(features, captions_training_ids) # outputs > (B, L, |V|); - - # (B, L, |V|) -> (B * L, |V|) and captions > (B * L) - loss = criterion(outputs.reshape((-1,outputs.shape[2])), captions_target_ids.reshape(-1)) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - # with torch.no_grad(): - # self.C.eval() - # self.R.eval() - # features = self.C(images) - # import random - # numb = random.randint(0,2) - # caption = self.R.generate_caption(features[numb],30) - # print(vocabulary.rev_translate(captions_target_ids[numb])) - # print(vocabulary.rev_translate(caption[0])) - # self.C.train() - # self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_target_ids) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - self.save("CaRNetvH") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,_,captions_validation_target_ids in data_set: - images = images.to(self.device) - - captions_validation_target_ids = captions_validation_target_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_validation_target_ids.shape[1])).to(torch.device(device)) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_validation_target_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. 
- - captions_output_padded = captions_output.type(torch.int32).to(torch.device(device)) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_validation_target_ids) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/flickr30k_images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70) - df = ds.get_fraction_of_dataset(percentage=30) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=100, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=50, - shuffle=True, num_workers=12, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNet1(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cpu") - net.load("CaRNetvH") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/CaRNetvHC.py b/NeuralModel/CaRNetvHC.py deleted file mode 100644 index 2cf2488..0000000 --- a/NeuralModel/CaRNetvHC.py +++ /dev/null @@ -1,444 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! -# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F -from typing import Tuple,List -from Dataset import MyDataset -from Vocabulary import Vocabulary - -class EncoderCNN(nn.Module): - def __init__(self, projection_size: int, device: str = "cpu"): - """Constructor of the Encoder NN - - Args: - projection_size (int): The dimension of projection into the space of RNN (Could be the input or the hidden state). - - device (str, optional): The device on which the operations will be performed. Default "cpu". - """ - super(EncoderCNN, self).__init__() - - self.device = torch.device(device) - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): # Freezing weights - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - - self.linear = nn.Linear(resnet.fc.in_features, projection_size) # define a last layer - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """Forward operation of the nn - - Args: - images (torch.tensor): The tensor of the image in the form (Batch Size, Channels, Width, Height) - - Returns: - [torch.tensor]: Features Projection in the form (Batch Size, Projection Dim.) 
- """ - # To Do Add dimensionality - features = self.resnet(images) - - features = features.reshape(features.size(0), -1).to(self.device) - features = self.linear(features) - - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Define the constructor for the RNN Net - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the operations will be performed. Default "cpu" - """ - super(DecoderRNN, self).__init__() - - self.device = torch.device(device) - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features: torch.tensor, captions: torch.tensor, captions_length: List[int]) -> Tuple[torch.tensor, List[int]]: - """Compute the forward operation of the RNN. - input of the LSTM cell for each time step: - t_{-1}: feature vector - t_0: Deterministict - . - . - . - t_{N-1}: The embedding vector associated to the S_{N-1} id. - - Args: - features (torch.tensor): The features associated to each element of the batch. (batch_size, embed_size) - - captions (torch.tensor): The caption associated to each element of the batch. (batch_size, max_captions_length, word_embedding) - REMARK Each caption is in the full form: + .... + - - caption_length ([int]): The length of each caption in the batch. - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - - (list(int)): The length of each decoded caption. - REMARK The is provided as input at t_0. - REMARK The token will be removed from the input of the LSTM. - """ - - # Retrieve batch size - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Initialize the hidden state and the cell state at time t_{-1} - _h, _c = (features, features) # _h : (Batch size, Hidden size), _c : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption t_{0} - start = self.word_embeddings(torch.LongTensor([1]).to(self.device)) # Get the embeddings of the token - - # Bulk insert of embeddings to all the elements of the batch - outputs = start.repeat(batch_size,1,1).to(self.device) - - # Feed LSTMCell with image features and retrieve the state - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? 
- # The assumption is that the decode captions will have a length - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch - - return outputs, list(map(lambda length: length-1, captions_length)) - - def generate_caption(self, feature: torch.tensor, captions_length: int) -> torch.tensor: - """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) - - Args: - feature (torch.tensor): The features vector (1, embedding_size) - captions_length (int): The length of the caption - - Returns: - torch.tensor: The caption associated to the image given. - It includes at t_0 by default. - """ - - sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) - with torch.no_grad(): - _h ,_c = (feature.unsqueeze(0),feature.unsqueeze(0)) - for _ in range(captions_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(self.device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) - return sampled_ids - - -class CaRNetvHC(nn.Module): - - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Create the CaRNet - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the net does the computation. Defaults to "cpu". - """ - - super(CaRNetvHC, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - - # Define Encoder and Decoder - self.C = EncoderCNN(hidden_size, device) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size, device) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_path: str) -> bool: - """Save the net in non-volatile memory - - Args: - file_name (str): Relative path to save the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net saved correctly. False otherwise. - """ - try: - torch.save(self.C.state_dict(), f"{file_path}/CaRNetvHC_C.pth") - torch.save(self.R.state_dict(), f"{file_path}/CaRNetvHC_R.pth") - except Exception as ex: - print(ex) - return False - return True - - def load(self, file_path: str) -> bool: - """Load the net from non-volatile memory into RAM - - Args: - file_name (str): Relative path of the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net loaded correctly. False otherwise. 
- """ - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f"{file_path}/CaRNetvHC_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f"{file_path}/CaRNetvHC_R.pth", map_location=self.device)) - - def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: - """Provide images to the net for retrieve captions - - Args: - images (torch.tensor): The images of the batch. (Batch Size, Channels, Width, Height) - captions (torch.tensor): (Batch Size, Max_Captions_Length). - ASSUMPION: The captions are padded with Token - - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - """ - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: - """Evaluate the accuracy of the Net. - Assumption: outputs and labels have same shape and already padded. - - Args: - outputs (torch.tensor): [description] - labels (torch.tensor): [description] - captions_length (list): [description] - - Returns: - float: The accuracy of the Net - """ - - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - - # computing the accuracy - - # To Do add dimensionality - outputs = torch.nn.utils.rnn.pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True).to(self.device) - labels = torch.nn.utils.rnn.pack_padded_sequence(labels, captions_length.cpu(), batch_first=True).to(self.device) - right_predictions = outputs.data - labels.data == 0 - - acc = right_predictions.to(torch.float32).sum(axis=0) / right_predictions.shape[0] - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): - """[summary] - - Args: - train_set (MyDataset): [description] - validation_set (MyDataset): [description] - lr (float): [description] - epochs (int): [description] - vocabulary (Vocabulary): [description] - """ - - # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss - # Q. Why ignore_index is setted to instead of ? - # A. In the training, both output of the CaRNet and Target label start as padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. - # And since token is hardcoded as output at t_0 we could avoid the computation of loss on it, since will be 0 fover. - - criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images,captions_ids,captions_length in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_ids = captions_ids.to(self.device) # captions > (B, L) - captions_length = captions_length.to(self.device) - - # computing the network output on the current mini-batch - features = self.C(images) - outputs, outputs_length = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); - - outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) - - targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) - - - loss = criterion(outputs.data, targets.data) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - with torch.no_grad(): - self.C.eval() - self.R.eval() - features = self.C(images) - import random - numb = random.randint(0,2) - caption = self.R.generate_caption(features[numb],30) - print(vocabulary.rev_translate(captions_ids[numb])) - print(vocabulary.rev_translate(caption[0])) - self.C.train() - self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - if val_acc > best_val_acc: - best_val_acc = val_acc - best_epoch = e + 1 - - self.save("/content/drive/MyDrive/Progetti/Neural Networks/.saved") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" 
if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images,captions_ids,captions_length in data_set: - images = images.to(self.device) - - captions_ids = captions_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc - -# Example of usage -if __name__ == "__main__": - from torch.utils.data import DataLoader - ds = MyDataset("./dataset", percentage=1) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) - df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=32, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=32, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNetvHC(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") - #net.load("CaRNetvI") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/CaRNetvI.py b/NeuralModel/CaRNetvI.py deleted file mode 100644 index 32e35ac..0000000 --- a/NeuralModel/CaRNetvI.py +++ /dev/null @@ -1,449 +0,0 @@ -##################################################### -## DISCLAIMER: IL CODICE E` ESSENZIALMENTE HARDCODED, SOLO DI TESTING, NON RISPETTA I CANONI DELLO SGD, CERCO DI CAPIRE SOLO SE FUNZIONA! 
-# NON GIUDICARLO GENTILMENTE, SO CHE NON VA` FATTO COSI`, POI LO SISTEMO :) -## -## -## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html - - -import torch -import torch.nn as nn -import torchvision.models as models -from torch.nn.utils.rnn import pack_padded_sequence -import torch.nn.functional as F -from typing import Tuple,List -from Dataset import MyDataset -from Vocabulary import Vocabulary -class EncoderCNN(nn.Module): - def __init__(self, projection_size: int, device: str = "cpu"): - """Constructor of the Encoder NN - - Args: - projection_size (int): The dimension of projection into the space of RNN (Could be the input or the hidden state). - - device (str, optional): The device on which the operations will be performed. Default "cpu". - """ - super(EncoderCNN, self).__init__() - - self.device = torch.device(device) - resnet = models.resnet50(pretrained=True) - for param in resnet.parameters(): # Freezing weights - param.requires_grad_(False) - - modules = list(resnet.children())[:-1] # remove last fc layer - self.resnet = nn.Sequential(*modules) - - self.linear = nn.Linear(resnet.fc.in_features, projection_size) # define a last layer - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """Forward operation of the nn - - Args: - images (torch.tensor): The tensor of the image in the form (Batch Size, Channels, Width, Height) - - Returns: - [torch.tensor]: Features Projection in the form (Batch Size, Projection Dim.) - """ - # To Do Add dimensionality - features = self.resnet(images) - - features = features.reshape(features.size(0), -1).to(self.device) - features = self.linear(features) - - return features - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Define the constructor for the RNN Net - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the operations will be performed. Default "cpu" - """ - super(DecoderRNN, self).__init__() - - self.device = torch.device(device) - # Embedding layer that turns words into a vector of a specified size - self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index) - - # The LSTM takes embedded word vectors (of a specified size) as input - # and outputs hidden states of size hidden_dim - self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size) - - # The linear layer that maps the hidden state output dimension - # to the number of words we want as output, vocab_size - self.linear_1 = nn.Linear(hidden_size, vocab_size) - - - def forward(self, features: torch.tensor, captions: torch.tensor, captions_length: list[int]) -> Tuple[torch.tensor, list[int]]: - """Compute the forward operation of the RNN. - input of the LSTM cell for each time step: - t_{-1}: feature vector - t_0: Deterministict - . - . - . - t_{N-1}: The embedding vector associated to the S_{N-1} id. - t_{N}: - . - . - . - - - Args: - features (torch.tensor): The features associated to each element of the batch. (batch_size, embed_size) - - captions (torch.tensor): The caption associated to each element of the batch. 
(batch_size, max_captions_length, word_embedding) - REMARK Each caption is in the full form: + .... + - - caption_length ([int]): The length of each caption in the batch. - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_{MaxN}. (batch_size, max_captions_length, vocab_size) - - (list(int)): The length of each decoded caption. - REMARK The is provided as input at t_0. - REMARK The token will be removed from the input of the LSTM. - """ - - # Retrieve batch size - batch_size = features.shape[0] # features is of shape (batch_size, embed_size) - - # Create embedded word vector for each word in the captions - inputs = self.word_embeddings(captions) # In: Out: (batch_size, captions length, embed_size) - - # Initialize the hidden state and the cell state at time t_{-1} - _h, _c = self.lstm_unit(features) # _h : (Batch size, Hidden size), _c : (Batch size, Hidden size) - - # Deterministict Output as first word of the caption t_{0} - start = self.word_embeddings(torch.LongTensor([1]).to(self.device)) # Get the embeddings of the token - - # Bulk insert of embeddings to all the elements of the batch - outputs = start.repeat(batch_size,1,1).to(self.device) - - # Feed LSTMCell with image features and retrieve the state - - # How it works the loop? - # For each time step t \in {0, N-1}, where N is the caption length - - # Since the sequences are padded, how the forward is performed? Since the don't need to be feeded as input? - # The assumption is that the decode captions will have a length - - for idx in range(0,inputs.shape[1]): - _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions - _outputs = self.linear_1(_h) - outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch - - return outputs, list(map(lambda length: length-1, captions_length)) - - def generate_caption(self, feature: torch.tensor, captions_length: int) -> torch.tensor: - """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) - - Args: - feature (torch.tensor): The features vector (1, embedding_size) - captions_length (int): The length of the caption - - Returns: - torch.tensor: The caption associated to the image given. - It includes at t_0 by default. 
- """ - - sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded - input = self.word_embeddings(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) - with torch.no_grad(): - _h ,_c = self.lstm_unit(feature.unsqueeze(0)) - for _ in range(captions_length-1): - _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_size) - outputs = self.linear_1(_h) # outputs: (1, vocab_size) - _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id - sampled_ids.append(predicted) - input = self.word_embeddings(predicted) # inputs: (batch_size, embed_size) - input = input.to(torch.device(self.device)) # inputs: (batch_size, 1, embed_size) - if predicted == 2: - break - sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) - return sampled_ids - - -class CaRNetvI(nn.Module): - - def __init__(self, hidden_size: int, padding_index: int, vocab_size: int, embedding_size: int, device: str = "cpu"): - """Create the CaRNet - - Args: - hidden_size (int): The Capacity of the LSTM Cell - padding_index (int): The index of the padding id, given from the vocabulary associated to the dataset - vocab_size (int)): The size of the vocabulary associated to the dataset - embedding_size (int): The number of dimension associated to the input of the LSTM cell - device (str, optional): The device on which the net does the computation. Defaults to "cpu". - """ - - super(CaRNetvI, self).__init__() - self.padding_index = padding_index - self.device = torch.device(device) - - # Define Encoder and Decoder - self.C = EncoderCNN(embedding_size, device) - self.R = DecoderRNN(hidden_size, padding_index, vocab_size, embedding_size, device) - - self.C.to(self.device) - self.R.to(self.device) - - def save(self, file_path: str) -> bool: - """Save the net in non-volatile memory - - Args: - file_name (str): Relative path to save the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net saved correctly. False otherwise. - """ - try: - torch.save(self.C.state_dict(), f"{file_path}/CaRNetvI_C.pth") - torch.save(self.R.state_dict(), f"{file_path}/CaRNetvI_C_R.pth") - except Exception as ex: - print(ex) - return False - return True - - def load(self, file_path: str) -> bool: - """Load the net from non-volatile memory into RAM - - Args: - file_name (str): Relative path of the net. Ex. "home/pippo/saved" - - Returns: - bool: If True: Net loaded correctly. False otherwise. - """ - - # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) - self.C.load_state_dict(torch.load(f"{file_path}/CaRNetvI_C.pth", map_location=self.device)) - self.R.load_state_dict(torch.load(f"{file_path}/CaRNetvI_C_R.pth", map_location=self.device)) - - def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: - """Provide images to the net for retrieve captions - - Args: - images (torch.tensor): The images of the batch. (Batch Size, Channels, Width, Height) - captions (torch.tensor): (Batch Size, Max_Captions_Length). - ASSUMPION: The captions are padded with Token - - Returns: - (torch.tensor): The hidden state of each time step from t_1 to t_N. (batch_size, max_captions_length, vocab_size) - """ - features = self.C(images) - return self.R(features, captions) - - def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: - """Evaluate the accuracy of the Net. 
- Assumption: outputs and labels have same shape and already padded. - - Args: - outputs (torch.tensor): [description] - labels (torch.tensor): [description] - captions_length (list): [description] - - Returns: - float: The accuracy of the Net - """ - - # We could subtract labels.ids to outputs.ids tensor, all the values different from 0 (output_caption_id != target_caption_id) are mismatch! - - # computing the accuracy - - # To Do add dimensionality - outputs = torch.nn.utils.rnn.pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True).to(self.device) - labels = torch.nn.utils.rnn.pack_padded_sequence(labels, captions_length.cpu(), batch_first=True).to(self.device) - right_predictions = outputs.data - labels.data == 0 - - acc = right_predictions.to(torch.float32).sum(axis=0) / right_predictions.shape[0] - return acc - - # TO DO: Devo usare la confusion matrix????????? - - def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): - """[summary] - - Args: - train_set (MyDataset): [description] - validation_set (MyDataset): [description] - lr (float): [description] - epochs (int): [description] - vocabulary (Vocabulary): [description] - """ - - # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss - # Q. Why ignore_index is setted to instead of ? - # A. In the training, both output of the CaRNet and Target label start as padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. - # And since token is hardcoded as output at t_0 we could avoid the computation of loss on it, since will be 0 fover. - - criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ - else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") - - # initializing some elements - best_val_acc = -1. # the best accuracy computed on the validation data - best_epoch = -1 # the epoch in which the best accuracy above was computed - - # ensuring the classifier is in 'train' mode (pytorch) - self.C.train() - self.R.train() - - # creating the optimizer - optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) - - # loop on epochs! - for e in range(0, epochs): - - # epoch-level stats (computed by accumulating mini-batch stats) - epoch_train_acc = 0. - epoch_train_loss = 0. 
- epoch_num_train_examples = 0 - - for images, captions_ids, captions_length in train_set: - optimizer.zero_grad() - - batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated - epoch_num_train_examples += batch_num_train_examples - - - images = images.to(self.device) - captions_ids = captions_ids.to(self.device) # captions > (B, L) - captions_length = captions_length.to(self.device) - - # computing the network output on the current mini-batch - features = self.C(images) - outputs, outputs_length = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); - - outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) - - targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) - - - loss = criterion(outputs.data, targets.data) - - # computing gradients and updating the network weights - loss.backward() # computing gradients - optimizer.step() # updating weights - - with torch.no_grad(): - self.C.eval() - self.R.eval() - features = self.C(images) - import random - numb = random.randint(0,2) - caption = self.R.generate_caption(features[numb],30) - print(vocabulary.rev_translate(captions_ids[numb])) - print(vocabulary.rev_translate(caption[0])) - self.C.train() - self.R.train() - - with torch.no_grad(): - self.C.eval() - self.R.eval() - - # Compute captions as ids for all the training images - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - # accumulating performance measures to get a final estimate on the whole training set - epoch_train_acc += batch_train_acc * batch_num_train_examples - - # accumulating other stats - epoch_train_loss += loss.item() * batch_num_train_examples - self.C.train() - self.R.train() - - # printing (mini-batch related) stats on screen - print(" mini-batch:\tloss={0:.4f}, tr_acc={1:.2f}".format(loss.item(), batch_train_acc)) - - val_acc = self.eval_classifier(validation_set) - - # # saving the model if the validation accuracy increases - # if val_acc > best_val_acc: - # best_val_acc = val_acc - # best_epoch = e + 1 - # self.save("CaRNetvI") - - epoch_train_loss /= epoch_num_train_examples - - # printing (epoch related) stats on screen - print(("epoch={0}/{1}:\tloss={2:.4f}, tr_acc={3:.2f}, val_acc={4:.2f}" - + (", BEST!" 
if best_epoch == e + 1 else "")) - .format(e + 1, epochs, epoch_train_loss, - epoch_train_acc / epoch_num_train_examples, val_acc)) - - - def eval_classifier(self, data_set): - """Evaluate the classifier on the given data set.""" - - # checking if the classifier is in 'eval' or 'train' mode (in the latter case, we have to switch state) - training_mode_originally_on = self.C.training and self.R.training - if training_mode_originally_on: - self.C.eval() - self.R.eval() # enforcing evaluation mode - - - - with torch.no_grad(): # keeping off the autograd engine - - # loop on mini-batches to accumulate the network outputs (creating a new iterator) - for images, captions_ids, captions_length in data_set: - images = images.to(self.device) - - captions_ids = captions_ids.to(self.device) - - projections = self.C(images) - - captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) - - for idx,projection in enumerate(range(projections.shape[0])): - _caption_no_pad = self.R.generate_caption(projections[idx],captions_ids.shape[1]) - captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad - # Fill the remaining portion of caption eventually with zeros - # Accuracy is not altered since if the length of caption is smaller than the captions_target_ids(padded), feed it with PAD is valid. - - captions_output_padded = captions_output.type(torch.int32).to(self.device) # From list of tensors to tensors - - # computing performance - acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) - - if training_mode_originally_on: - self.C.train() # restoring the training state, if needed - self.R.train() - return acc - -# Example of usage -if __name__ == "__main__": - from Vocabulary import Vocabulary - from Dataset import MyDataset - from torch.utils.data import DataLoader - ds = MyDataset("./dataset/images/", percentage=8) - v = Vocabulary(ds,reload=True) - dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) - df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) - # use dataloader facilities which requires a preprocessed dataset - - - dataloader_training = DataLoader(dc, batch_size=5, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) - - dataloader_evaluation = DataLoader(df, batch_size=5, - shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) - - net = CaRNetvI(512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") - #net.load("CaRNetvI") - net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModel/Dataset.py b/NeuralModel/Dataset.py deleted file mode 100644 index 3ef9a28..0000000 --- a/NeuralModel/Dataset.py +++ /dev/null @@ -1,128 +0,0 @@ -import os -import pandas as pd -import torch -import numpy as np -from enum import Enum -from torch.utils.data import Dataset, DataLoader -import torch.nn as nn -from PIL import Image -import re -from torchvision import transforms - - -# ENV: -# MAX_CAPTION_LENGTH -MAX_CAPTION_LENGTH = 15 - -class MyDataset(Dataset): - - image_trasformation_parameter = { - "crop":{ - "size": 224 - }, - "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) - "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) - } - - def __init__(self, directory_of_data:str = None, percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): 
- """Create a new dataset from source files - - Args: - directory_of_data (str): [description] - """ - if already_computed_dataframe is not None: - self.directory_of_data = directory_of_data - self._dataset = already_computed_dataframe - return - - if not os.path.exists(directory_of_data): - raise ValueError(f"{directory_of_data} not Exist!") - if not os.path.isdir(directory_of_data): - raise ValueError(f"{directory_of_data} is not a directory!") - - _temp_dataset=pd.read_csv(f"{directory_of_data}/results.csv", sep="|", skipinitialspace=True)[["image_name","comment"]] - _temp_dataset["comment"] = _temp_dataset["comment"].apply( lambda comment: re.findall("[\\w]+|\.|\,",str(comment).lower())) - _temp_dataset = _temp_dataset[ _temp_dataset["comment"].map(len) <= MAX_CAPTION_LENGTH] - self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) - self.directory_of_data = directory_of_data - - def get_fraction_of_dataset(self, percentage: int, delete_transfered_from_source: bool = False): - _temp_df_moved = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) - _temp_df_copy = _temp_df_moved.copy() - - if delete_transfered_from_source: - self._dataset = self._dataset.drop(_temp_df_copy.index) - return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) - - def get_all_distinct_words_in_dataset(self): - words = [] - for idx,row in self._dataset.iterrows(): - for word in row["comment"]: - if word not in words: - words.append(word) - return words - - def __len__(self): - return self._dataset.shape[0] - - def __getitem__(self, idx): - - image, caption = Image.open(f"{self.directory_of_data}/images/{self._dataset.iloc[idx]['image_name']}").convert('RGB'), \ - self._dataset.iloc[idx]["comment"] - - return image, caption - - def pack_minibatch_training(self, data, vocabulary): - - # Sort a data list by caption length (descending order). - data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - operations = transforms.Compose([ - transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"],MyDataset.image_trasformation_parameter["crop"]["size"])), # Crop a random portion of image and resize it to a given size. - transforms.RandomHorizontalFlip(p=0), # Horizontally flip the given image randomly with a given probability. - transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] - transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]), - ]) - images = list(map(lambda image: operations(image),list(images))) - - # Merge images (from tuple of 3D tensor to 4D tensor). - images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) - - captions_length = torch.tensor([len(caption)+2 for caption in captions]) - - captions = [vocabulary.translate(caption,"complete") for caption in captions] - - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - - - return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) - - def pack_minibatch_evaluation(self, data, vocabulary): - - # Sort a data list by caption length (descending order). 
- data.sort(key=lambda x: len(x[1]), reverse=True) - - images, captions = zip(*data) - - operations = transforms.Compose([ - transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Crops the given image at the center. - transforms.ToTensor(), - transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]) - ]) - - images = list(map(lambda image: operations(image),list(images))) - - # Merge images (from tuple of 3D tensor to 4D tensor). - images = torch.stack(images, 0) # (Batch Size, Color, Height, Width) - - captions_length = torch.tensor([len(caption)+2 for caption in captions]) - - captions = [vocabulary.translate(caption,"complete") for caption in captions] - - captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) - - - return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) - \ No newline at end of file diff --git a/NeuralModel/Vocabulary.py b/NeuralModel/Vocabulary.py deleted file mode 100644 index 725945a..0000000 --- a/NeuralModel/Vocabulary.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import torch -import warnings -from Dataset import MyDataset -from typing import List - -class Vocabulary(): - # The vocabulary implementation is done with a pre-trained word embedding GLOVE50d - # each word is represented by a record in a dataframe with this structure - - - def __init__(self, source_dataset: MyDataset, verbose: bool = False, reload: bool = False): - - self.enriched = False # Tell that all the word coming from the dataset are in the vocabulary if it is set to True - self._make_enrich = False # Allow the user to enrich the vocabulary if it is set to True - # Check if the enriched vocabulary(glove + PAD + SOS + EOS + UNK + dataset vocabulary) already exists - if os.path.exists(".saved/rich_embeddings_v1.pt") and os.path.exists(".saved/rich_word2id_v1.pt") and not reload: - self.embeddings = torch.load(".saved/rich_embeddings_v1.pt") - self.word2id = torch.load(".saved/rich_word2id_v1.pt") - self.enriched = True - return - - # Since the constructor arrived here, we need to load for the 1st time all the possible words from the dataset - dataset_words = source_dataset.get_all_distinct_words_in_dataset() - - # Dictionary length - self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + SOS + EOS + UNK) - - self.word2id = {} - self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + SOS + EOS + UNK) | DIM2: Dict Rows +4 flavored token (PAD + SOS + EOS + UNK) as 1-hot - - # Initialize the token: - # , , , - self.word2id[""] = 0 - self.word2id[""] = 1 - self.word2id[""] = 2 - self.word2id[""] = 3 - - counter = 4 - for word in dataset_words: - self.word2id[word] = counter - counter += 1 - - self.embeddings = torch.eye(self.dictionary_length) - - def predefined_token_idx(self) -> dict: - return { - "":0, - "":1, - "":2, - "":3 - } - - def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - - # Initialize the translator - - if type == "uncomplete": - _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... 
- - if type == "complete": - _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... + - _sequence[-1] = self.word2id[""] - - _sequence[0] = self.word2id[""] - - counter = 1 # Always skip - - # Evaluate all the word into the caption and translate it to an embeddings - for word in word_sequence: - if word.lower() in self.word2id.keys(): - _sequence[counter] = self.word2id[word.lower()] - else: - _sequence[counter] = self.word2id[""] - counter += 1 - - return _sequence - - def rev_translate(self, words_id : torch.tensor) -> List[str]: - """Given a sequence of word, translate into id list according to the vocabulary. - - Args: - word_sequence (str): [description] - """ - # Check if the Vocabulary is enriched with all the possible word outside glove, taken from the dataset. - return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) - - - def __len__(self): - """The total number of words in this Vocabulary.""" - - return len(self.word2id.keys()) - - -# ---------------------------------------------------------------- -# Usage example - -if __name__ == '__main__': - #Load the vocabulary - v = Vocabulary(verbose=True) - # Make a translation - print(v.translate(["I","like","PLay","piano","."])) - # Enrich the vocabulary - v.make_enrich = True - dataset = ["I","Like","PLay","PIPPOplutopaperino"] - v.enrich(dataset) - v.make_enrich = False - # Enrich the vocabulary with a bulk insert - v.make_enrich = True - dataset = [["I","Like","PLay","PIPPOplutopaperino"],["I","Like","PLay","pizza"]] - v.bulk_enrich(dataset) - v.make_enrich = False - - - - - - - - - - - - - - \ No newline at end of file diff --git a/NeuralModels/Attention/IAttention.py b/NeuralModels/Attention/IAttention.py new file mode 100644 index 0000000..94b15f5 --- /dev/null +++ b/NeuralModels/Attention/IAttention.py @@ -0,0 +1,37 @@ +import torch +import torch.nn as nn + +class IAttention(nn.Module): + """ + Class interface for Attention unit + Args are intended as suggested. + """ + def __init__(self, *args): + """Constructor for an Attention model + + Args: + encoder_dim (int): + The number of features extracted from the image. + hidden_dim (int): + The capacity of the LSTM. + attention_dim (int): + The capacity of the Attention Model. + """ + super(IAttention, self).__init__() + + def forward(self, *args): + """Compute z_t given images and hidden state at t-1 for all the element in the batch. + + Args: + images (torch.Tensor): `(batch_dim, image_portions, encoder_dim)` + The tensor of the images in the batch. + lstm_hidden_states (torch.Tensor): `(batch_dim, hidden_dim)` + The hidden states at t-1 of the elements in the batch. + + Returns: + (Tuple[torch.Tensor,torch.Tensor]): `[(batch_dim, encoder_dim), (batch_dim, image_portions)]` + Z_t and the alphas evaluated for each portion of the image, for each image in the batch. + """ + pass + + \ No newline at end of file diff --git a/NeuralModels/Attention/SoftAttention.py b/NeuralModels/Attention/SoftAttention.py new file mode 100644 index 0000000..c50888f --- /dev/null +++ b/NeuralModels/Attention/SoftAttention.py @@ -0,0 +1,77 @@ +from re import S +import torch.nn as nn +import torch +import torchvision.models as models +from typing import Tuple + +class SoftAttention(nn.Module): + """ + Simple implementation of Bahdanau Attention model. 
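+    Given the image features a_i (one per image portion) and the previous LSTM hidden
+    state h_{t-1}, it computes (up to bias terms) e_{t,i} = w^T ReLU(W_a a_i + W_h h_{t-1}),
+    alpha_t = softmax(e_t) and z_t = sum_i alpha_{t,i} a_i, as implemented in forward().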
+ """ + + def __init__(self, encoder_dim: int , hidden_dim: int, attention_dim: int, number_of_splits: int = 7): + """Constructor for a SoftAttention model + + Args: + encoder_dim (int): + The number of features extracted from the image. + hidden_dim (int): + The capacity of the LSTM. + attention_dim (int): + The capacity of the Attention Model. + number_of_splits (int): + Number of image portions for Heigth (square resolution) + """ + super(SoftAttention, self).__init__() + + self.attention_dim = attention_dim + + self.encoder_dim = encoder_dim + + self.number_of_splits = number_of_splits + + self.image_attention_projection = nn.Linear(encoder_dim, attention_dim) + + self.lstm_hidden_state_attention_projection = nn.Linear(hidden_dim, attention_dim) + + print(f"Construction of Attention: \ + \n\t Attention dimension: {attention_dim},\ + \n\t Encoder dimension: {encoder_dim},\ + \n\t LSTM Capacity: {hidden_dim},\ + \n\t Alphas: {number_of_splits**2}") + + self.attention = nn.Linear(attention_dim, 1) + + self.ReLU = nn.ReLU() + + self.out = nn.Softmax(dim=1) + + + def forward(self, images: torch.Tensor, lstm_hidden_states: torch.Tensor) -> Tuple[torch.Tensor,torch.Tensor]: + """Compute z_t given images and hidden state at t-1 for all the element in the batch. + + Args: + images (torch.Tensor): `(batch_dim, image_portions, encoder_dim)` + The tensor of the images in the batch. + lstm_hidden_states (torch.Tensor): `(batch_dim, hidden_dim)` + The hidden states at t-1 of all the element in the batch. + + Returns: + (Tuple[torch.Tensor,torch.Tensor]): `[(batch_dim, encoder_dim), (batch_dim, image_portions)]` + Z_t and the alphas evaluated for each portion of the image, for each image in the batch. + """ + + _images_attention = self.image_attention_projection(images) # IN: (batch_dim, image_portions, encoder_dim) -> Out: (batch_dim, image_portions, attention_dim) + + _lstm_attention = self.lstm_hidden_state_attention_projection(lstm_hidden_states) # IN: (batch_dim, hidden_dim) -> Out: (batch_size, attention_dim) + + # (batch_size, image_portions, attention_dim) + (batch_size, 1, attention_dim) -> Broadcast on dim 2 -> (batch_size, image_portions, attention_dim) + _attention = self.attention(self.ReLU(_images_attention + _lstm_attention.unsqueeze(1))).squeeze(2) # IN: (batch_dim, image_portions, attention_dim) -> Out: (batch_size, image_portions) + + _alphas_t = self.out(_attention) # Out: (batch_dim, image_portions) + + # Retrieve z_t + attention_weighted_encoding = (images * _alphas_t.unsqueeze(2)).sum(dim=1) # Out: (batch_dim, encoder_dim) + + return attention_weighted_encoding, _alphas_t + \ No newline at end of file diff --git a/NeuralModels/CaRNet.py b/NeuralModels/CaRNet.py new file mode 100644 index 0000000..59dc6c5 --- /dev/null +++ b/NeuralModels/CaRNet.py @@ -0,0 +1,561 @@ +##################################################### +## +## +## pip install torch==1.3.0+cu100 torchvision==0.4.1+cu100 -f https://download.pytorch.org/whl/torch_stable.html + + +import torch +import torch.nn as nn +import torchvision.models as models +from torch.nn.utils.rnn import pack_padded_sequence +import torch.nn.functional as F +from typing import Tuple,List +from .Dataset import MyDataset +from .Vocabulary import Vocabulary +from .Decoder.IDecoder import IDecoder +from .Encoder.IEncoder import IEncoder +from .Attention.IAttention import IAttention +import numpy as np +from PIL import Image +from torchvision import transforms +from torchvision.utils import save_image +import matplotlib.pyplot as plt +from 
VARIABLE import MAX_CAPTION_LENGTH +from .Metrics import Result + +class CaRNet(nn.Module): + """ + The ConvolutionalandRecurrentNet (CaRNet). + CaRNet works with a Residual NeuralNet with 50layers (ResNet50) with the last layer removed. + In CaRNet it supports 3 types of LSTM: + - vI: the features extracted from the image are provided as input with token + - vH: the features extracted from the image becames the hidden state at t_0 + - vHC: the features extracted from the image becames both the hidden and cell state at t_0 + + When it is flavoured with Attention, it becames a ConvolutionalAttentionRecurrentNet (CARNet). + CARNet works with a Residual NeuralNet with 50layers (ResNet50) with the last convolutional layer exposed. + For now support only 1 type of LSTM: + - vHC + """ + + def __init__(self, encoder: IEncoder, decoder: IDecoder, net_name: str, encoder_dim: int, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, attention: IAttention = None, attention_dim: int = 1024, device: str = "cpu"): + """Create the C[aA]RNet + + Args: + encoder (IEncoder): + The encoder to use. + + decoder (IDecoder): + The decoder to use. + + net_name (str): + Name of the Neural Network. + + encoder_dim (int): + The dimensionality of the features vector extracted from the image. + + hidden_dim (int): + The Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + Size associated to the input of the LSTM cell. + + attention (IAttention, optional): (Default is None) + The attention if Provided. + + attention_dim (int, optional): (Default is 1024) + Size of the attention layer, used only if attention is not None. + + device (str, optional): + The device on which the net does the computation. Defaults to "cpu". + """ + + super(CaRNet, self).__init__() + self.padding_index = padding_index + self.device = torch.device(device) + self.name_net = net_name + self.result_storer = Result() + # Define Encoder and Decoder + self.C = encoder(encoder_dim = encoder_dim, device = device) + self.R = None + + # Take the attention in consideration + self.attention = False + + if attention is not None: # I know..some skilled dev. will hate me for this if-else statement. Forgive ME. + self.attention = True + self.R = decoder(hidden_dim, padding_index, vocab_size, embedding_dim, device, attention(self.C.encoder_dim, hidden_dim, attention_dim)) + else: + self.R = decoder(hidden_dim, padding_index, vocab_size, embedding_dim, device) + + # Check if the Recurrent net was initialized oth. we are in error state. + if self.R is None: + raise ValueError("Could not create the Recurrent network.") + + # Send both net to the defined device -> cpu or gpu + self.C.to(self.device) + self.R.to(self.device) + + def switch_mode(self, mode: str) -> bool: + """ Change the working modality of the net among "training" or "evaluation". + + Args: + mode (str): + New mode of work, "training" | "evaluation" + + Returns: + bool: + If True the state is correctly changed, oth. not. + """ + # Q. Why no control if they already stay in the wanted state? + # A. Increase the condition may lead to more than expected case to control. 
Avoid IfElse community addicted :) + if mode == "training": + self.C.train() # switch to training state + self.R.train() + return True + + if mode == "evaluation": + self.C.eval() # switch to evaluation state + self.R.eval() + return True + return False + + def save(self, file_path: str) -> bool: + """Save the net in non-volatile memory + + Args: + file_name (str): Relative path to save the net. Ex. "home/pippo/saved" + + Returns: + bool: If True: Net saved correctly. False otherwise. + """ + try: + # Name_type_encoderdim_embeddingdim_hiddendim_attentiondim + torch.save(self.C.state_dict(), f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_C.pth") + torch.save(self.R.state_dict(), f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_R.pth") + except Exception as ex: + print(ex) + return False + return True + + def load(self, file_path: str) -> bool: + """Load the net from non-volatile memory into RAM + + Args: + file_name (str): Relative path of the net. Ex. "home/pippo/saved" + + Returns: + bool: If True: Net loaded correctly. False otherwise. + """ + + # since our classifier is a nn.Module, we can load it using pytorch facilities (mapping it to the right device) + self.C.load_state_dict(torch.load(f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_C.pth", map_location=self.device)) + self.R.load_state_dict(torch.load(f"{file_path}/{self.name_net}_{self.C.encoder_dim}_{self.R.hidden_dim}_{self.R.attention.attention_dim if self.attention == True else 0}_R.pth", map_location=self.device)) + + def forward(self, images: torch.tensor, captions: torch.tensor) -> torch.tensor: + """Provide images to the net for retrieve captions + + Args: + images (torch.tensor): `(Batch Size, Channels, Width, Height)` + The images of the batch. + + captions (torch.tensor): `(Batch Size, Max_Captions_Length)`. + ASSUMPION: The captions are padded with Token + + Returns: + (torch.tensor): `(batch_size, max_captions_length, vocab_size)` + The output of each time step from t_1 to t_N. + REMARK token is provided as output at t_0 + """ + features = self.C(images) + return self.R(features, captions) + + def __accuracy(self, outputs: torch.tensor, labels: torch.tensor, captions_length: List[int]) -> float: + """Evaluate the accuracy of the Net with Jaccard Similarity. + Assumption: outputs and labels have same shape and already padded. + + Args: + outputs (torch.tensor): `(batch_dim, MAX_CAPTION_LENGTH)` + The captions generated from the net. + labels (torch.tensor): `(batch_dim, MAX_CAPTION_LENGTH)` + The Real captions. 
+ captions_length (list): + + Returns: + float: The accuracy of the Net + """ + + + # computing the accuracy with Jaccard Similarity, pytorch unique facility has bugs with cuda....it can be done "a manella" :) + # from python 3.9 you could use the package torchmetrics + # from torchmetrics import JaccardIndex + # intersection_over_union = JaccardIndex(num_classes=self.R.vocab_size).cuda() if self.device.type != "cpu" else JaccardIndex(num_classes=self.R.vocab_size) + # return intersection_over_union(outputs, labels) + outputs = np.array(list(map(lambda output: np.unique(output), outputs.cpu())), dtype=object) # Remove duplicate from each caption + labels = np.array(list(map(lambda label: np.unique(label), labels.cpu())), dtype=object) # Remove duplicate from each caption + + unions = list(map(lambda index: len(np.union1d(outputs[index],labels[index])), range(labels.shape[0]))) + intersections = list(map(lambda index: len(np.intersect1d(outputs[index],labels[index])), range(labels.shape[0]))) + return torch.mean(torch.tensor(intersections).type(torch.float)/torch.tensor(unions).type(torch.float), axis=0) + + + def train(self, train_set: MyDataset, validation_set: MyDataset, lr: float, epochs: int, vocabulary: Vocabulary): + """Train the net + + Args: + train_set (MyDataset): + The associate training set. + + validation_set (MyDataset): + The associate validation set. + + lr (float): + The learning rate. + + epochs (int): + The number of epochs. + + vocabulary (Vocabulary): + The vocabulary associate to the Dataset + """ + + # Initialize Loss: CrossEntropyLoss -> Softmax + NegativeLogLikelihoodLoss + # Q. Why ignore_index is setted to instead of ? + # A. In the training, both output of the CaRNet and Target is a padded tensor, but when we compute the loss it will evaluate the tensor with pack_padded_sequence. + # And since token is hardcoded as output at t_0 and it is contained into the Target we could avoid the computation of loss on it, since will be 1. + + criterion = nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum").cuda() if self.device.type == "cuda" \ + else nn.CrossEntropyLoss(ignore_index=vocabulary.predefined_token_idx()[""],reduction="sum") + + # initializing some elements + best_val_acc = -1. # the best accuracy computed on the validation data + best_epoch = -1 # the epoch in which the best accuracy above was computed + + # ensuring the classifier is in 'train' mode (pytorch) + self.switch_mode("training") + + # creating the optimizer + optimizer = torch.optim.Adam(list(self.R.parameters()) + list(self.C.parameters()), lr) + + # loop on epochs + for e in range(0, epochs): + + # epoch stats (computed by accumulating mini-batch stats) + epoch_train_acc = 0. + epoch_train_loss = 0. 
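+            # The mini-batch loop below performs: a forward pass through the encoder C and
+            # the decoder R, the CrossEntropy loss on the pack_padded (non-padded) time steps,
+            # the doubly stochastic attention penalty sum_i (1 - sum_t alpha_{t,i})^2 when
+            # attention is enabled, the backward pass, and a no-grad accuracy check.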
+ epoch_num_train_examples = 0 + batch_id_reporter = 0 + for images,captions_ids,captions_length in train_set: + optimizer.zero_grad() + + batch_num_train_examples = images.shape[0] # mini-batch size (it might be different from 'batch_size') -> last batch truncated + epoch_num_train_examples += batch_num_train_examples + + # Send data to the appropriate device + images = images.to(self.device) + captions_ids = captions_ids.to(self.device) + captions_length = captions_length.to(self.device) + + # computing the network output on the current mini-batch + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + features = self.C(images) + + # Check if attention is provided, if yes the output will change accordly for fitting doubly stochastic gradient + if self.attention == False: # I know..some skilled dev. will hate me for this if-else statement. Forgive ME. + outputs, _ = self.R(features, captions_ids, captions_length) # outputs > (B, L, |V|); + else: + outputs, _, alphas = self.R(features, captions_ids, captions_length) + + outputs = pack_padded_sequence(outputs, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength, |Vocabulary|) -> (Batch * CaptionLength, |Vocabulary|) + + targets = pack_padded_sequence(captions_ids, captions_length.cpu(), batch_first=True) #(Batch, MaxCaptionLength) -> (Batch * CaptionLength) + + loss = criterion(outputs.data, targets.data) + + # Doubly stochastic gradient if attention is ON + if self.attention == True: + loss += float(torch.sum(( + 0.5 * torch.sum(( + (1 - torch.sum(alphas, dim=1,keepdim=True)) ** 2 # caption_length sum + ), dim=2, keepdim=True) # alpha_dim sum + ), dim=0).squeeze(1)) # batch_dim sum + + # computing gradients and updating the network weights + loss.backward() # computing gradients + optimizer.step() # updating weights + + # Training set accuracy evaluation + with torch.no_grad(): + self.switch_mode("evaluation") + + # computing the network output on the current mini-batch + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + projections = self.C(images) + + # Create a padded tensor manually + captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) + + for idx, _ in enumerate(range(projections.shape[0])): + # OUT: (1, CAPTION_LENGTH) + if self.attention == True: + _caption_no_pad, _ = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, H_portions, W_portions, encoder_dim), 1) + else: + _caption_no_pad = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, encoder_dim), 1) + # Add for each batch element the caption. 
The surplus element are already feeded with zeros + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + + + captions_output_padded = captions_output.type(torch.int32).to(self.device) # Out: (batch_dim, MAX_CAPTION_LENGTH) + + # computing performance + batch_train_acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) + + # accumulating performance measures to get a final estimate on the whole training set + epoch_train_acc += batch_train_acc * batch_num_train_examples + + # accumulating other stats + epoch_train_loss += loss.item() * batch_num_train_examples + + self.switch_mode("training") + + # printing (mini-batch related) stats on screen + print(f" mini-batch:\tloss={loss.item():.4f}, tr_acc={batch_train_acc:.5f}") + + # Store result of this batch in a dataframe + self.result_storer.add_train_info(epoch=int(e), batch_id=int(batch_id_reporter),loss=float(loss.item()),accuracy=float(batch_train_acc) ) + batch_id_reporter += 1 + # Evaluate the accuracy of the validation set + val_acc = self.eval_net(validation_set,vocabulary) + + # # saving the model if the validation accuracy increases + if val_acc > best_val_acc: + best_val_acc = val_acc + best_epoch = e + 1 + self.save("./.saved") + + epoch_train_loss /= epoch_num_train_examples + # Store the result of the validation set in this epoch + self.result_storer.add_validation_info(epoch=int(e), accuracy=float(val_acc)) + # printing (epoch related) stats on screen + print(f"epoch={e + 1}/{epochs}:\tloss={epoch_train_loss:.4f}, tr_acc={epoch_train_acc / epoch_num_train_examples:.5f}, val_acc={val_acc:.5f}, {'BEST!' if best_epoch == e+1 else ''}") + # store data in files + self.result_storer.flush() + + def eval_net(self, data_set, vocabulary): + """ Evaluate a data set + + Args: + data_set (MyDataset): + The associate data set. + + vocabulary (Vocabulary): + The vocabulary associate to the Dataset + + Returns: + (int): + Accuracy on given dataset + """ + + self.switch_mode("evaluation") # enforcing evaluation mode + with torch.no_grad(): # keeping off the autograd engine + _images = None + # loop on mini-batches to accumulate the network outputs (creating a new iterator) + for images,captions_ids,captions_length in data_set: + images = images.to(self.device) + + captions_ids = captions_ids.to(self.device) + + # If Attention is on: + # In: (batch_dim, channels, height, width) Out: (batch_dim,H_portions, W_portions, encoder_dim) + # Else: + # In: (batch_dim, channels, height, width) Out: (batch_dim, encoder_dim) + # Retrieve Features for each image + projections = self.C(images) + + # Create a padded tensor manually + captions_output = torch.zeros((projections.shape[0],captions_ids.shape[1])).to(self.device) + + for idx, _ in enumerate(range(projections.shape[0])): + # OUT: (1, CAPTION_LENGTH) + if self.attention == True: + _caption_no_pad, _ = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, H_portions, W_portions, encoder_dim), 1) + else: + _caption_no_pad = self.R.generate_caption(projections[idx].unsqueeze(0),captions_ids.shape[1]) # IN: ((1, encoder_dim), 1) + # Add for each batch element the caption. 
The surplus element are already feeded with zeros + captions_output[idx,:_caption_no_pad.shape[1]] = _caption_no_pad + + # Pick the 1st image of the last batch for printing out the result + _image = images[0] + captions_output_padded = captions_output.type(torch.int32).to(self.device) # Out: (batch_dim, MAX_CAPTION_LENGTH) + + # computing performance + acc = self.__accuracy(captions_output_padded.squeeze(1), captions_ids, captions_length) + + self.eval(_image,vocabulary) + self.switch_mode("training") + + return acc + + def __generate_image_caption(self, image: torch.Tensor, vocabulary: Vocabulary, image_name: str = "caption.png"): + """ Genareate an image with caption. + + Args: + image (torch.Tensor): `(channels, height, width)` + The tensorial representation of the image in resnet50 form. + + vocabulary (Vocabulary): + The vocabulary associated to the dataset. + + image_name (str, optional): Defaults to "caption.png". + The image of the generated file + """ + self.switch_mode("evaluation") # enforcing evaluation mode + + # If Attention is on: + # Out: 1st step (batch_dim,H_portions, W_portions, encoder_dim) -> 2nd step (batch_dim, H_portions * W_portions, encoder_dim) + # Else: + # Out: (1, encoder_dim) + features = self.C(image.unsqueeze(0)) + + if self.attention == True: + caption, alphas = self.R.generate_caption(features,MAX_CAPTION_LENGTH) + else: + caption = self.R.generate_caption(features,MAX_CAPTION_LENGTH) + + # Generate image caption + caption = vocabulary.rev_translate(caption[0]) + + # Adjust the color of the image wrt the transform operation of the resnet50 + image[0] = image[0] * 0.229 + image[1] = image[1] * 0.224 + image[2] = image[2] * 0.225 + image[0] += 0.485 + image[1] += 0.456 + image[2] += 0.406 + + # Swap color channels + image = image.permute((1,2,0)) # IN: (height, width, channels) + + # If attention is ON perform the evaluation of attention over the immage + if self.attention == True: + self.__generate_image_attention(image, caption, alphas) + + plt.figure(figsize=(15, 15)) + plt.imshow(image.cpu()) + plt.title(caption) + plt.savefig("caption.png") + plt.close() + + self.switch_mode("training") + + def __generate_image_attention(self, image: torch.tensor, caption, alphas, image_name: str = "attention.png"): + """Perform the evaluation of the attention over the image. + + Args: + image (torch.Tensor): + The tensorial representation of the image. + + caption (list(str)): + The caption. + + alphas (torch.Tensor): + + image_name (str, optional): Defaults to "attention.png". + The image of the generated file + """ + self.switch_mode("evaluation") + + fig = plt.figure(figsize=(15, 15)) + _caption_len = len(caption) + for t in range(_caption_len): + # from 49 element to 7x7 + _att = alphas[t].reshape(self.R.attention.number_of_splits,self.R.attention.number_of_splits) + + # Add a subplot accordly to the word in caption position + ax = fig.add_subplot(_caption_len//2, _caption_len//2, t+1) + + ax.set_title(f"{caption[t]}", fontsize=12) + + img = ax.imshow(image.cpu()) + + # Add attention layer + ax.imshow(_att, cmap='gray', alpha=0.7, extent=img.get_extent()) + plt.tight_layout() + plt.savefig(image_name) + plt.close() + + self.switch_mode("training") + + # Inspiration is taken from this example https://www.kaggle.com/mdteach/image-captioning-with-attention-pytorch + # Thanks ABISHEK BASHYAL :) + def eval(self, image: object, vocabulary: Vocabulary): + """Evaluate an image and retrieve the associated caption. 
+ + Args: + image (PIL.Image.Image or torch.Tensor): if tensor `(channels, height, width)` + The image for which it evaluate the caption. + + vocabulary (Vocabulary): + The vocabulary. + + Raises: + ValueError: If the image is not a tensor or an image. + """ + # enforcing evaluation mode + self.switch_mode("evaluation") + + if isinstance(image, Image.Image): + operations = transforms.Compose([ + transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Crops the given image at the center. + transforms.ToTensor(), + transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]) + ]) + image = operations(image) + + if not(isinstance(image,torch.Tensor)): + raise ValueError(f"Image is not the expected type, got: {type(image)}.") + + self.__generate_image_caption(image,vocabulary) + + self.switch_mode("training") + +# Example of usage +if __name__ == "__main__": + from torch.utils.data import DataLoader + from FactoryModels import * + ds = MyDataset("./dataset", percentage=1) + v = Vocabulary(ds,reload=True) + + # Load Encoder and Decoder models + decoder = FactoryDecoder(Decoder.RNetvI) + encoder = FactoryEncoder(Encoder.CResNet50Attention) + + dc = ds.get_fraction_of_dataset(percentage=70, delete_transfered_from_source=True) + df = ds.get_fraction_of_dataset(percentage=30, delete_transfered_from_source=True) + # use dataloader facilities which requires a preprocessed dataset + + + dataloader_training = DataLoader(dc, batch_size=32, + shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_training(data,v)) + + dataloader_evaluation = DataLoader(df, batch_size=32, + shuffle=True, num_workers=2, collate_fn = lambda data: ds.pack_minibatch_evaluation(data,v)) + + + net = CaRNet(encoder, decoder, "CaRNetvI",1596,512,0,len(v.word2id.keys()),v.embeddings.shape[1],"cuda:0") + #net.load("CaRNetvI") + net.train(dataloader_training,dataloader_evaluation,1e-3,500,v) diff --git a/NeuralModels/Dataset.py b/NeuralModels/Dataset.py new file mode 100644 index 0000000..56c9154 --- /dev/null +++ b/NeuralModels/Dataset.py @@ -0,0 +1,269 @@ +# Typing trick for avoid circular import dependencies valid for python > 3.9 +# from __future__ import annotations +# from typing import TYPE_CHECKING +# if TYPE_CHECKING: +# from .Vocabulary import Vocabulary + +import os +import pandas as pd +import torch +from torch.utils.data import Dataset +import torch.nn as nn +from PIL import Image +import re +from torchvision import transforms +from VARIABLE import MAX_CAPTION_LENGTH, IMAGES_SUBDIRECTORY_NAME, CAPTION_FILE_NAME +from typing import Tuple, List, Iterable + + +class MyDataset(Dataset): + """ + Wrapper of Dataset Pytorch Object. + For our scopes the dataset folder must follow this rule: + + 1) As a child of the directory, we must have a csv named `CAPTION_FILE_NAME` that follow this pattern:\n + `image_name| comment_number| comment`\n + Example: 1000092795.jpg| 0| Two young guys with shaggy hair look at their hands while hanging out in the yard . 
+ + 2) As brother of the csv file we must have the folder of the images, the directory name is a variable `IMAGES_SUBDIRECTORY_NAME` + + Assumption: + + 1) The dataset will pick only the caption less then the variable `MAX_CAPTION_LENGTH` + + """ + image_trasformation_parameter = { + "crop":{ + "size": 224 + }, + "mean": torch.tensor([0.485, 0.456, 0.406]), # the mean of the training data on the 3 channels (RGB) + "std_dev": torch.tensor([0.229, 0.224, 0.225]) # the standard deviation of the training data on the 3 channels (RGB) + } + + def __init__(self, directory_of_data:str , percentage:int = 100, already_computed_dataframe: pd.DataFrame = None): + """Create a new dataset from source files or from a preprocessed dataset. + + Args: + directory_of_data (str, mandatory): + The directory tagged as root for the dataset. + + percentage (int, optional): Default is 100. + The percentage of row that we want store in our object. + + already_computed_dataframe (pd.DataFrame, Optional): Default is None. + If the dataset is computed outside put it there. + REMARK Please follow the rule: + | Index | image_name | Caption |\n + |:-----:|:----------:|:--------------------:|\n + | 0 | pippo.jpg | ["i","like","pizza"] |\n + + Raises: + ValueError: if the dataset directory is invalid (Not Exist, Not a directory). + """ + + # If the constructor receive a dataframe, we assume that it is already manipulated for doing our operation, no further op. needed. + if already_computed_dataframe is not None: + self.directory_of_data = directory_of_data + self._dataset = already_computed_dataframe + return + + # Input checking + if not os.path.exists(directory_of_data): + raise ValueError(f"{directory_of_data} not Exist!") + if not os.path.isdir(directory_of_data): + raise ValueError(f"{directory_of_data} is not a directory!") + + self.directory_of_data = directory_of_data + + # Load the dataset + _temp_dataset: pd.DataFrame = pd.read_csv(f"{directory_of_data}/{CAPTION_FILE_NAME}", sep="|", skipinitialspace=True)[["image_name","comment"]] + + # Split every caption in its words. + _temp_dataset["comment"] = _temp_dataset["comment"].apply( lambda comment: re.findall("[\\w]+|\.|\,",str(comment).lower())) + + # Filter for retrieve only caption with a length less than MAX_CAPTION_LENGTH length + _temp_dataset = _temp_dataset[ _temp_dataset["comment"].map(len) <= MAX_CAPTION_LENGTH] + + # Pick only a given percentage of the row in the dataset + self._dataset = _temp_dataset.head(int(len(_temp_dataset)*(percentage/100))) + + def get_fraction_of_dataset(self, percentage: int, delete_transfered_from_source: bool = False): + """Get a fraction of the dataset + + Args: + percentage (int): + The percentage of row that we want store in our new object. + + delete_transfered_from_source (bool, optional): Defaults to False. + Tell if you want to delete the row in the source object that are transfered to the new object. + + Returns: + (MyDataset): + The new computed dataset object. + """ + # Retrieve the number of rows + _temp_df_moved: pd.DataFrame = self._dataset.head(int(len(self._dataset)*(percentage/100))).sample(frac=1) + + # Deep copy of the dataframe + _temp_df_copy = _temp_df_moved.copy() + + # If delete_transfered_from_source == True delete the rows in the source object. + if delete_transfered_from_source: + self._dataset: pd.DataFrame = self._dataset.drop(_temp_df_copy.index) + + # Return a fresh MyDataset object. 
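+        # The new dataset shares directory_of_data with the source but owns a copy of the
+        # selected rows, so later drops on either object do not affect the other.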
+ return MyDataset(directory_of_data=self.directory_of_data, already_computed_dataframe=_temp_df_copy) + + def get_all_distinct_words_in_dataset(self) -> List[str]: + """Return all the words in each caption of the dataset as a big list of strings (No Repetition). + + Returns: + (List[str]): All the words in the dataset. + """ + words = [] + # Iterate over each sample in the dataset. + for idx,row in self._dataset.iterrows(): + for word in row["comment"]: + if word not in words: + words.append(word) + return words + + def __len__(self) -> int: + """Evaluate the length of the dataset. + The length is the number of rows in the dataset. + + Returns: + int: The legth of the dataset. + + """ + return self._dataset.shape[0] + + def __getitem__(self, idx: int) -> Tuple[Image.Image, List[str]]: + """Get the associated image and caption of a given index. + + Args: + idx (int): + The index associated univocally to a row of the dataset. + + Returns: + (Tuple[Image.Image, List[str]]): + Image and caption of the input index. + """ + image: Image.Image = Image.open(f"{self.directory_of_data}/{IMAGES_SUBDIRECTORY_NAME}/{self._dataset.iloc[idx]['image_name']}").convert('RGB') + caption: List[str] = self._dataset.iloc[idx]["comment"] + + return image, caption + + # For python > 3.9 -> def pack_minibatch_training(self, data: List[Tuple[Image.Image, List[str]]], vocabulary: Vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def pack_minibatch_training(self, data: List[Tuple[Image.Image, List[str]]], vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Custom method for packing a mini-batch for training. + + Args: + data (List[Tuple[image.Image, List[str]]]): + A list of tuples coming from the calls of the __getitem__ method. + + vocabulary (Vocabulary): + Vocabulary associated to the dataset. + + Returns: + (Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor + ]): [`(batch_dim, channels, height, width)`, `(batch_dim,min(MAX_CAPTION_LENGTH,captions[0]))`, `(batch_dim)`] + + Tuple[0]: The images of the mini-batch converted to Tensor. + Tuple[1]: The caption of each image the mini-batch, the dim 2 depends on the maximum caption length inside the batch. + Tuple[2]: The length of each caption +2 for and token. + """ + # Sort the data list by caption length (descending order). + data.sort(key=lambda x: len(x[1]), reverse=True) + + images, captions = zip(*data) + + # Type annotation for zip extraction, no clear way to determine type with this kind of built-in method in a pythonic way. + images: List[Image.Image] = images + captions: List[List[str]] = captions + + # Trasnform the images from PIL.Image into a pytorch.Tensor + operations = transforms.Compose([ + transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"],MyDataset.image_trasformation_parameter["crop"]["size"])), # Crop a random portion of image and resize it to a given size. + transforms.RandomHorizontalFlip(p=0.3), # Horizontally flip the given image randomly with a given probability. + transforms.ToTensor(), # Convert a PIL Image or numpy.ndarray to tensor. (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"]), + ]) + images = list(map(lambda image: operations(image),list(images))) # Out: List[(channels, height, width)] + # Merge images (from list of 3D tensor to a tensor). 
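+        # With the 224x224 resize above, the stacked tensor has shape (batch_dim, 3, 224, 224).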
+        images = torch.stack(images, 0) # Out: (batch_dim, channels, height, width)
+
+        # Evaluate the captions.
+        # Q. Why +2?
+        # A. For the <SOS> and <EOS> tokens.
+        captions_length = torch.tensor([len(caption)+2 for caption in captions]) # Out: (batch_dim)
+
+        # From words to vocabulary ids, adding the <SOS> id at the beginning and the <EOS> id at the end.
+        captions = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        # Pad the captions with zeros, i.e. the <PAD> id.
+        captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) # Out: (batch_dim, min(MAX_CAPTION_LENGTH, captions[0]))
+
+        return images, captions.type(torch.LongTensor), captions_length.type(torch.int32)
+
+    # For python > 3.9 -> def pack_minibatch_evaluation(self, data: List[Tuple[Image.Image, List[str]]], vocabulary: Vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def pack_minibatch_evaluation(self, data: List[Tuple[Image.Image, List[str]]], vocabulary) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Custom method for packing a mini-batch for evaluation.
+
+        Args:
+            data (List[Tuple[Image.Image, List[str]]]):
+                A list of tuples coming from the calls of the __getitem__ method.
+
+            vocabulary (Vocabulary):
+                Vocabulary associated to the dataset.
+
+        Returns:
+            (Tuple[
+                torch.Tensor,
+                torch.Tensor,
+                torch.Tensor
+            ]): [`(batch_dim, channels, height, width)`, `(batch_dim, min(MAX_CAPTION_LENGTH, captions[0]))`, `(batch_dim)`]
+
+                Tuple[0]: The images of the mini-batch converted to Tensor.
+                Tuple[1]: The caption of each image in the mini-batch; dim 2 depends on the maximum caption length inside the batch.
+                Tuple[2]: The length of each caption, +2 for the <SOS> and <EOS> tokens.
+        """
+        # Sort the data list by caption length (descending order).
+        data.sort(key=lambda x: len(x[1]), reverse=True)
+
+        images, captions = zip(*data)
+
+        # Type annotation for zip extraction, since there is no clear way to infer the type of this built-in in a pythonic way.
+        images: List[Image.Image] = images
+        captions: List[List[str]] = captions
+
+        # Transform the images from PIL.Image into torch.Tensor
+        operations = transforms.Compose([
+            transforms.Resize((MyDataset.image_trasformation_parameter["crop"]["size"], MyDataset.image_trasformation_parameter["crop"]["size"])), # Resize the given image.
+            transforms.ToTensor(),
+            transforms.Normalize(mean=MyDataset.image_trasformation_parameter["mean"], std=MyDataset.image_trasformation_parameter["std_dev"])
+        ])
+
+        images = list(map(lambda image: operations(image), list(images))) # Out: List[(channels, height, width)]
+        # Merge images (from list of 3D tensors to a single tensor).
+        images = torch.stack(images, 0) # Out: (batch_dim, channels, height, width)
+
+        # Evaluate the captions.
+        # Q. Why +2?
+        # A. For the <SOS> and <EOS> tokens.
+        captions_length = torch.tensor([len(caption)+2 for caption in captions]) # Out: (batch_dim)
+
+        # From words to vocabulary ids, adding the <SOS> id at the beginning and the <EOS> id at the end.
+        captions = [vocabulary.translate(caption,"complete") for caption in captions]
+
+        # Pad the captions with zeros, i.e. the <PAD> id.
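+        # pad_sequence pads every caption up to the longest caption in this mini-batch;
+        # since the data is sorted by caption length, that is len(captions[0]) + 2.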
+ captions = nn.utils.rnn.pad_sequence(captions, padding_value=0, batch_first=True) # Out: (batch_dim,min(MAX_CAPTION_LENGTH,captions[0])) + + + return images, captions.type(torch.LongTensor), captions_length.type(torch.int32) + \ No newline at end of file diff --git a/NeuralModels/Decoder/IDecoder.py b/NeuralModels/Decoder/IDecoder.py new file mode 100644 index 0000000..1ff83da --- /dev/null +++ b/NeuralModels/Decoder/IDecoder.py @@ -0,0 +1,79 @@ +##### INTERFACE CLASS DON'T USE IT (You at most use only as Type Hint), JUST READ IT. +################################################################ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class IDecoder(nn.Module): + """ + Class interface for a LSTM unit + Args are intended as suggested. + """ + + def __init__(self, *args): + """Define the interface of a generic constructor for the Decoder Net. + + Args (Suggested): + + hidden_dim (int): + The Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(IDecoder, self).__init__() + + def forward(self, *args) -> Tuple[torch.Tensor, List[int]]: + """Interface for the forward operation of the RNN. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + + caption_length (list(int)): + The length of each caption in the batch. + + Returns: `[(batch_size, max_captions_length, vocab_size), list(int)]` + + (torch.Tensor): The hidden state of each time step from t_1 to t_N. + + (list(int)): The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + pass + + def generate_caption(self, *args) -> torch.Tensor: + """ Interface for generate a caption + + Args (Suggested): + + images (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + pass \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvH.py b/NeuralModels/Decoder/RNetvH.py new file mode 100644 index 0000000..5724a79 --- /dev/null +++ b/NeuralModels/Decoder/RNetvH.py @@ -0,0 +1,154 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvH(nn.Module): + """ + Class implementing LSTM unit with Hidden state initialized with custom features vector and Cell state initialized with ZEROS. + """ + + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. 
+ + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvH, self).__init__() + + print(f"Construction of RNetvH:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + REMARK Each caption is in the full form: + .... + + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + # Check if encoder_dim and self.hidden_dim are equal, assert by construction + if images.shape[1] != self.hidden_dim: + raise ValueError("The dimensionality of the encoder output is not equal to the dimensionality of the hidden state.") + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, embedding_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: (batch_dim, max_captions_length, embedding_dim) -> Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = ( images, torch.zeros((captions.shape[0],self.hidden_dim)).to(self.device)) # In: ((batch_dim, hidden_dim),(batch_dim, hidden_dim)) -> Out ((batch_dim, hidden_dim), (batch_dim, hidden_dim)) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? 
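# --- Illustrative sketch, not from the repository: the unrolled LSTMCell pattern used in the loop below. ---
# Minimal standalone example with made-up sizes: at each time step the cell receives the embedding
# of the current word plus the previous (h, c), and a linear layer maps h to vocabulary-sized logits.
import torch
import torch.nn as nn

batch_dim, steps, embedding_dim, hidden_dim, vocab_size = 2, 5, 8, 16, 30
cell = nn.LSTMCell(embedding_dim, hidden_dim)
to_vocab = nn.Linear(hidden_dim, vocab_size)

inputs = torch.randn(batch_dim, steps, embedding_dim)   # embedded captions
h = torch.randn(batch_dim, hidden_dim)                  # e.g. the image features (RNetvH style)
c = torch.zeros(batch_dim, hidden_dim)
logits_per_step = []
for t in range(steps):
    h, c = cell(inputs[:, t, :], (h, c))                # one time step
    logits_per_step.append(to_vocab(h))                 # (batch_dim, vocab_size)
outputs = torch.stack(logits_per_step, dim=1)           # (batch_dim, steps, vocab_size)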
+ # For each time step t \in {0, N-1}, where N is the caption length + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embedding_dim) + with torch.no_grad(): + _h ,_c = ( image, torch.zeros((1,self.hidden_dim)).to(self.device)) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # Out : ((1, 1, hidden_dim) , (1, 1, hidden_dim)) + outputs = self.linear_1(_h) # Out: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # Out: (1, embeddings_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvHC.py b/NeuralModels/Decoder/RNetvHC.py new file mode 100644 index 0000000..449434a --- /dev/null +++ b/NeuralModels/Decoder/RNetvHC.py @@ -0,0 +1,152 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvHC(nn.Module): + """ + Class implementing LSTM unit with Cell and Hidden state initialized with custom features vector + """ + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvHC, self).__init__() + + print(f"Construction of RNetvH:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. 
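# --- Illustrative sketch, not from the repository: what padding_idx does in nn.Embedding. ---
# With padding_idx set, the embedding row reserved for the padding id stays at zero and
# receives no gradient, so padded positions contribute nothing to the loss.
import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=10, embedding_dim=4, padding_idx=0)
ids = torch.tensor([[1, 5, 0, 0]])   # a caption right-padded with id 0
vectors = emb(ids)                   # shape (1, 4, 4); vectors[0, 2:] are all zeros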
+ self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + # Check if encoder_dim and self.hidden_dim are equal, assert by construction + if images.shape[1] != self.hidden_dim: + raise ValueError("The dimensionality of the encoder output is not equal to the dimensionality of the hidden state.") + + # Retrieve batch size + batch_dim = images.shape[0] + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: (batch_dim, max_captions_length, embedding_dim) -> Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = (images, images) # In: ((batch_dim, hidden_dim),(batch_dim, hidden_dim)) -> Out ((batch_dim, hidden_dim), (batch_dim, hidden_dim)) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? 
+ # For each time step t \in {0, N-1}, where N is the caption length + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. + """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embedding_dim) + with torch.no_grad(): + _h ,_c = (image,image) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # Out : ((1, hidden_dim) , (1, hidden_dim)) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # Out: (1, embedding_dim) + input = input.to(torch.device(self.device)) # Out: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvHCAttention.py b/NeuralModels/Decoder/RNetvHCAttention.py new file mode 100644 index 0000000..f2786f9 --- /dev/null +++ b/NeuralModels/Decoder/RNetvHCAttention.py @@ -0,0 +1,200 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List +from ..Attention.IAttention import IAttention + + +class RNetvHCAttention(nn.Module): + """ + Class implementing LSTM unit with Attention model + """ + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu", attention: IAttention = None): + """Define the constructor for the RNN Net + + Args: + hidden_dim (int): + The Capacity of the LSTM Cell. + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + vocab_size (int)): + The size of the vocabulary associated to the dataset. + embedding_size (int): + The number of dimension associated to the input of the LSTM cell. + device (str, optional): Default "cpu" + The device on which the operations will be performed. 
+ """ + super(RNetvHCAttention, self).__init__() + + print(f"Construction of RNetvHCAttention:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Attention Dimension: {attention.attention_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + # Embedding layer that turns words into a vector of a specified size + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + self.attention = attention + + self.attention_dim = attention.attention_dim + + self.encoder_dim = attention.encoder_dim + + self.vocab_size = vocab_size + + self.hidden_dim = hidden_dim + + # The initial memory state and hidden state of the LSTM are predicted by an average of the annotation vectors fed through two separate MLPs (init,c and init,h): + self.h_0 = nn.Linear(self.encoder_dim, hidden_dim) + self.c_0 = nn.Linear(self.encoder_dim, hidden_dim) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(self.encoder_dim + embedding_dim, hidden_dim) + + + # The linear layer that maps the hidden state output dimension + # to the number of words we want as output, vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + # the soft attention model predicts a gating scalar β from previous hidden state ht_1 at each time step t + # Par. 4.2.1 + self.f_beta = nn.Linear(hidden_dim, self.encoder_dim) + self.sigmoid = nn.Sigmoid() + + def init_h_0_c_0(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Init hidden and cell state at t_0 + + Args: + images (torch.Tensor): `(batch_dim, H_portions * W_portions, encoder_dim)` + The images coming from the encoder. + + Returns: + (torch.Tensor, torch.Tensor): `[(batch_dim, hidden_dim), (batch_dim, hidden_dim)]` + Hiddent state and cell state ready for the 1st input + """ + images = images.mean(dim=1) # Dim=0 -> batch_dim, Dim=1 -> H_portions * W_portions, Dim=2 -> encoder_dim + return self.h_0(images), self.c_0(images) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int], torch.Tensor]: + """Compute the forward operation of the RNN. + input of the LSTM cell for each time step: + t_{-1}: NONE + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args: + images (torch.Tensor): `(batch_dim, H_portions, W_portions, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each element of the batch. + REMARK Each caption is in the full form: + .... + + REMARK The Tensor is padded with zeros + + caption_length ([int]): + The length of each caption in the batch. + + Returns: + (torch.Tensor): `(batch_dim, max_captions_length, vocab_size)` + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the inputs of the LSTM. + + (torch.Tensor): `(batch_dim, max_captions_length, alphas)` + All the alphas evaluated over timestep t (from t_0 to t_{N-1}), for each image in the batch. 
+ """ + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, H_portions, W_portions, encoder_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: Out: (batch_dim, captions length, embedding_dim) + + + # Initialize the hidden state and the cell state at time t_{-1} + images = images.reshape(batch_dim,-1, images.shape[3]) # Out: (batch_dim, H_portions * W_portions, encoder_dim) + _h, _c = self.init_h_0_c_0(images) # _h : (batch_dim, hidden_dim), _c : (batch_dim, hidden_dim) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size).unsqueeze(0) + start[0][1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Tensor for storing alphas at each timestep t, structure (batch_dim, MaxN, number_of_splits^2) -> number_of_splits intended for a single Measure like Heigth and assuming square images + alphas_t = torch.zeros((batch_dim,inputs.shape[1],self.attention.number_of_splits**2)).to(self.device) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + + for idx in range(0,inputs.shape[1]): + attention_encoding, alphas_t_i = self.attention(images, _h) # Out: attention_encoding->(batch_dim,encoder_dim), alphas_t_i->(batch_dim, number_of_splits) + gate = self.sigmoid(self.f_beta(_h)) # IN: (batch_dim, hidden_dim) -> Out: (batch_dim, encoder_dim) + attention_encoding = gate * attention_encoding # Gating z_t + alphas_t[:,idx,:] = alphas_t_i + _h, _c = self.lstm_unit(torch.cat([inputs[:,idx,:], attention_encoding], dim=1), (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch + + return outputs, list(map(lambda length: length-1, captions_length)),alphas_t + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector retrieved by the encoder, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, H_portions, W_portions, encoder_dim)` + The image. + + captions_length (int): + The length of the caption. + + Returns: + + (torch.Tensor): + The caption associated to the image given. + It includes at t_0 by default. 
+ + (torch.Tensor): + The alphas evaluated at each time t + + """ + + sampled_ids = [torch.Tensor([1]).type(torch.int64).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) + alphas = torch.zeros(captions_length, self.attention.number_of_splits **2) # Out: (MaxCaptionLength, number_of_splits) + with torch.no_grad(): + image = image.reshape(1,-1, image.shape[3]) # Out: (1, H_portions * W_portions, encoder_dim) + _h, _c = self.init_h_0_c_0(image) + for idx in range(captions_length-1): + attention_encoding, alphas[idx,:] = self.attention(image, _h) + gate = self.sigmoid(self.f_beta(_h)) # IN: (1, hidden_dim) -> Out: (1, encoder_dim) + attention_encoding = gate * attention_encoding # Gating z_t + _h, _c = self.lstm_unit(torch.cat([input,attention_encoding], dim=1), (_h ,_c)) # _h: (1, hidden_dim) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # In: (1, embedding_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids, alphas \ No newline at end of file diff --git a/NeuralModels/Decoder/RNetvI.py b/NeuralModels/Decoder/RNetvI.py new file mode 100644 index 0000000..786ae80 --- /dev/null +++ b/NeuralModels/Decoder/RNetvI.py @@ -0,0 +1,152 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple,List + +class RNetvI(nn.Module): + """ + Class implementing LSTM unit with Cell and Hidden state initialized at ZEROS and features coming from external as 1st input + """ + + def __init__(self, hidden_dim: int, padding_index: int, vocab_size: int, embedding_dim: int, device: str = "cpu"): + """Define the constructor for the RNN Net + + Args: + + hidden_dim (int): + Capacity of the LSTM Cell. + + padding_index (int): + The index of the padding id, given from the vocabulary associated to the dataset. + + vocab_size (int)): + The size of the vocabulary associated to the dataset. + + embedding_dim (int): + The number of features associated to a word. + + device (str, optional): Default "cpu" + The device on which the operations will be performed. + """ + super(RNetvI, self).__init__() + + print(f"Construction of RNetvI:\n \ + LSTM Capacity: {hidden_dim},\n \ + Padding Index: {padding_index},\n \ + Vocabulary Size: {vocab_size},\n \ + Embedding dimension: {embedding_dim},\n \ + Device: {device}") + + self.device = torch.device(device) + self.hidden_dim = hidden_dim + self.vocab_size = vocab_size + # Embedding layer that turns words into a vector. + self.words_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index) + + # The LSTM takes embedded word vectors (of a specified size) as input + # and outputs hidden states of size hidden_dim + self.lstm_unit = torch.nn.LSTMCell(embedding_dim, hidden_dim) + + # The linear layer that maps the hidden state + # to the number of words we want as output = vocab_size + self.linear_1 = nn.Linear(hidden_dim, vocab_size) + + + def forward(self, images: torch.Tensor, captions: torch.Tensor, captions_length: List[int]) -> Tuple[torch.Tensor, List[int]]: + """Compute the forward operation of the RNN. 
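# --- Illustrative sketch, not from the repository: the greedy decoding rule used in generate_caption. ---
# Toy logits; softmax followed by max(1) returns (probabilities, ids), and decoding stops as soon
# as the predicted id equals the end-of-string id (2 in this vocabulary).
import torch
import torch.nn.functional as F

logits = torch.tensor([[0.1, 0.2, 3.0, 0.05]])      # (1, vocab_size), toy values
probs, predicted = F.softmax(logits, dim=1).max(1)  # predicted -> tensor([2])
done = bool(predicted.item() == 2)                  # True: the end-of-string id was produced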
+ input of the LSTM cell for each time step: + t_{-1}: features vector + t_0: Deterministict + . + . + . + t_{N-1}: The embedding vector associated to the S_{N-1} id. + + Args (Suggested): + + images (torch.Tensor): `(batch_dim, encoder_dim)` + The features associated to each image of the batch. + + captions (torch.Tensor): `(batch_dim, max_captions_length, embedding_dim)` + The caption associated to each image of the batch. + _REMARK Each caption is in the full form: + .... + _ + REMARK The Tensor is padded with zeros + + caption_length (List(int)): + The length of each caption in the batch. + + Returns: `[(batch_dim, max_captions_length, vocab_size), List(int)]` + + (torch.Tensor): + The output of LSTM for each time step from t_1 to t_N, + at t_0 + REMARK is the 1st element in the output caption for each element in batch. + + (List(int)): + The length of each decoded caption. + REMARK The is provided as input at t_0. + REMARK The token will be removed from the input of the LSTM. + """ + + # Retrieve batch size + batch_dim = images.shape[0] # images is of shape (batch_dim, embedding_dim) + + # Create embedded word vector for each word in the captions + inputs = self.words_embedding(captions) # In: Out: (batch_dim, captions length, embedding_dim) + + # Initialize the hidden state and the cell state at time t_{-1} + _h, _c = self.lstm_unit(images) # _h : (batch_dim, hidden_dim), _c : (batch_dim, hidden_dim) + + # Deterministict Output as first word of the caption t_{0} + start = torch.zeros(self.vocab_size) + start[1] = 1 + start = start.to(self.device) # Out: (1, vocab_size) + + # Bulk insert of to all the elements of the batch + outputs = start.repeat(batch_dim,1,1).to(self.device) # Out: (batch_dim, 1, vocab_size) + + # Feed LSTMCell with image features and retrieve the state + + # How it works the loop? + # For each time step t \in {0, N-1}, where N is the caption length + + + for idx in range(0,inputs.shape[1]): + _h, _c = self.lstm_unit(inputs[:,idx,:], (_h,_c)) # inputs[:,idx,:]: for all the captions in the batch, pick the embedding vector of the idx-th word in all the captions + _outputs = self.linear_1(_h) # In: (batch_dim, hidden_dim), Out: (batch_dim, vocab_size) + outputs = torch.cat((outputs,_outputs.unsqueeze(1)),dim=1) # Append in dim `1` the output of the LSTMCell for all the elements in batch + + return outputs, list(map(lambda length: length-1, captions_length)) + + def generate_caption(self, image: torch.Tensor, captions_length: int) -> torch.Tensor: + """Given the features vector of the image, perform a decoding (Generate a caption) + + Args: + + image (torch.Tensor): `(1, encoder_dim)` + The features associated to the image. + + max_caption_length (int): + The maximum ammisible length of the caption. + + Returns: + + (torch.Tensor): `(1, )` + The caption associated to the image given. + REMARK It includes at t_0 by default. 
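# --- Illustrative note, not from the repository: calling an LSTMCell without a state. ---
# When no (h, c) pair is passed, PyTorch initialises both to zeros, which is why the "vI" variant
# can feed the image features as the very first input and still start from a zero state.
import torch
import torch.nn as nn

cell = nn.LSTMCell(16, 32)
features = torch.randn(4, 16)   # (batch_dim, encoder_dim == input size of the cell)
h, c = cell(features)           # equivalent to cell(features, (zeros, zeros))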
+ """ + + sampled_ids = [torch.tensor([1]).to(self.device)] # Hardcoded + input = self.words_embedding(torch.LongTensor([1]).to(torch.device(self.device))).reshape((1,-1)) # Out: (1, embeddings_dim) + with torch.no_grad(): + _h ,_c = self.lstm_unit(image) + for _ in range(captions_length-1): + _h, _c = self.lstm_unit(input, (_h ,_c)) # _h: (1, 1, hidden_dim) + outputs = self.linear_1(_h) # outputs: (1, vocab_size) + _ , predicted = F.softmax(outputs,dim=1).cuda().max(1) if self.device.type == "cuda" else F.softmax(outputs,dim=1).max(1) # predicted: The predicted id + sampled_ids.append(predicted) + input = self.words_embedding(predicted) # inputs: (1, embedding_dim) + input = input.to(torch.device(self.device)) # In: (1, embedding_dim) + if predicted == 2: + break + sampled_ids = torch.stack(sampled_ids, 1) # sampled_ids: (1, captions_length) + return sampled_ids diff --git a/NeuralModels/Encoder/CResNet50.py b/NeuralModels/Encoder/CResNet50.py new file mode 100644 index 0000000..09ffdf9 --- /dev/null +++ b/NeuralModels/Encoder/CResNet50.py @@ -0,0 +1,55 @@ +import torch.nn as nn +import torch +import torchvision.models as models + +class CResNet50(nn.Module): + """ + Encoder Built with a resnet50 with the last layer removed. + """ + + def __init__(self, encoder_dim: int, device: str = "cpu"): + """Constructor of the Encoder + + Args: + encoder_dim (int): + The dimensionality of the features vector extracted from the image + + device (str, optional): Default "cpu". + The device on which the operations will be performed. + """ + super(CResNet50, self).__init__() + + self.encoder_dim = encoder_dim + self.device = torch.device(device) + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): # Freezing weights + param.requires_grad_(False) + + print(f"Construction of CResNet50:\n \ + Encoder dimension: {encoder_dim},\n \ + Device: {device}") + + modules = list(resnet.children())[:-1] # remove last fc layer, expose the GlobalAveragePooling + self.resnet = nn.Sequential(*modules) + + self.linear = nn.Linear(resnet.fc.in_features, encoder_dim) # define a last fc layer + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """Forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, channels, heigth, width)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, encoder_dim)` + Features Projection for each image in the batch. + + """ + + features = self.resnet(images) # Out: (batch_dim, 2048, 1, 1), 2048 is a Design choice of ResNet50 of last conv.layer. + + features = features.reshape(features.size(0), -1).to(self.device) + features = self.linear(features) # In: (batch_dim, 2048) + + return features \ No newline at end of file diff --git a/NeuralModels/Encoder/CResNet50Attention.py b/NeuralModels/Encoder/CResNet50Attention.py new file mode 100644 index 0000000..06949af --- /dev/null +++ b/NeuralModels/Encoder/CResNet50Attention.py @@ -0,0 +1,57 @@ +import torch.nn as nn +import torch +import torchvision.models as models + +class CResNet50Attention(nn.Module): + def __init__(self, encoder_dim: int, number_of_splits: int = 7, device: str = "cpu"): + """Constructor of the Encoder NN + + Args: + encoder_dim (int): Unused. Internally resnet with 2 layers removed represent each pixel in a vector of 2048 components. + + number_of_splits (int): + How many pieces do you want to split the images, for border. + Examples: + number_of_splits = 7 -> The images will be splitted into 49 pieces (7x7). + + device (str, optional): Default "cpu". 
+ The device on which the operations will be performed. + """ + super(CResNet50Attention, self).__init__() + + self.device = torch.device(device) + resnet = models.resnet50(pretrained=True) + for param in resnet.parameters(): # Freezing weights + param.requires_grad_(False) + + modules = list(resnet.children())[:-2] # Expose the last convolutional layer. 2048 Filters of size 1x1. Output of the ConvLayer -> (H_in/32,W_in/32,2048) + + self.encoder_dim = 2048 + + self.number_of_splits = number_of_splits + + print(f"Construction of CResNet50Attention:\n \ + Encoder dimension: {self.encoder_dim},\n \ + Number of splits: {number_of_splits**2},\n \ + Device: {device}") + # Q. Why (H_in/32, W_in/32) + # A. Due to the resnet50 implementation, each convolutional layer will reduce the dimensionality of Heigth and Width by 2 times. + + self.resnet = nn.Sequential(*modules) + + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """Forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, Channels, Width, Height)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, H_splits, W_splits, encoder_dim)` + Features Projection Tensor + """ + + features = self.resnet(images) # Out: (batch_dim, 2048,Heigth/32, Width/32) + features = features.permute(0, 2, 3, 1) # (batch_dim, H_splits, W_splits, 2048) + return features \ No newline at end of file diff --git a/NeuralModels/Encoder/IEncoder.py b/NeuralModels/Encoder/IEncoder.py new file mode 100644 index 0000000..745af14 --- /dev/null +++ b/NeuralModels/Encoder/IEncoder.py @@ -0,0 +1,35 @@ +##### INTERFACE CLASS DON'T USE IT (You at most use only as Type Hint), JUST READ IT. +################################################################ + +import torch.nn as nn +import torch +import torchvision.models as models + +class IEncoder(nn.Module): + """ + Interface for a generic Encoder + """ + def __init__(self, *args): + """Constructor of the Encoder NN + + Args: + encoder_dim (int): + The dimensionality of the features vector extracted from the image + + device (str, optional): Default "cpu". + The device on which the operations will be performed. + """ + super(IEncoder, self).__init__() + + def forward(self, *args) -> torch.Tensor: + """Interface of forward operation of the nn + + Args: + images (torch.tensor): `(batch_dim, channels, heigth, width)` + The tensor of the images. + + Returns: + [torch.tensor]: `(batch_dim, encoder_dim)` + Features Projection for each image in the batch. + """ + pass \ No newline at end of file diff --git a/NeuralModels/FactoryModels.py b/NeuralModels/FactoryModels.py new file mode 100644 index 0000000..f39d9e0 --- /dev/null +++ b/NeuralModels/FactoryModels.py @@ -0,0 +1,183 @@ +from .Encoder.CResNet50 import CResNet50 +from .Encoder.CResNet50Attention import CResNet50Attention +from .Decoder.RNetvHC import RNetvHC +from .Decoder.RNetvI import RNetvI +from .Decoder.RNetvH import RNetvH +from .Decoder.RNetvHCAttention import RNetvHCAttention +from .CaRNet import CaRNet +from .Attention.SoftAttention import SoftAttention +from enum import Enum + +# Open source is a development methodology; free software is a social movement. +# - Richard Stallman + +####### How to continue implementation? +## Everyone is free of enrich this library, remember to follow the IInterface.py for each type of Elements +## At the end of your code session, add your element to the factory. +## Follow the rule: + +class Attention(Enum): + """ + Attention type list. 
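# --- Illustrative sketch, not from the repository: shapes after stripping ResNet-50 layers, as in the two encoders above. ---
# Dropping only the final fc layer keeps the global average pooling (one 2048-d vector per image),
# while dropping the last two layers exposes the final feature map (a 7x7 grid of 2048-d region
# vectors for a 224x224 input).
import torch
import torch.nn as nn
import torchvision.models as models

resnet = models.resnet50(pretrained=True)
pooled = nn.Sequential(*list(resnet.children())[:-1]).eval()   # as in CResNet50
grid = nn.Sequential(*list(resnet.children())[:-2]).eval()     # as in CResNet50Attention

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    print(pooled(x).shape)   # torch.Size([1, 2048, 1, 1])
    print(grid(x).shape)     # torch.Size([1, 2048, 7, 7])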
+ """ + Attention = "Attention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Attention[s] + except KeyError: + return s + +def FactoryAttention(attention: Attention): + """ Attention Factory + + Args: + attention (Attention): + The expected attention to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IAttetion): + A Class reference + """ + if attention == Attention.Attention: + return SoftAttention + raise NotImplementedError("This attention model is not implemented yet") + +#################################################################### + +class Encoder(Enum): + """ + Encoder type list. + """ + CResNet50 = "CResNet50" + CResNet50Attention = "CResNet50Attention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Encoder[s] + except KeyError: + return s + +def FactoryEncoder(encoder: Encoder): + """ Encoder Factory + + Args: + encoder (Encoder): + The expected encoder to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IEncoder): + A Class reference + """ + if encoder == Encoder.CResNet50: + return CResNet50 + if encoder == Encoder.CResNet50Attention: + return CResNet50Attention + raise NotImplementedError("This encoder is not implemented yet") + +#################################################################### + +class Decoder(Enum): + """ + Decoder type list. + """ + RNetvI = "RNetvI" + RNetvH = "RNetvH" + RNetvHC = "RNetvHC" + RNetvHCAttention = "RNetvHCAttention" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return Decoder[s] + except KeyError: + return s + +def FactoryDecoder(decoder: Decoder): + """ Decoder Factory + + Args: + decoder (Decoder): + The expected decoder to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. + + Returns: + (IDecoder): + A Class reference + """ + if decoder == decoder.RNetvI: + return RNetvI + if decoder == decoder.RNetvH: + return RNetvH + if decoder == decoder.RNetvHC: + return RNetvHC + if decoder == decoder.RNetvHCAttention: + return RNetvHCAttention + raise NotImplementedError("This decoder is not implemented yet") + +##################################################################### + +class NeuralNet(Enum): + """ + NeuralNet type list. + """ + CaRNet = "CaRNet" + + def __str__(self): + return self.name + + def __repr__(self): + return str(self) + + @staticmethod + def argparse(s): + try: + return NeuralNet[s] + except KeyError: + return s + +def FactoryNeuralNet(net: NeuralNet): + """ NeuralNet Factory + + Args: + net (NeuralNet): + The expected neural net to produce + + Raises: + NotImplementedError: Raise when external ask for an implementation that is not covered yet. 
+ + Returns: + (NeuralNet): + A Class reference + """ + if net == NeuralNet.CaRNet: + return CaRNet + raise NotImplementedError("This neural net is not implemented yet") \ No newline at end of file diff --git a/NeuralModels/Metrics.py b/NeuralModels/Metrics.py new file mode 100644 index 0000000..32b7185 --- /dev/null +++ b/NeuralModels/Metrics.py @@ -0,0 +1,58 @@ +import pandas as pd + +class Result(): + """ + Class for storing result of the net + """ + def __init__(self): + """Constructor of the class + """ + self.train_results = pd.DataFrame([], columns=["Epoch", "IDBatch", "Loss", "Accuracy"]) + convert_dict = {'Epoch': int, + "IDBatch": int, + "Loss":float, + 'Accuracy': float + } + self.train_results = self.train_results.astype(convert_dict) + + convert_dict = {'Epoch': int, + 'Accuracy': float + } + self.validation_results = pd.DataFrame([], columns=["Epoch", "Accuracy"]) + self.validation_results = self.validation_results.astype(convert_dict) + + def add_train_info(self, epoch: int, batch_id: int, loss: float, accuracy: float): + """Add a row to the dataframe of the training set info + + Args: + epoch (int): + The epoch. + batch_id (int): + The id of the batch. + loss (float): + Loss of the given epoch-batch. + accuracy (float): + Accuracy of the given epoch-batch. + """ + self.train_results = self.train_results.append({"Epoch":epoch,"IDBatch": batch_id, "Loss": loss, "Accuracy":accuracy}, ignore_index=True) + + def add_validation_info(self, epoch: int, accuracy: float): + """Add a row to the dataframe of the validation set info + + Args: + epoch (int): + The epoch. + accuracy (float): + Accuracy of the given epoch-batch. + """ + self.validation_results = self.validation_results.append({"Epoch":epoch, "Accuracy":accuracy}, ignore_index=True) + + def flush(self, directory: str = "."): + """Flush the dataframes to non-volatile memory in a csv format + + Args: + directory (str, optional): Defaults to ".". + The directory to store the files as csv. + """ + self.train_results.to_csv(f'{directory}/train_results.csv', encoding='utf-8', index=False) + self.validation_results.to_csv(f'{directory}/validation_results.csv', encoding='utf-8', index=False) \ No newline at end of file diff --git a/NeuralModels/Vocabulary.py b/NeuralModels/Vocabulary.py new file mode 100644 index 0000000..df41b16 --- /dev/null +++ b/NeuralModels/Vocabulary.py @@ -0,0 +1,170 @@ +# Typing trick for avoid circular import dependencies valid for python > 3.9 +# from __future__ import annotations +# from typing import TYPE_CHECKING +# if TYPE_CHECKING: +# from .Dataset import MyDataset + +import torch +from typing import List +import os +import pickle + +class Vocabulary(): + """ + Implementation of the vocabulary. 
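# --- Illustrative note, not from the repository: DataFrame.append in the Result class above assumes an older pandas. ---
# pandas.DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; an equivalent,
# version-proof way to add one row is pd.concat with a single-row frame.
import pandas as pd

results = pd.DataFrame([], columns=["Epoch", "IDBatch", "Loss", "Accuracy"])
row = {"Epoch": 1, "IDBatch": 0, "Loss": 2.31, "Accuracy": 0.12}
results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)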
+ + Assumption: + + 1) The vocabulary is enriched with 4 special words:\n + : Padding ------> ID: 0\n + : Start Of String ------> ID: 1\n + : End Of String ------> ID: 2\n + : Out of vocabulary word ------> ID: 3\n + + Example: I Love Pizza -> Translate into ids -> 1 243 5343 645655 2 0 0 + """ + + + def __init__(self, source_dataset = None): # for python > 3.9 -> def __init__(self, source_dataset: MyDataset): + """Vocabulary constructor + + Args: + source_dataset (MyDataset): + The source Dataset, if None try to load a vocabulary from the hidden .saved folder + """ + + if source_dataset is None: + print("Try to load the vocabulary from file..") + + if not os.path.exists(".saved/word2id.pickle") or not os.path.exists(".saved/embeddings.pickle"): + raise FileNotFoundError("You request a loading from file but the file doesn't exist, first generate the vocabulary!") + + with open('.saved/word2id.pickle', 'rb') as word2id, open('.saved/embeddings.pickle', 'rb') as embeddings: + self.word2id = pickle.load(word2id) + self.embeddings = pickle.load(embeddings) + self.dictionary_length = len(self.word2id.keys()) + return + + # Load for the 1st time all the possible words from the dataset + dataset_words = source_dataset.get_all_distinct_words_in_dataset() + + # Dictionary length + self.dictionary_length = len(dataset_words)+4 # Dictionary word + 4 Flavored Token (PAD + START + END + UNK) + + self.word2id = {} + self.embeddings = torch.zeros((self.dictionary_length, self.dictionary_length)) # DIM1: dict rows + 4 flavored token (PAD + START + END + UNK) | DIM2: Dict Rows +4 flavored token (PAD + START + END + UNK) as 1-hot + + # Initialize the token: + # , , , + self.word2id[""] = 0 + self.word2id[""] = 1 + self.word2id[""] = 2 + self.word2id[""] = 3 + + counter = 4 + for word in dataset_words: + self.word2id[word] = counter + counter += 1 + + # Identiry matrix == 1-hot vector :) + self.embeddings = torch.eye(self.dictionary_length) + + with open('.saved/word2id.pickle', 'wb') as word2id, open('.saved/embeddings.pickle', 'wb') as embeddings: + pickle.dump(self.word2id, word2id, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(self.embeddings, embeddings, protocol=pickle.HIGHEST_PROTOCOL) + + def predefined_token_idx(self) -> dict: + """Return the predefined token indexes. + + Returns: + dict: The token dictionary + """ + return { + "":0, + "":1, + "":2, + "":3 + } + + def translate(self, word_sequence : List[str], type : str = "complete") -> torch.tensor: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + word_sequence (list(str)): + The sequence of words to translate + + type (str, optional): Default is complete + The type of translation. + + Returns: + (torch.Tensor): `(1,caption_length)` + The caption in IDs form. + `if` complete: <1> + ...Caption... + <2> + `else`: <1> + ...Caption... + """ + + # Initialize the translator + + if type == "uncomplete": + _sequence = torch.zeros(len(word_sequence)+1, dtype=torch.int32) # + ...Caption... + + if type == "complete": + _sequence = torch.zeros(len(word_sequence)+2, dtype=torch.int32) # + ...Caption... 
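# --- Illustrative sketch, not from the repository: 1-hot embeddings via an identity matrix, as used by this Vocabulary. ---
# Toy vocabulary of 6 ids (4 special tokens plus 2 words): row i of torch.eye is the 1-hot vector
# of word id i, so looking up an embedding is just indexing the identity matrix.
import torch

dictionary_length = 6
embeddings = torch.eye(dictionary_length)   # (6, 6)
word_id = 4                                 # first "real" word after the special tokens
one_hot = embeddings[word_id]               # tensor([0., 0., 0., 0., 1., 0.])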
+ + _sequence[-1] = self.word2id[""] + + _sequence[0] = self.word2id[""] + + counter = 1 # Always skip + + # Evaluate all the word into the caption and translate it to an embeddings + for word in word_sequence: + if word.lower() in self.word2id.keys(): + _sequence[counter] = self.word2id[word.lower()] + else: + _sequence[counter] = self.word2id[""] + counter += 1 + + return _sequence + + def rev_translate(self, words_id : torch.tensor) -> List[str]: + """Given a sequence of word, translate into id list according to the vocabulary. + + Args: + words_id (torch.Tensor): `(1,caption_length)` + The sequence of IDs. + Returns: + (List(str)): + The caption in words form. + """ + return [list(self.word2id.keys())[idx] for idx in words_id[:].tolist()] # word_id (1,caption_length) + + + def __len__(self): + """The total of words in this Vocabulary.""" + + return len(self.word2id.keys()) + + +# ---------------------------------------------------------------- +# Usage example + +if __name__ == '__main__': + #Load the vocabulary + pippo = MyDataset(...) + v = Vocabulary(source_dataset=pippo) + # Make a translation + print(v.translate(["I","like","PLay","piano","."])) + + + + + + + + + + + + + + \ No newline at end of file diff --git a/NeuralModel/__init__.py b/NeuralModels/__init__.py similarity index 100% rename from NeuralModel/__init__.py rename to NeuralModels/__init__.py diff --git a/VARIABLE.py b/VARIABLE.py new file mode 100644 index 0000000..6319683 --- /dev/null +++ b/VARIABLE.py @@ -0,0 +1,9 @@ +# Dataset Variable +MAX_CAPTION_LENGTH = 15 +IMAGES_SUBDIRECTORY_NAME = "flickr30k_images" +CAPTION_FILE_NAME = "results.csv" + +# Vocabulary +EMBEDDINGS_REPRESENTATION = "1-HOT" + + diff --git a/attention.png b/attention.png new file mode 100644 index 0000000..8bb154b Binary files /dev/null and b/attention.png differ diff --git a/caption.png b/caption.png new file mode 100644 index 0000000..b963742 Binary files /dev/null and b/caption.png differ diff --git a/img1.png b/img1.png new file mode 100644 index 0000000..e1c93dc Binary files /dev/null and b/img1.png differ diff --git a/main.py b/main.py index e69de29..148768e 100644 --- a/main.py +++ b/main.py @@ -0,0 +1,178 @@ +from torch.utils.data import DataLoader +from NeuralModels.FactoryModels import * +from NeuralModels.Dataset import MyDataset +from NeuralModels.Vocabulary import Vocabulary +import argparse +import sys, os +from PIL import Image + +def parse_command_line_arguments(): + + parser = argparse.ArgumentParser(description='CLI for C[aA]RNet, some static definition are placed in the VARIABLE.py file') + + parser.add_argument('decoder', type=Decoder.argparse, choices=list(Decoder), + help="What type of decoder do you want use?") + + parser.add_argument('mode', choices=['train', 'eval'], + help='train or evaluate C[aA]RNet.') + + parser.add_argument('encoder_dim', type=int, + help = 'Size of the encoder output. IF Attention is True, fixed at 2048. IF CaRNetvI as net, encoder_dim == |vocabulary|.') + + parser.add_argument('hidden_dim', type=int, + help = 'Capacity of the LSTM Cell.') + + parser.add_argument('--attention', default=False, type=bool, + help='Use attention model. IF True, vHCAttention decoder and CResNet50Attention encoder are mandatories. (default: False)') + + + parser.add_argument('--attention_dim', type=int, default=0, + help="The attention capacity. Valid only if attention is true. (default: 0)") + + parser.add_argument('--dataset_folder', type=str, default="./dataset", + help='Data set folder. 
Used only if mode = train (Default: "./dataset")') + + parser.add_argument('--image_path', type=str, default="", + help = "The absolute path of the image that we want to retrieve the caption. Used only if mode = eval (Default: ''") + + parser.add_argument('--splits', type=int, nargs="+", default=[60,30,10], + help='Fraction of data to be used in train set, val set and test set (default: 60 30,10)') + + parser.add_argument('--batch_size', type=int, default=32, + help='mini-batch size (default: 32)') + + parser.add_argument('--epochs', type=int, default=500, + help='number of training epochs (default: 500)') + + parser.add_argument('--lr', type=float, default=1e-3, + help='learning rate (Adam) (default: 1e-3)') + + parser.add_argument('--workers', type=int, default=4, + help='number of working units used to load the data (default: 4)') + + parser.add_argument('--device', default='cpu', type=str, + help='device to be used for computations (in {cpu, cuda:0, cuda:1, ...}, default: cpu)') + + parsed_arguments = parser.parse_args() + + return parsed_arguments + + + +if __name__ == "__main__": + print("Coded with love by christiandimaio aka gnekt :* \n ") + args = parse_command_line_arguments() + + for k, v in args.__dict__.items(): + print(k + '=' + str(v)) + + #################################### Define Encoder/Decoder + encoder = None + decoder = None + attention = None + if args.attention == True: + # Attention is true, encoder = CResNet50Attention, decoder = RNetvHCAttention + encoder = FactoryEncoder(Encoder.CResNet50Attention) + decoder = FactoryDecoder(Decoder.RNetvHCAttention) + attention = FactoryAttention(Attention.Attention) + args.net_name = "CARNetvHCAttention" + + if args.attention == False: + args.net_name = f"Ca{args.decoder}" + encoder = FactoryEncoder(Encoder.CResNet50) + decoder = FactoryDecoder(args.decoder) + #################################### + + #################################### Construct Data + print("Construct data..") + + if args.mode == "train": + print("Define dataset..") + dataset = MyDataset(args.dataset_folder, percentage=8) # Percentage is fixed cause the dataset is HUGE, 8% is enough for sperimental test. 
+ print("OK.") + + print("Define vocabulary..") + vocabulary = Vocabulary(dataset) + print("OK.") + + # Obtain train, validation and test set + print("Obtain train, validation and test set..") + train_set = dataset.get_fraction_of_dataset(percentage=args.splits[0], delete_transfered_from_source=True) + validation_set = dataset.get_fraction_of_dataset(percentage=args.splits[1], delete_transfered_from_source=True) + test_set = dataset.get_fraction_of_dataset(percentage=args.splits[2], delete_transfered_from_source=True) + print("OK.") + + # Define the associate dataloader + print("Define the associate dataloader") + dataloader_training = DataLoader(train_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_training(data,vocabulary)) + dataloader_validation = DataLoader(validation_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_evaluation(data,vocabulary)) + dataloader_test = DataLoader(test_set, batch_size=args.batch_size, + shuffle=True, num_workers=args.workers, collate_fn = lambda data: dataset.pack_minibatch_evaluation(data,vocabulary)) + print("OK.") + + if args.mode == "eval": + print("Define vocabulary..") + vocabulary = Vocabulary() + print("Ok.") + + print("Load the image..") + if not os.path.exists(args.image_path) or os.path.isdir(args.image_path): + raise ValueError(f"Got {args.image_path} as file path, error!") + image: Image.Image = Image.open(args.image_path).convert('RGB') + print("Ok.") + #################################### + + #################################### Define Net + print("Create the net..") + net = FactoryNeuralNet(NeuralNet.CaRNet)( + encoder=encoder, + decoder=decoder, + attention=attention, # != None only if Attention is requested + attention_dim = args.attention_dim, # != 0 only if Attention is True + net_name=args.net_name, + encoder_dim = args.encoder_dim if args.decoder is not Decoder.RNetvI else vocabulary.embeddings.shape[1], # if Attention is True encoder_dim hasn't any meaning, cause it is 2048 internally by construction. + hidden_dim= args.hidden_dim, + padding_index= vocabulary.predefined_token_idx()[""], + vocab_size= len(vocabulary.word2id.keys()), + embedding_dim = vocabulary.embeddings.shape[1], + device=args.device + ) + print("OK.") + #################################### Load a previous trained net, if exist + + print("Check if it is present a previous version of the Net..") + try: + net.load("./.saved") + print("Found.") + except Exception as ex: + print("An exception has occurred.") + print(ex) + if args.mode == "eval": # If the mode is eval the script cannot continue + print("Since you want an evaluation, the script cannot continue, please retrain the network.") + sys.exit(0) + # In training it creates new files. 
+ print("Not Found.") + print("Since the selected mode is training, a new instance of the net will saved during the training activity.") + + #################################### Training or Evaluate + + if args.mode == "train": + print("Start training..") + net.train( + train_set=dataloader_training, + validation_set=dataloader_validation, + lr=args.lr, + epochs=args.epochs, + vocabulary=vocabulary + ) + # Evaluate Test set + print("Done") + print(f"Test set Accuracy: {net.eval_net(dataloader_test, vocabulary):.4f}") + + if args.mode == "eval": + print("Start evaluation..") + net.eval(image, vocabulary) + print("OK.") + #################################### \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..8be15c5 --- /dev/null +++ b/test.py @@ -0,0 +1,35 @@ +from piou import Cli, Option + +cli = Cli(description='A CLI tool') + + + + +sub_cmd = cli.add_sub_parser(cmd='sub', help='A sub command') +sub_cmd.add_option('--test', help='Test mode') + + +@sub_cmd.command(cmd='bar', help='Run bar command') +def sub_bar_main(**kwargs): + pass + + +@cli.command(cmd='foo', help='Run foo command') +def foo_main( + bar: int = Option(..., help='Bar positional argument (required)'), + baz: str = Option(..., '-b', '--baz', help='Baz keyword argument (required)'), + foo: str = Option(None, '--foo', help='Foo keyword argument'), +): + """ + A longer description on what the function is doing. + You can run it with: + ```bash + poetry run python -m piou.test.simple foo 1 -b baz + ``` + And you are good to go! + """ + pass + + +if __name__ == '__main__': + cli.run() \ No newline at end of file