77class EncoderCNN (nn .Module ):
88 def __init__ (self , embed_size ):
99 super (EncoderCNN , self ).__init__ ()
10- resnet = torch . hub . load ( 'pytorch/vision:v0.6.0' , ' resnet50' , pretrained = True )
10+ resnet = models . resnet50 ( pretrained = True )
1111 for param in resnet .parameters ():
1212 param .requires_grad_ (False )
1313
@@ -16,7 +16,8 @@ def __init__(self, embed_size):
1616 self .embed = nn .Linear (resnet .fc .in_features , embed_size ) # attach a linear layer ()
1717
def forward(self, images):
    """Encode a batch of images into fixed-size embedding vectors.

    The backbone held in ``self.resnet`` is evaluated inside
    ``torch.no_grad()`` so that no autograd graph is recorded for it; only
    the projection layer ``self.embed`` takes part in back-propagation.

    Args:
        images: batch tensor accepted by ``self.resnet``; the first
            dimension is treated as the batch dimension.

    Returns:
        The output of ``self.embed`` applied to the flattened backbone
        features, one row per input image.
    """
    # The backbone is used purely as a feature extractor: skip graph
    # construction for it to save memory and time.
    with torch.no_grad():
        features = self.resnet(images)

    # Collapse every non-batch dimension so the linear layer receives a
    # 2-D (batch, features) tensor.
    features = features.reshape(features.size(0), -1)
    return self.embed(features)
@@ -60,7 +61,8 @@ def forward(self, features, captions,caption_lengths):
6061
6162 # Initialize the hidden state
6263 batch_size = features .shape [0 ] # features is of shape (batch_size, embed_size)
63-
64+ self .hidden = self .init_hidden (batch_size )
65+
6466 # Create embedded word vectors for each word in the captions
6567 embeddings = self .word_embeddings (captions ) # embeddings new shape : (batch_size, captions length -1, embed_size)
6668
@@ -82,14 +84,16 @@ def sample(self, features, states=None):
8284 sampled_ids = []
8385 inputs = features .unsqueeze (1 )
8486 inputs = inputs .reshape ((1 ,1 ,inputs .shape [0 ]))
85- for _ in range (30 ):
86- hiddens , states = self .lstm (inputs , states ) # hiddens: (batch_size, 1, hidden_size)
87- outputs = self .linear (hiddens .squeeze (1 )) # outputs: (batch_size, vocab_size)
88- _ , predicted = outputs .max (1 ) # predicted: (batch_size)
89- sampled_ids .append (predicted )
90- inputs = self .word_embeddings (predicted ) # inputs: (batch_size, embed_size)
91- inputs = inputs .unsqueeze (1 ) # inputs: (batch_size, 1, embed_size)
92- sampled_ids = torch .stack (sampled_ids , 1 ) # sampled_ids: (batch_size, max_seq_length)
87+ self .init_hidden (1 )
88+ with torch .no_grad ():
89+ for _ in range (30 ):
90+ hiddens , states = self .lstm (inputs , states ) # hiddens: (batch_size, 1, hidden_size)
91+ outputs = self .linear (hiddens .squeeze (1 )) # outputs: (batch_size, vocab_size)
92+ _ , predicted = outputs .max (1 ) # predicted: (batch_size)
93+ sampled_ids .append (predicted )
94+ inputs = self .word_embeddings (predicted ) # inputs: (batch_size, embed_size)
95+ inputs = inputs .unsqueeze (1 ) # inputs: (batch_size, 1, embed_size)
96+ sampled_ids = torch .stack (sampled_ids , 1 ) # sampled_ids: (batch_size, max_seq_length)
9397 return sampled_ids
9498
9599def save (self , file_name ):
@@ -112,7 +116,7 @@ def train(train_set, validation_set, lr, epochs, vocabulary):
112116 best_epoch = - 1 # the epoch in which the best accuracy above was computed
113117
114118 encoder = EncoderCNN (50 )
115- decoder = DecoderRNN (100 ,0 ,len (v_enriched .word2id .keys ()),v_enriched .embeddings )
119+ decoder = DecoderRNN (2048 ,0 ,len (v_enriched .word2id .keys ()),v_enriched .embeddings )
116120
117121 # ensuring the classifier is in 'train' mode (pytorch)
118122 decoder .train ()
@@ -129,9 +133,9 @@ def train(train_set, validation_set, lr, epochs, vocabulary):
129133 epoch_num_train_examples = 0
130134
131135 for images ,captions ,captions_length in train_set :
132- # zeroing the memory areas that were storing previously computed gradients
133136 decoder .zero_grad ()
134- encoder .zero_grad ()
137+ encoder .zero_grad ()
138+ # zeroing the memory areas that were storing previously computed gradients
135139 batch_num_train_examples = images .shape [0 ] # mini-batch size (it might be different from 'batch_size')
136140 epoch_num_train_examples += batch_num_train_examples
137141
@@ -147,8 +151,10 @@ def train(train_set, validation_set, lr, epochs, vocabulary):
147151 targets = pack_padded_sequence (captions , captions_length , batch_first = True )[0 ]
148152
149153 # computing the loss function
150- loss = criterion (outputs , targets )
151-
154+ try :
155+ loss = criterion (outputs , targets )
156+ except Exception as ex :
157+ print (ex )
152158 # computing gradients and updating the network weights
153159 loss .backward () # computing gradients
154160 optimizer .step () # updating weights
@@ -157,6 +163,7 @@ def train(train_set, validation_set, lr, epochs, vocabulary):
157163 torch .save (decoder .state_dict (),".saved/decoder.pt" )
158164 features = encoder (images )
159165 caption = decoder .sample (features [0 ])
166+ print (v_enriched .rev_translate (captions ))
160167 print (v_enriched .rev_translate (caption ))
161168 # computing the performance of the net on the current training mini-batch
162169 # with torch.no_grad(): # keeping these operations out of those for which we will compute the gradient
@@ -198,14 +205,14 @@ def train(train_set, validation_set, lr, epochs, vocabulary):
198205 from PreProcess import PreProcess
199206 from Dataset import MyDataset
200207 from torch .utils .data import DataLoader
201- ds = MyDataset ("./dataset/flickr30k_images/flickr30k_images" )
202- df = ds .get_fraction_of_dataset (percentage = 10 )
208+ ds = MyDataset ("./dataset" , percentage = 2 )
209+ df = ds .get_fraction_of_dataset (percentage = 100 )
203210
204211 # use dataloader facilities which requires a preprocessed dataset
205212 v = Vocabulary (verbose = True )
206213 df_pre_processed ,v_enriched = PreProcess .DatasetForTraining .process (dataset = df ,vocabulary = v )
207214
208- dataloader = DataLoader (df , batch_size = 30 ,
209- shuffle = False , num_workers = 0 , collate_fn = df .pack_minibatch )
215+ dataloader = DataLoader (df , batch_size = 10 ,
216+ shuffle = True , num_workers = 0 , collate_fn = df .pack_minibatch )
210217
211218 train (dataloader , dataloader , 1e-2 , 10 , v_enriched )
0 commit comments