 device = "cuda:0"
 class EncoderCNN(nn.Module):
-    def __init__(self, embed_size):
+    def __init__(self, embedding_size):
         super(EncoderCNN, self).__init__()
         resnet = models.resnet50(pretrained=True)
         for param in resnet.parameters():
             param.requires_grad_(False)

         modules = list(resnet.children())[:-1]  # remove last fc layer
         self.resnet = nn.Sequential(*modules)
-        self.linear = nn.Linear(resnet.fc.in_features, 50)
+        self.linear = nn.Linear(resnet.fc.in_features, embedding_size)

     def forward(self, images):
-
         features = self.resnet(images)
-        features = features.reshape(features.size(0), -1)
+        features = features.reshape(features.size(0), -1)  # flatten to (batch_size, resnet.fc.in_features)
         features = self.linear(features)
         return features

 class DecoderRNN(nn.Module):
-    def __init__(self, hidden_size, padding_index, vocab_size, embeddings):
+    def __init__(self, hidden_size, padding_index, vocab_size, embeddings, embedding_size):
         """Set the hyper-parameters and build the layers."""
         super(DecoderRNN, self).__init__()
-        # Keep track of hidden_size for initialization of hidden state
-        self.hidden_size = hidden_size

         # Embedding layer that turns words into a vector of a specified size
-        self.word_embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True, padding_idx=0)
+        self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_index)

         # The LSTM takes embedded word vectors (of a specified size) as input
         # and outputs hidden states of size hidden_size
-        self.lstm = nn.LSTM(input_size=50,
-                            hidden_size=1024,      # LSTM hidden units
-                            num_layers=1,          # number of LSTM layers
-                            batch_first=True,      # input & output have batch size as 1st dimension
-                            dropout=0,             # not applying dropout
-                            bidirectional=False)   # unidirectional LSTM
+        self.lstm_unit = torch.nn.LSTMCell(embedding_size, hidden_size)

         # The linear layer that maps the hidden state output dimension
         # to the number of words we want as output, vocab_size
-        self.linear_1 = nn.Linear(1024, vocab_size)
+        self.linear_1 = nn.Linear(hidden_size, vocab_size)

     def init_hidden_state(self, encoder_out):
         """
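For orientation, here is a minimal usage sketch of the two refactored modules; it is not part of the commit. The sizes (embedding_size=256, hidden_size=512, vocab_size=1000, padding_index=0) and the dummy batch are made up for illustration, the embeddings argument is passed as None because the new nn.Embedding layer no longer uses it, and since the decoder's forward pass is cut off above, the sketch drives the new LSTMCell by hand for a single step.

import torch
import torch.nn as nn
from torchvision import models

# Hypothetical hyper-parameters, chosen only for this sketch.
embedding_size, hidden_size, vocab_size, padding_index = 256, 512, 1000, 0

encoder = EncoderCNN(embedding_size)
decoder = DecoderRNN(hidden_size, padding_index, vocab_size,
                     embeddings=None, embedding_size=embedding_size)

images = torch.randn(4, 3, 224, 224)         # dummy batch of 4 RGB images
features = encoder(images)                   # (4, embedding_size)

tokens = torch.randint(1, vocab_size, (4,))  # one token id per sequence
embedded = decoder.word_embeddings(tokens)   # (4, embedding_size)
h, c = decoder.lstm_unit(embedded)           # one LSTMCell step; h and c are (4, hidden_size)
scores = decoder.linear_1(h)                 # (4, vocab_size) logits over the vocabulary

Note the design choice this commit makes: swapping nn.LSTM for nn.LSTMCell trades the batched, whole-sequence API for explicit per-timestep control of the hidden state, which is what lets the decoder seed (h, c) from the image features in init_hidden_state.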