###############################################################################
# Language Modeling on Penn Tree Bank
+ #
+ # With the default parameters, this should achieve ~116 perplexity on the
+ # test set.
###############################################################################

import argparse
import torch.nn as nn
from torch.autograd import Variable
- from rnn_modules import *
import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Data parameters
parser.add_argument('-data', type=str, default='./data/penn', help='Location of the data corpus')
# Model parameters.
- parser.add_argument('-model', type=str, default='LSTM', help='Type of recurrent net. RNN, LSTM, or GRU.')
+ parser.add_argument('-model', type=str, default='LSTM', help='Type of recurrent net. RNN_TANH, RNN_RELU, LSTM, or GRU.')
parser.add_argument('-emsize', type=int, default=200, help='Size of word embeddings')
parser.add_argument('-nhid', type=int, default=200, help='Number of hidden units per layer.')
parser.add_argument('-nlayers', type=int, default=2, help='Number of layers.')
# Optimization parameters.
- parser.add_argument('-lr', type=float, default=1, help='Initial learning rate.')
+ parser.add_argument('-lr', type=float, default=20, help='Initial learning rate.')
parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
parser.add_argument('-maxepoch', type=int, default=6, help='Upper epoch limit.')
parser.add_argument('-batchsize', type=int, default=20, help='Batch size.')
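# Example invocation (a sketch: the script name is assumed, and only flags
# defined above are shown; further options such as -bptt and -reportint are
# added in the lines elided here):
#   python main.py -data ./data/penn -model LSTM -emsize 200 -nhid 200 -lr 20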
corpus = data.Corpus(args.data)

- def batchify(data, bsz, bptt):
-     nbatch = int(math.floor(data.size(0) / bsz / bptt))
-     data = data.narrow(0, 0, nbatch * bptt * bsz)
+ def batchify(data, bsz):
+     nbatch = int(math.floor(data.size(0) / bsz))
+     data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data
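
# Note: batchify arranges the corpus into bsz parallel streams. For a toy
# sequence [a b c d e f] and bsz=2, view(bsz, -1).t() yields a (3 x 2) tensor
# whose columns are [a b c] and [d e f]; row i holds time step i of every
# stream in the batch.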

- train = batchify(corpus.train, args.batchsize, args.bptt)
- valid = batchify(corpus.valid, 10, 1)
- test = batchify(corpus.test, 10, 1)
-
+ eval_bsz = 10
+ train = batchify(corpus.train, args.batchsize)
+ valid = batchify(corpus.valid, eval_bsz)
+ test = batchify(corpus.test, eval_bsz)

bptt = args.bptt
bsz = args.batchsize

@@ -73,63 +75,62 @@ class RNNModel(nn.Container):
    and a decoder. Runs one RNN step at a time.
    """

-     @staticmethod
-     def name2module(name):
-         if name == 'RNN':
-             return RNN
-         elif name == 'LSTM':
-             return LSTM
-         elif name == 'GRU':
-             return GRU
-         else:
-             error("Unknown RNN module: " + name)
-
    def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):
-         rnnModule = RNNModel.name2module(rnnType)
        super(RNNModel, self).__init__(
            encoder=nn.sparse.Embedding(ntoken, ninp),
-             rnn=StackedRNN(rnnModule, ninp, nhid, nlayers),
+             rnn=nn.rnn.RNNBase(rnnType, ninp, nhid, nlayers, bias=False),
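+             # Note: unlike the per-step StackedRNN it replaces, RNNBase (as
+             # used here) consumes a whole (seq_len, bsz, ninp) batch of
+             # sequences in a single call; see forward() below.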
            decoder=nn.Linear(nhid, ntoken),
        )

-         # FIXME: is this better than the standard init? probably
-         # FIXME: we need better reset_parameters methods in stdlib
+         # FIXME: add stdv named argument to reset_parameters
+         # (and/or to the constructors)
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

-     def forward(self, hidden, input):
+     def forward(self, input, hidden):
        emb = self.encoder(input)
-         hidden, output = self.rnn(hidden, emb)
-         decoded = self.decoder(output)
-         return hidden, decoded
-
-     def initHidden(self, bsz):
-         return self.rnn.initHidden(bsz)
+         output, hidden = self.rnn(emb, hidden)
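+         # Flatten (seq_len, bsz, nhid) to (seq_len * bsz, nhid) so the Linear
+         # decoder is applied to every time step at once, then restore the
+         # (seq_len, bsz, ntoken) shape for the caller.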
+         decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
+         return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

- model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid, args.nlayers)
+ ntokens = corpus.dic.ntokens()
+ model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

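+ # Build a zero initial hidden state on the same device and dtype as the model
+ # parameters (via weight.new); LSTMs need an (h_0, c_0) pair, the other cell
+ # types a single tensor.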
+ def initHidden(model, bsz):
+     weight = next(model.parameters()).data
+     if args.model == 'LSTM':
+         return (Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()),
+                 Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()))
+     else:
+         return Variable(weight.new(args.nlayers, bsz, args.nhid).zero_())
+
+
########################################
# TRAINING
########################################

lr = args.lr
clip = args.clip
- reportinterval = args.reportint * args.batchsize
+ reportinterval = args.reportint
+

# Perform the forward pass only.
- def evaluate(model, data, criterion):
+ def evaluate(model, data, criterion, bsz):
    loss = 0
-     hidden = model.initHidden(data.size(1))
+     hidden = initHidden(model, bsz)
    # Loop over validation data.
-     for i in range(0, data.size(0) - 1):
-         hidden, output = model(hidden, Variable(data[i], requires_grad=False))
-         loss += criterion(output, Variable(data[i + 1], requires_grad=False)).data
+     for i in range(0, data.size(0) - 1, bptt):
+         seq_len = min(bptt, data.size(0) - 1 - i)
+         output, hidden = model(Variable(data[i:i + seq_len], requires_grad=False), hidden)
+         targets = data[i + 1:i + seq_len + 1].view(-1)
+         loss += bptt * criterion(output.view(seq_len * bsz, -1), Variable(targets, requires_grad=False)).data
+         hidden = repackageHidden(hidden)
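+         # repackageHidden (defined further down; unchanged in this diff)
+         # rewraps the hidden state in fresh Variables so gradients never
+         # flow across chunk boundaries.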

    return loss[0] / data.size(0)

@@ -157,46 +158,60 @@ def repackageHidden(h):
    total_loss = 0
    epoch_start_time = time.time()
    # Start with an initial hidden state.
-     hidden = model.initHidden(bsz)
-     # Loop over the training data.
+     hidden = initHidden(model, bsz)
+
    loss = 0
    i = 0
    model.zero_grad()
-
    total_loss = 0
    start_time = epoch_start_time = time.time()
-     while i < train.size(0) - 1:
-         hidden, output = model(hidden, Variable(train[i], requires_grad=False))
-         loss += criterion(output, Variable(train[i + 1], requires_grad=False))
-         i += 1
+     ntokens = corpus.dic.ntokens()
+     # Loop over the training data.
+     for batch, i in enumerate(range(0, train.size(0) - 1, bptt)):
+         seq_len = min(bptt, train.size(0) - 1 - i)
+         output, hidden = model(Variable(train[i:i + seq_len], requires_grad=False), hidden)
+         targets = train[i + 1:i + seq_len + 1].view(-1)
+         loss = criterion(output.view(-1, ntokens), Variable(targets, requires_grad=False))
+
+         # FIXME: this is the result of a double bug
+         # bug #1: you can't have dangling nodes in the graph to call backward
+         # bug #2: hidden.sum() doesn't work, gives me an error in backward, which I can't reproduce in a simple way
+         #   File "/data/users/alerer/pytorch/pytorch/torch/autograd/variable.py", line 82, in backward
+         #     self._execution_engine.run_backward(self, gradient, retain_variables)
+         #   File "/data/users/alerer/pytorch/pytorch/torch/autograd/functions/reduce.py", line 27, in backward
+         #     return grad_output.new(*self.input_size).fill_(grad_output[0])
+         #   ValueError: fill_ recieved an invalid combination of argument types - got (torch.cuda.FloatTensor), but expected (float value)
+         if args.model == 'LSTM':
+             loss += 0 * hidden[0].sum(0).sum(1).sum(2)
+             loss += 0 * hidden[1].sum(0).sum(1).sum(2)
+         else:
+             loss += 0 * hidden.sum(0).sum(1).sum(2)
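+         # The terms multiplied by 0 keep `hidden` attached to the autograd
+         # graph without changing the loss value, working around bug #1 above.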

-         if i % bptt == 0:
-             loss.backward()
+         loss.backward()

-             clipped_lr = lr * clipGradient(model, args.clip)
+         clipped_lr = lr * clipGradient(model, args.clip)

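+         # Manual SGD step below: clipGradient (defined elsewhere in the file)
+         # rescales the learning rate so the update respects args.clip, and
+         # each parameter then steps against its gradient.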
-             for p in model.parameters():
-                 p.data.sub_(p.grad.mul(clipped_lr))
+         for p in model.parameters():
+             p.data.sub_(p.grad.mul(clipped_lr))

-             hidden = repackageHidden(hidden)
-             model.zero_grad()
-             total_loss += loss.data
-             loss = 0
+         hidden = repackageHidden(hidden)
+         model.zero_grad()
+         total_loss += loss.data
+         loss = 0

-         if i % reportinterval == 0:
+         if batch % reportinterval == 0 and batch > 0:
            cur_loss = total_loss[0] / reportinterval
            elapsed = time.time() - start_time
            print(
                ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
-                     epoch, i // bptt, train.size(0) // bptt, lr,
-                     elapsed * 1000 / reportinterval * bptt,
+                     epoch, batch, train.size(0) // bptt, lr, elapsed * 1000 / reportinterval,
                    cur_loss, math.exp(cur_loss)
            ))
            total_loss = 0
            start_time = time.time()

-     val_loss = evaluate(model, valid, criterion)
+     val_loss = evaluate(model, valid, criterion, eval_bsz)

    print(
        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
@@ -210,7 +225,7 @@ def repackageHidden(h):
    prev_loss = val_loss

# Run on test data.
- test_loss = evaluate(model, test, criterion)
+ test_loss = evaluate(model, test, criterion, eval_bsz)
print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)