parser.add_argument('-nhid', type=int, default=200, help='Number of hidden units per layer.')
parser.add_argument('-nlayers', type=int, default=2, help='Number of layers.')
# Optimization parameters.
- parser.add_argument('-lr', type=float, default=20, help='Initial learning rate.')
+ parser.add_argument('-lr', type=float, default=1, help='Initial learning rate.')
parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
parser.add_argument('-maxepoch', type=int, default=6, help='Upper epoch limit.')
parser.add_argument('-batchsize', type=int, default=20, help='Batch size.')
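For reference, these flags end up as attributes of the parsed namespace; a minimal sketch of how they might be consumed downstream (the parser lines mirror the diff above, everything else is assumed):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-lr', type=float, default=1, help='Initial learning rate.')
parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
args = parser.parse_args()
lr, clip = args.lr, args.clip  # e.g. `python main.py -lr 0.5` overrides the default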
@@ -68,8 +68,6 @@ def batchify(data, bsz, bptt):
# MAKE MODEL
###############################################################################

- initrange = 0.1
-
class RNNModel(nn.Container):
    """A container module with an encoder, an RNN (one of several flavors),
    and a decoder. Runs one RNN step at a time.
@@ -96,11 +94,12 @@ def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):

        # FIXME: is this better than the standard init? probably
        # FIXME: we need better reset_parameters methods in stdlib
+       initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
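The second FIXME wishes for a stdlib reset_parameters method; a hypothetical sketch of how this init block could be factored into one (not part of this commit):

    def reset_parameters(self, initrange=0.1):
        # Same uniform init as the lines above, gathered in one
        # place so it can be re-run after construction.
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)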

-    def __call__(self, hidden, input):
+    def forward(self, hidden, input):
        emb = self.encoder(input)
        hidden, output = self.rnn(hidden, emb)
        decoded = self.decoder(output)
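The __call__ to forward rename matters because modules are meant to be invoked through the base class's __call__, which delegates to the user-defined forward; overriding __call__ directly bypasses any bookkeeping the base class does. A simplified sketch of that dispatch (an illustration, not the actual torch.nn source):

class Container(object):
    def __call__(self, *args):
        # Framework entry point: hooks/bookkeeping can run here...
        return self.forward(*args)  # ...before delegating to user code

    def forward(self, *args):
        raise NotImplementedError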
@@ -130,9 +129,9 @@ def evaluate(model, data, criterion):
    # Loop over validation data.
    for i in range(0, data.size(0) - 1):
        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
-        loss += criterion(output, Variable(data[i + 1], requires_grad=False)).data[0]
+        loss += criterion(output, Variable(data[i + 1], requires_grad=False)).data

-    return loss / data.size(0)
+    return loss[0] / data.size(0)
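The two changes above defer scalar extraction: the loop now accumulates the criterion's one-element .data tensors, and the Python float is indexed out exactly once at return rather than on every step. The same pattern in isolation (a sketch, assuming one-element loss tensors as above):

def mean_of_losses(step_losses):
    total = 0
    for l in step_losses:
        total += l.data  # tensor accumulation, no per-step indexing
    return total[0] / len(step_losses)  # pull the float out once at the end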
# simple gradient clipping, using the total norm of the gradient
def clipGradient(model, clip):
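The body of clipGradient sits outside this hunk; a plausible sketch of clipping by total gradient norm, consistent with the comment above (an assumption, not the file's actual implementation):

import math

def clipGradient(model, clip):
    # Total L2 norm across all parameter gradients.
    totalnorm = 0
    for p in model.parameters():
        totalnorm += p.grad.norm() ** 2
    totalnorm = math.sqrt(totalnorm)
    # Shrink coefficient so the global norm stays at or below `clip`.
    return min(1, clip / (totalnorm + 1e-6))

The caller would then scale the parameter update (or the gradients themselves) by the returned coefficient.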
@@ -193,7 +192,8 @@ def repackageHidden(h):
    print(
        ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
         + 'train loss {:5.2f} | train ppl {:8.2f}').format(
-            epoch, i / bptt, train.size(0) / bptt, lr, elapsed * 1000 / reportinterval,
+            epoch, i / bptt, train.size(0) / bptt, lr,
+            elapsed * 1000 / reportinterval * bptt,
            cur_loss, math.exp(cur_loss)
        ))
    total_loss = 0
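The ms/batch fix follows from i advancing by bptt per batch (note the i / bptt in the same print), so reportinterval i-steps span reportinterval / bptt batches. A worked check with assumed illustrative values:

reportinterval, bptt, elapsed = 200, 20, 2.0   # assumed values, not from the file
batches_per_report = reportinterval / bptt     # 10 batches between reports
print(elapsed * 1000 / batches_per_report)     # 200.0 ms/batch
print(elapsed * 1000 / reportinterval * bptt)  # 200.0, the diff's expression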
@@ -204,7 +204,7 @@ def repackageHidden(h):
    # ps = pstats.Stats(pr, stream=s).sort_stats("time")
    # ps.print_stats()
    # print(s.getvalue())
-    # val_loss = evaluate(model, valid, criterion)
+    val_loss = evaluate(model, valid, criterion)

    print(
        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(