import torch.nn as nn
from torch.autograd import Variable

-from rnn_modules import *
import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Data parameters
parser.add_argument('-data', type=str, default='./data/penn', help='Location of the data corpus')
# Model parameters.
-parser.add_argument('-model', type=str, default='LSTM', help='Type of recurrent net. RNN, LSTM, or GRU.')
+parser.add_argument('-model', type=str, default='LSTM', help='Type of recurrent net. RNN_TANH, RNN_RELU, LSTM, or GRU.')
parser.add_argument('-emsize', type=int, default=200, help='Size of word embeddings')
parser.add_argument('-nhid', type=int, default=200, help='Number of hidden units per layer.')
parser.add_argument('-nlayers', type=int, default=2, help='Number of layers.')
# Optimization parameters.
-parser.add_argument('-lr', type=float, default=1, help='Initial learning rate.')
+parser.add_argument('-lr', type=float, default=20, help='Initial learning rate.')
parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
parser.add_argument('-maxepoch', type=int, default=6, help='Upper epoch limit.')
parser.add_argument('-batchsize', type=int, default=20, help='Batch size.')
@@ -49,18 +48,19 @@

corpus = data.Corpus(args.data)

-def batchify(data, bsz, bptt):
-    nbatch = int(math.floor(data.size(0) / bsz / bptt))
-    data = data.narrow(0, 0, nbatch * bptt * bsz)
+def batchify(data, bsz):
+    nbatch = int(math.floor(data.size(0) / bsz))
+    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

-train = batchify(corpus.train, args.batchsize, args.bptt)
-valid = batchify(corpus.valid, 10, 1)
-test = batchify(corpus.test, 10, 1)
-
+eval_bsz = 10
+train = batchify(corpus.train, args.batchsize)
+valid = batchify(corpus.valid, eval_bsz)
+test = batchify(corpus.test, eval_bsz)
+#train = train[:123*args.bptt]
bptt = args.bptt
bsz = args.batchsize

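For intuition, the new batchify() only trims the corpus to a multiple of the batch size and folds it into bsz parallel columns; the bptt chunking now happens later, in the training and evaluation loops. A minimal sketch with a made-up toy corpus (illustrative only, not part of the diff; it assumes the plain `import torch` from the top of the file, which is outside the lines shown):

    toy = torch.LongTensor(list(range(26)))   # pretend 26-token corpus
    batched = batchify(toy, 4)                # keeps 24 tokens -> size (6, 4)
    # Column 0 holds tokens 0..5, column 1 holds 6..11, and so on: each column is an
    # independent stream that the loops below slice along dimension 0 in bptt chunks.
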
@@ -73,22 +73,10 @@ class RNNModel(nn.Container):
    and a decoder. Runs one RNN step at a time.
    """

-    @staticmethod
-    def name2module(name):
-        if name == 'RNN':
-            return RNN
-        elif name == 'LSTM':
-            return LSTM
-        elif name == 'GRU':
-            return GRU
-        else:
-            error("Unknown RNN module: " + name)
-
    def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):
-        rnnModule = RNNModel.name2module(rnnType)
        super(RNNModel, self).__init__(
            encoder=nn.sparse.Embedding(ntoken, ninp),
-            rnn=StackedRNN(rnnModule, ninp, nhid, nlayers),
+            rnn=nn.rnn.RNNBase(rnnType, ninp, nhid, nlayers, bias=False),
            decoder=nn.Linear(nhid, ntoken),
        )

@@ -99,37 +87,48 @@ def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

-    def forward(self, hidden, input):
+    def forward(self, input, hidden):
        emb = self.encoder(input)
-        hidden, output = self.rnn(hidden, emb)
-        decoded = self.decoder(output)
-        return hidden, decoded
+        output, hidden = self.rnn(emb, hidden)
+        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
+        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
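
The reshuffling in the new forward() exists because nn.Linear expects a 2-D input: the 3-D RNN output is flattened over (time, batch) for the decoder and the logits are folded back afterwards. A shape walkthrough, with sequence length seq_len, batch size bsz, and vocabulary size ntokens:

    # input:   (seq_len, bsz)            LongTensor of word indices
    # emb:     (seq_len, bsz, emsize)    embedding lookup
    # output:  (seq_len, bsz, nhid)      RNN hidden state at every time step
    # decoded: (seq_len * bsz, ntokens)  decoder applied to the flattened view
    # returns  (seq_len, bsz, ntokens)   logits, plus the final hidden state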

-    def initHidden(self, bsz):
-        return self.rnn.initHidden(bsz)
-
-model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid, args.nlayers)
+ntokens = corpus.dic.ntokens()
+model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

+def initHidden(model, bsz):
+    weight = next(model.parameters()).data
+    if args.model == 'LSTM':
+        return (Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()),
+                Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()))
+    else:
+        return Variable(weight.new(args.nlayers, bsz, args.nhid).zero_())
+
+
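The module-level initHidden() replaces the removed method: an LSTM needs an (h0, c0) pair while the other cell types take a single tensor, each of shape (nlayers, bsz, nhid), and allocating via next(model.parameters()).data keeps the buffers on the same device as the model. Typical use, as in evaluate() and the training loop below:

    hidden = initHidden(model, args.batchsize)   # tuple for LSTM, single Variable otherwise
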
########################################
# TRAINING
########################################

lr = args.lr
clip = args.clip
-reportinterval = args.reportint * args.batchsize
+reportinterval = args.reportint
+

# Perform the forward pass only.
-def evaluate(model, data, criterion):
+def evaluate(model, data, criterion, bsz):
    loss = 0
-    hidden = model.initHidden(data.size(1))
+    hidden = initHidden(model, bsz)
    # Loop over validation data.
-    for i in range(0, data.size(0) - 1):
-        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
-        loss += criterion(output, Variable(data[i + 1], requires_grad=False)).data
+    for i in range(0, data.size(0) - 1, bptt):
+        seq_len = min(bptt, data.size(0) - 1 - i)
+        output, hidden = model(Variable(data[i:i + seq_len], requires_grad=False), hidden)
+        targets = data[i + 1:i + seq_len + 1].view(-1)
+        loss += bptt * criterion(output.view(seq_len * bsz, -1), Variable(targets, requires_grad=False)).data
+        hidden = repackageHidden(hidden)

    return loss[0] / data.size(0)

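repackageHidden() is called here and in the training loop, but its definition sits outside the lines shown; presumably it rewraps the hidden state in fresh Variables so that backpropagation is truncated at each bptt chunk instead of flowing through the whole history. A minimal sketch under that assumption (hypothetical, not the file's actual definition):

    def repackageHidden(h):
        # Detach h from the graph built for the previous chunk.
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(repackageHidden(v) for v in h)
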
@@ -157,46 +156,61 @@ def repackageHidden(h):
    total_loss = 0
    epoch_start_time = time.time()
    # Start with an initial hidden state.
-    hidden = model.initHidden(bsz)
+    hidden = initHidden(model, bsz)
+
    # Loop over the training data.
    loss = 0
    i = 0
    model.zero_grad()

    total_loss = 0
    start_time = epoch_start_time = time.time()
-    while i < train.size(0) - 1:
-        hidden, output = model(hidden, Variable(train[i], requires_grad=False))
-        loss += criterion(output, Variable(train[i + 1], requires_grad=False))
-        i += 1
+    ntokens = corpus.dic.ntokens()
+    for batch, i in enumerate(range(0, train.size(0) - 1, bptt)):
+        seq_len = min(bptt, train.size(0) - 1 - i)
+        output, hidden = model(Variable(train[i:i + seq_len], requires_grad=False), hidden)
+        targets = train[i + 1:i + seq_len + 1].view(-1)
+        loss = criterion(output.view(-1, ntokens), Variable(targets, requires_grad=False))
+
+        # FIXME: this is the result of a double bug
+        # bug #1: you can't have dangling nodes in the graph to call backward
+        # bug #2: hidden.sum() doesn't work, gives me an error in backward, which I can't reproduce in a simple way
+        #   File "/data/users/alerer/pytorch/pytorch/torch/autograd/variable.py", line 82, in backward
+        #     self._execution_engine.run_backward(self, gradient, retain_variables)
+        #   File "/data/users/alerer/pytorch/pytorch/torch/autograd/functions/reduce.py", line 27, in backward
+        #     return grad_output.new(*self.input_size).fill_(grad_output[0])
+        #   ValueError: fill_ recieved an invalid combination of argument types - got (torch.cuda.FloatTensor), but expected (float value)
+        if args.model == 'LSTM':
+            loss += 0 * hidden[0].sum(0).sum(1).sum(2)
+            loss += 0 * hidden[1].sum(0).sum(1).sum(2)
+        else:
+            loss += 0 * hidden.sum(0).sum(1).sum(2)

-        if i % bptt == 0:
-            loss.backward()
+        loss.backward()

-            clipped_lr = lr * clipGradient(model, args.clip)
+        clipped_lr = lr * clipGradient(model, args.clip)

-            for p in model.parameters():
-                p.data.sub_(p.grad.mul(clipped_lr))
+        for p in model.parameters():
+            p.data.sub_(p.grad.mul(clipped_lr))

-            hidden = repackageHidden(hidden)
-            model.zero_grad()
-            total_loss += loss.data
-            loss = 0
+        hidden = repackageHidden(hidden)
+        model.zero_grad()
+        total_loss += loss.data
+        loss = 0

-        if i % reportinterval == 0:
+        if batch % reportinterval == 0 and batch > 0:
            cur_loss = total_loss[0] / reportinterval
            elapsed = time.time() - start_time
            print(
                ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
-                epoch, i // bptt, train.size(0) // bptt, lr,
-                elapsed * 1000 / reportinterval * bptt,
+                epoch, batch, train.size(0) // bptt, lr, elapsed * 1000 / reportinterval,
                cur_loss, math.exp(cur_loss)
            ))
            total_loss = 0
            start_time = time.time()

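clipGradient(model, args.clip) is also defined outside the lines shown; judging from how its return value scales the learning rate before the manual SGD step, it presumably computes the global gradient norm and returns a coefficient no larger than 1. A hedged sketch of such a helper (an assumption, not the diff's code):

    def clipGradient(model, clip):
        # Global L2 norm over all parameter gradients; the caller multiplies
        # the learning rate by the returned coefficient.
        totalnorm = 0
        for p in model.parameters():
            totalnorm += p.grad.norm() ** 2
        totalnorm = math.sqrt(totalnorm)
        return min(1, clip / (totalnorm + 1e-6))
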
-    val_loss = evaluate(model, valid, criterion)
+    val_loss = evaluate(model, valid, criterion, eval_bsz)

    print(
        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
@@ -210,7 +224,7 @@ def repackageHidden(h):
    prev_loss = val_loss

# Run on test data.
-test_loss = evaluate(model, test, criterion)
+test_loss = evaluate(model, test, criterion, eval_bsz)
print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)