@@ -16,23 +16,24 @@
 parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

 # Data parameters
 parser.add_argument('-data', type=str, default='./data/penn', help='Location of the data corpus')
 # Model parameters.
-parser.add_argument('-model', type=str, default='RNN', help='Type of recurrent net. RNN, LSTM, or GRU.')
+parser.add_argument('-model', type=str, default='LSTM', help='Type of recurrent net. RNN, LSTM, or GRU.')
 parser.add_argument('-emsize', type=int, default=200, help='Size of word embeddings')
 parser.add_argument('-nhid', type=int, default=200, help='Number of hidden units per layer.')
+parser.add_argument('-nlayers', type=int, default=2, help='Number of layers.')
 # Optimization parameters.
 parser.add_argument('-lr', type=float, default=20, help='Initial learning rate.')
 parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
 parser.add_argument('-maxepoch', type=int, default=6, help='Upper epoch limit.')
 parser.add_argument('-batchsize', type=int, default=20, help='Batch size.')
 parser.add_argument('-bptt', type=int, default=20, help='Sequence length.')
 # Device parameters.
 parser.add_argument('-seed', type=int, default=1111, help='Random seed.')
 parser.add_argument('-cuda', action='store_true', help='Use CUDA.')
 # Misc parameters.
-parser.add_argument('-reportint', type=int, default=1000, help='Report interval.')
+parser.add_argument('-reportint', type=int, default=200, help='Report interval.')
 parser.add_argument('-save', type=str, default='model.pt', help='Path to save the final model.')
 args = parser.parse_args()

 # Set the random seed manually for reproducibility.
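
With this change the script defaults to a two-layer LSTM instead of a single-layer vanilla RNN, and -reportint drops from 1000 to 200 (it is rescaled by the batch size further down). A quick sanity check of the new defaults, assuming only the parser built above:

    # parse_args([]) ignores sys.argv, so this just prints the defaults.
    defaults = parser.parse_args([])
    print(defaults.model, defaults.nlayers, defaults.reportint)  # -> LSTM 2 200
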
@@ -59,8 +60,6 @@ def batchify(data, bsz, bptt):
 train = batchify(corpus.train, args.batchsize, args.bptt)
 valid = batchify(corpus.valid, 10, 1)
 test = batchify(corpus.test, 10, 1)
-train = train[:10000]
-valid = valid[:100]

 bptt = args.bptt
 bsz = args.batchsize
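
Dropping the two slicing lines means training and validation now run over the full corpus rather than a 10,000-row / 100-row debugging subset. batchify itself is not part of this diff; a minimal sketch of the reshaping such a helper usually performs (the name and exact signature here are assumptions, not the repository's code):

    import torch

    def batchify_sketch(data, bsz):
        # Trim tokens that do not fit a whole number of columns, then lay
        # the 1-D token stream out as bsz parallel streams: one column per
        # stream, one row per timestep.
        nbatch = data.size(0) // bsz
        data = data.narrow(0, 0, nbatch * bsz)
        return data.view(bsz, -1).t().contiguous()
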
@@ -87,11 +86,11 @@ def name2module(name):
         else:
             error("Unknown RNN module: " + name)

-    def __init__(self, rnnType, ntoken, ninp, nhid):
+    def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):
         rnnModule = RNNModel.name2module(rnnType)
         super(RNNModel, self).__init__(
             encoder=nn.sparse.Embedding(ntoken, ninp),
-            rnn=rnnModule(ninp, nhid),
+            rnn=StackedRNN(rnnModule, ninp, nhid, nlayers),
             decoder=nn.Linear(nhid, ntoken),
         )

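
StackedRNN is introduced here but defined elsewhere; judging from the call site it builds nlayers instances of rnnModule, the first mapping ninp to nhid and the rest nhid to nhid. A sketch of the per-step stacking logic, assuming the (hidden, input) -> (hidden, output) calling convention used by the surrounding code:

    def stacked_step(layers, hidden, input):
        # Feed the input up through the stack: layer i+1 consumes the
        # output of layer i, and each layer's new hidden state is kept.
        output = input
        next_hidden = []
        for h, layer in zip(hidden, layers):
            h, output = layer(h, output)
            next_hidden.append(h)
        return tuple(next_hidden), output
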
@@ -110,7 +109,7 @@ def __call__(self, hidden, input):
     def initHidden(self, bsz):
         return self.rnn.initHidden(bsz)

-model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid)
+model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid, args.nlayers)
 if args.cuda:
     model.cuda()

@@ -122,7 +121,7 @@ def initHidden(self, bsz):

 lr = args.lr
 clip = args.clip
-reportinterval = args.reportint
+reportinterval = args.reportint * args.batchsize

 # Perform the forward pass only.
 def evaluate(model, data, criterion):
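
Scaling the interval by the batch size changes the units of the report trigger: the counter i appears to advance one timestep per iteration of the loop below, so reports now fire every reportint * batchsize steps. Worked out with the new defaults:

    # 200 (reportint) * 20 (batchsize) = 4000 timesteps between reports,
    # i.e. 4000 / 20 (bptt) = 200 BPTT chunks, matching the i / bptt
    # counter printed in the report line further down.
    reportinterval = 200 * 20
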
@@ -151,7 +150,7 @@ def repackageHidden(h):
     if type(h) == Variable:
         return Variable(h.data)
     else:
-        return tuple(repackageVariable(v) for v in h)
+        return tuple(repackageHidden(v) for v in h)

 # Loop over epochs.
 prev_loss = None
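
The rename fixes the recursive branch, which previously called a nonexistent repackageVariable and so crashed whenever the hidden state was a tuple, as it is for the (h, c) pair of the now-default LSTM. For reference, the same truncated-BPTT idiom in current PyTorch, where Variable has been folded into Tensor:

    import torch

    def repackage_hidden(h):
        # Detach the hidden state from the graph of the previous chunk so
        # backward() never walks into already-processed timesteps.
        if isinstance(h, torch.Tensor):
            return h.detach()
        return tuple(repackage_hidden(v) for v in h)
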
@@ -167,6 +166,9 @@ def repackageHidden(h):

     total_loss = 0
     start_time = epoch_start_time = time.time()
+    # import cProfile, pstats, StringIO
+    # pr = cProfile.Profile()
+    # pr.enable()
     while i < train.size(0) - 1:
         hidden, output = model(hidden, Variable(train[i], requires_grad=False))
         loss += criterion(output, Variable(train[i + 1], requires_grad=False))
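
The three commented lines (and their counterparts after the loop, in the next hunk) are a stock cProfile recipe left in for ad-hoc profiling. Uncommented, and using io.StringIO as Python 3 requires, the complete pattern is:

    import cProfile, io, pstats

    pr = cProfile.Profile()
    pr.enable()
    # ... run the training loop ...
    pr.disable()
    s = io.StringIO()
    pstats.Stats(pr, stream=s).sort_stats("time").print_stats()
    print(s.getvalue())
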
@@ -182,26 +184,31 @@ def repackageHidden(h):

         hidden = repackageHidden(hidden)
         model.zero_grad()
-        total_loss += loss.data[0]
+        total_loss += loss.data
         loss = 0

         if i % reportinterval == 0:
-            cur_loss = total_loss / reportinterval
+            cur_loss = total_loss[0] / reportinterval
             elapsed = time.time() - start_time
             print(
                 ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                  + 'train loss {:5.2f} | train ppl {:8.2f}').format(
-                epoch, i, train.size(0), lr, elapsed * 1000 / reportinterval * bsz,
+                epoch, i / bptt, train.size(0) / bptt, lr, elapsed * 1000 / reportinterval,
                 cur_loss, math.exp(cur_loss)
             ))
             total_loss = 0
             start_time = time.time()

-    val_loss = evaluate(model, valid, criterion)
+    # pr.disable()
+    # s = StringIO.StringIO()
+    # ps = pstats.Stats(pr, stream=s).sort_stats("time")
+    # ps.print_stats()
+    # print(s.getvalue())
+    # val_loss = evaluate(model, valid, criterion)

     print(
-        '| end of epoch {:3d} | ms/batch {:5.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
-        epoch, (time.time() - epoch_start_time) * 1000 / train.size(0), val_loss, math.exp(val_loss)
+        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
+        epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)
     ))

     # The annealing schedule.
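
Note that as committed, val_loss is never assigned: the evaluate call is commented out along with the profiling lines, yet the end-of-epoch print (and likely the annealing schedule that follows) still reads it, so the first epoch ends in a NameError until that line is restored. A guard along these lines would make the profiling mode explicit (the PROFILE flag is hypothetical, not part of this diff):

    # PROFILE is a hypothetical switch for profiling-only runs.
    val_loss = float('inf') if PROFILE else evaluate(model, valid, criterion)
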