Commit 23ac4e3

Switch to torch rnn, using cudnn
1 parent bd4d385 commit 23ac4e3

File tree: 3 files changed, +76 -170 lines

  word_language_model/data.py
  word_language_model/main.py
  word_language_model/rnn_modules.py (deleted)


word_language_model/data.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ def __init__(self):
     def addword(self, word):
         if word not in self.word2idx:
             self.idx2word.append(word)
-            self.word2idx[word] = len(self.idx2word)
+            self.word2idx[word] = len(self.idx2word) - 1
 
         return self.word2idx[word]
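Note: the data.py change fixes an off-by-one in the word-to-index mapping. After appending, the new word sits at index len(self.idx2word) - 1, so the two directions of the dictionary now agree. A minimal sketch of the fixed logic, with the class reduced to the two attributes the diff touches:

# Reduced sketch of the Dictionary mapping from data.py, with the fix applied.
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def addword(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            # The word just appended lives at position len - 1, not len.
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

d = Dictionary()
assert d.addword('the') == 0                    # with the old code this returned 1
assert d.idx2word[d.addword('cat')] == 'cat'    # index and word now round-trip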

word_language_model/main.py

Lines changed: 75 additions & 60 deletions
@@ -1,5 +1,8 @@
 ###############################################################################
 # Language Modeling on Penn Tree Bank
+#
+# With the default parameters, this should achieve ~116 perplexity on the
+# test set.
 ###############################################################################
 
 import argparse
@@ -10,20 +13,19 @@
 import torch.nn as nn
 from torch.autograd import Variable
 
-from rnn_modules import *
 import data
 
 parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')
 
 # Data parameters
 parser.add_argument('-data' , type=str, default='./data/penn', help='Location of the data corpus' )
 # Model parameters.
-parser.add_argument('-model' , type=str, default='LSTM' , help='Type of recurrent net. RNN, LSTM, or GRU.')
+parser.add_argument('-model' , type=str, default='LSTM' , help='Type of recurrent net. RNN_TANH, RNN_RELU, LSTM, or GRU.')
 parser.add_argument('-emsize' , type=int, default=200 , help='Size of word embeddings' )
 parser.add_argument('-nhid' , type=int, default=200 , help='Number of hidden units per layer.' )
 parser.add_argument('-nlayers' , type=int, default=2 , help='Number of layers.' )
 # Optimization parameters.
-parser.add_argument('-lr' , type=float, default=1 , help='Initial learning rate.' )
+parser.add_argument('-lr' , type=float, default=20 , help='Initial learning rate.' )
 parser.add_argument('-clip' , type=float, default=0.5 , help='Gradient clipping.' )
 parser.add_argument('-maxepoch' , type=int, default=6 , help='Upper epoch limit.' )
 parser.add_argument('-batchsize' , type=int, default=20 , help='Batch size.' )
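Note: the -model flag now takes the cuDNN RNN mode names (RNN_TANH, RNN_RELU, LSTM, GRU), and the default learning rate moves from 1 to 20. A plausible invocation with the defaults spelled out (only flags whose definitions appear in this hunk are shown):

python main.py -data ./data/penn -model LSTM -emsize 200 -nhid 200 -nlayers 2 -lr 20 -clip 0.5 -maxepoch 6 -batchsize 20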
@@ -49,18 +51,18 @@
 
 corpus = data.Corpus(args.data)
 
-def batchify(data, bsz, bptt):
-    nbatch = int(math.floor(data.size(0) / bsz / bptt))
-    data = data.narrow(0, 0, nbatch * bptt * bsz)
+def batchify(data, bsz):
+    nbatch = int(math.floor(data.size(0) / bsz))
+    data = data.narrow(0, 0, nbatch * bsz)
     data = data.view(bsz, -1).t().contiguous()
     if args.cuda:
         data = data.cuda()
     return data
 
-train = batchify(corpus.train, args.batchsize, args.bptt)
-valid = batchify(corpus.valid, 10, 1)
-test = batchify(corpus.test, 10, 1)
-
+eval_bsz = 10
+train = batchify(corpus.train, args.batchsize)
+valid = batchify(corpus.valid, eval_bsz)
+test = batchify(corpus.test, eval_bsz)
 bptt = args.bptt
 bsz = args.batchsize
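Note: batchify no longer folds bptt into the layout; it only trims the corpus to a multiple of the batch size and arranges it as (nbatch, bsz), one column per parallel stream, leaving the bptt-length slicing to the training and evaluation loops. A standalone shape sketch of the new layout (the args.cuda branch is dropped here, and the toy corpus is made up):

import torch

def batchify(data, bsz):
    nbatch = data.size(0) // bsz                 # full rows after trimming
    data = data.narrow(0, 0, nbatch * bsz)       # drop the ragged tail
    return data.view(bsz, -1).t().contiguous()   # (nbatch, bsz): time x streams

corpus = torch.arange(0, 26)                     # pretend corpus of 26 token ids
batched = batchify(corpus, 4)
print(batched.size())                            # torch.Size([6, 4])
print(batched[:, 0])                             # first stream: tokens 0..5 in order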

@@ -73,63 +75,62 @@ class RNNModel(nn.Container):
     and a decoder. Runs one RNN step at a time.
     """
 
-    @staticmethod
-    def name2module(name):
-        if name == 'RNN':
-            return RNN
-        elif name == 'LSTM':
-            return LSTM
-        elif name == 'GRU':
-            return GRU
-        else:
-            error("Unknown RNN module: " + name)
-
     def __init__(self, rnnType, ntoken, ninp, nhid, nlayers):
-        rnnModule = RNNModel.name2module(rnnType)
         super(RNNModel, self).__init__(
             encoder = nn.sparse.Embedding(ntoken, ninp),
-            rnn = StackedRNN(rnnModule, ninp, nhid, nlayers),
+            rnn = nn.rnn.RNNBase(rnnType, ninp, nhid, nlayers, bias=False),
             decoder = nn.Linear(nhid, ntoken),
         )
 
-        # FIXME: is this better than the standard init? probably
-        # FIXME: we need better reset_parameters methods in stdlib
+        # FIXME: add stdv named argument to reset_parameters
+        # (and/or to the constructors)
         initrange = 0.1
         self.encoder.weight.data.uniform_(-initrange, initrange)
         self.decoder.bias.data.fill_(0)
         self.decoder.weight.data.uniform_(-initrange, initrange)
 
-    def forward(self, hidden, input):
+    def forward(self, input, hidden):
         emb = self.encoder(input)
-        hidden, output = self.rnn(hidden, emb)
-        decoded = self.decoder(output)
-        return hidden, decoded
-
-    def initHidden(self, bsz):
-        return self.rnn.initHidden(bsz)
+        output, hidden = self.rnn(emb, hidden)
+        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
+        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
 
-model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid, args.nlayers)
+ntokens = corpus.dic.ntokens()
+model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers)
 if args.cuda:
     model.cuda()
 
 criterion = nn.CrossEntropyLoss()
 
+def initHidden(model, bsz):
+    weight = next(model.parameters()).data
+    if args.model == 'LSTM':
+        return (Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()),
+                Variable(weight.new(args.nlayers, bsz, args.nhid).zero_()))
+    else:
+        return Variable(weight.new(args.nlayers, bsz, args.nhid).zero_())
+
+
 ########################################
 # TRAINING
 ########################################
 
 lr = args.lr
 clip = args.clip
-reportinterval = args.reportint * args.batchsize
+reportinterval = args.reportint
+
 
 # Perform the forward pass only.
-def evaluate(model, data, criterion):
+def evaluate(model, data, criterion, bsz):
     loss = 0
-    hidden = model.initHidden(data.size(1))
+    hidden = initHidden(model, bsz)
     # Loop over validation data.
-    for i in range(0, data.size(0) - 1):
-        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
-        loss += criterion(output, Variable(data[i+1], requires_grad=False)).data
+    for i in range(0, data.size(0) - 1, bptt):
+        seq_len = min(bptt, data.size(0) - 1 - i)
+        output, hidden = model(Variable(data[i:i+seq_len], requires_grad=False), hidden)
+        targets = data[i+1:i+seq_len+1].view(-1)
+        loss += bptt * criterion(output.view(seq_len*bsz, -1), Variable(targets, requires_grad=False)).data
+        hidden = repackageHidden(hidden)
 
     return loss[0] / data.size(0)
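Note: the hand-rolled per-step StackedRNN is replaced by nn.rnn.RNNBase, the cuDNN-backed module that consumes a whole (seq_len, bsz) chunk per call, so forward now takes (input, hidden), flattens the RNN output for the decoder, and returns (decoded, hidden); the class docstring's "one RNN step at a time" is now stale. Because an LSTM's hidden state is an (h, c) pair, initHidden becomes a free function that builds either one zero tensor or a tuple of two. A self-contained shape sketch of this whole-sequence call convention (written against today's torch.nn.LSTM purely for illustration, not the nn.rnn.RNNBase spelling used in the commit):

import torch
import torch.nn as nn

seq_len, bsz, ninp, nhid, nlayers = 35, 20, 200, 200, 2
rnn = nn.LSTM(ninp, nhid, nlayers)            # multi-layer LSTM, whole-sequence forward
emb = torch.randn(seq_len, bsz, ninp)         # embedded input chunk, time-major
h0 = torch.zeros(nlayers, bsz, nhid)          # the zeros that initHidden builds
c0 = torch.zeros(nlayers, bsz, nhid)
output, (hn, cn) = rnn(emb, (h0, c0))
print(output.size())                          # torch.Size([35, 20, 200])
# The decoder then flattens output to (seq_len*bsz, nhid) for the Linear layer
# and reshapes the logits back to (seq_len, bsz, ntokens).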

@@ -157,46 +158,60 @@ def repackageHidden(h):
     total_loss = 0
     epoch_start_time = time.time()
     # Start with an initial hidden state.
-    hidden = model.initHidden(bsz)
-    # Loop over the training data.
+    hidden = initHidden(model, bsz)
+
     loss = 0
     i = 0
     model.zero_grad()
-
     total_loss = 0
     start_time = epoch_start_time = time.time()
-    while i < train.size(0) - 1:
-        hidden, output = model(hidden, Variable(train[i], requires_grad=False))
-        loss += criterion(output, Variable(train[i+1], requires_grad=False))
-        i += 1
+    ntokens = corpus.dic.ntokens()
+    # Loop over the training data.
+    for batch, i in enumerate(range(0, train.size(0) - 1, bptt)):
+        seq_len = min(bptt, train.size(0) - 1 - i)
+        output, hidden = model(Variable(train[i:i+seq_len], requires_grad=False), hidden)
+        targets = train[i+1:i+seq_len+1].view(-1)
+        loss = criterion(output.view(-1, ntokens), Variable(targets, requires_grad=False))
+
+        # FIXME: this is the result of a double bug
+        # bug #1: you can't have dangling nodes in the graph to call backward
+        # bug #2: hidden.sum() doesn't work, gives me an error in backward, which I can't reproduce in a simple way
+        # File "/data/users/alerer/pytorch/pytorch/torch/autograd/variable.py", line 82, in backward
+        # self._execution_engine.run_backward(self, gradient, retain_variables)
+        # File "/data/users/alerer/pytorch/pytorch/torch/autograd/functions/reduce.py", line 27, in backward
+        # return grad_output.new(*self.input_size).fill_(grad_output[0])
+        #ValueError: fill_ recieved an invalid combination of argument types - got (torch.cuda.FloatTensor), but expected (float value)
+        if args.model == 'LSTM':
+            loss += 0*hidden[0].sum(0).sum(1).sum(2)
+            loss += 0*hidden[1].sum(0).sum(1).sum(2)
+        else:
+            loss += 0*hidden.sum(0).sum(1).sum(2)
 
-        if i % bptt == 0:
-            loss.backward()
+        loss.backward()
 
-            clipped_lr = lr * clipGradient(model, args.clip)
+        clipped_lr = lr * clipGradient(model, args.clip)
 
-            for p in model.parameters():
-                p.data.sub_(p.grad.mul(clipped_lr))
+        for p in model.parameters():
+            p.data.sub_(p.grad.mul(clipped_lr))
 
-            hidden = repackageHidden(hidden)
-            model.zero_grad()
-            total_loss += loss.data
-            loss = 0
+        hidden = repackageHidden(hidden)
+        model.zero_grad()
+        total_loss += loss.data
+        loss = 0
 
-        if i % reportinterval == 0:
+        if batch % reportinterval == 0 and batch > 0:
             cur_loss = total_loss[0] / reportinterval
             elapsed = time.time() - start_time
             print(
                 ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
-                    epoch, i // bptt), train.size(0) // bptt, lr,
-                    elapsed * 1000 / reportinterval * bptt,
+                    epoch, batch, train.size(0) // bptt, lr, elapsed * 1000 / reportinterval,
                     cur_loss, math.exp(cur_loss)
             ))
             total_loss = 0
             start_time = time.time()
 
-    val_loss = evaluate(model, valid, criterion)
+    val_loss = evaluate(model, valid, criterion, eval_bsz)
 
     print(
         '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
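Note: each pass through the loop now covers one bptt-length chunk, backpropagates, applies the clipped SGD step, and then calls repackageHidden so the next chunk starts from the same hidden values but with the previous graph detached; the 0*hidden.sum(...) terms exist only to work around the autograd limitation described in the FIXME and contribute nothing to the loss. repackageHidden itself is defined elsewhere in main.py and is not part of this diff; a plausible implementation, consistent with how it is called here and with the old Variable API used throughout the file, would be:

# Hypothetical sketch of repackageHidden: keep the hidden-state values,
# drop the autograd history so gradients stop at the chunk boundary.
def repackageHidden(h):
    if isinstance(h, Variable):
        return Variable(h.data)                        # same data, graph-free Variable
    else:
        return tuple(repackageHidden(v) for v in h)    # (h, c) tuple for LSTMs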
@@ -210,7 +225,7 @@ def repackageHidden(h):
     prev_loss = val_loss
 
 # Run on test data.
-test_loss = evaluate(model, test, criterion)
+test_loss = evaluate(model, test, criterion, eval_bsz)
 print(
     '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
         test_loss, math.exp(test_loss)
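Note: the reported perplexities are just the exponential of the average per-token cross-entropy, so the ~116 test perplexity promised in the new file header corresponds to a test loss of roughly 4.75:

import math
print(math.exp(4.75))   # ~115.6, i.e. the ~116 perplexity mentioned in the header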

word_language_model/rnn_modules.py

Lines changed: 0 additions & 109 deletions
This file was deleted.
