Commit 6fde116

initial commit of word language model
1 parent 764ac3b commit 6fde116

File tree: 3 files changed, +365 −0 lines

word_language_model/data.py

Lines changed: 53 additions & 0 deletions
########################################
# Data Fetching Script for PTB
########################################

import torch
import os.path


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def addword(self, word):
        # Assign each new word the next free index, so that
        # idx2word[word2idx[w]] == w and indices run 0..ntokens()-1.
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def ntokens(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dic = Dictionary()
        self.train = self._loadfile(os.path.join(path, 'train.txt'))
        self.valid = self._loadfile(os.path.join(path, 'valid.txt'))
        self.test  = self._loadfile(os.path.join(path, 'test.txt'))

    # Tokenize a text file.
    def _loadfile(self, path):
        assert os.path.exists(path)
        # First pass: build the vocabulary and count tokens.
        tokens = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dic.addword(word)
                    tokens += 1

        # Second pass: write the word ids into a flat LongTensor.
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dic.word2idx[word]
                    token += 1

        # Final dataset.
        return ids
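The two classes above can be exercised without the full PTB download. The following sketch is not part of the commit; it assumes this data.py is importable and PyTorch is installed, and builds a throwaway corpus from a few generated files to inspect the resulting vocabulary and id tensor:

# Sketch only: exercise Dictionary/Corpus on a tiny throwaway dataset.
import os
import tempfile

import data

tmp = tempfile.mkdtemp()
for split in ('train.txt', 'valid.txt', 'test.txt'):
    with open(os.path.join(tmp, split), 'w') as f:
        f.write("the cat sat\nthe dog sat\n")

corpus = data.Corpus(tmp)
print(corpus.dic.ntokens())   # vocabulary size, including '<eos>'
print(corpus.train)           # flat LongTensor of word ids, one per token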

word_language_model/main.py

Lines changed: 222 additions & 0 deletions
###############################################################################
# Language Modeling on Penn Tree Bank
###############################################################################

import argparse
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable

from rnn_modules import *
import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Data parameters.
parser.add_argument('-data',      type=str,   default='./data/penn', help='Location of the data corpus')
# Model parameters.
parser.add_argument('-model',     type=str,   default='RNN',         help='Type of recurrent net. RNN, LSTM, or GRU.')
parser.add_argument('-emsize',    type=int,   default=200,           help='Size of word embeddings')
parser.add_argument('-nhid',      type=int,   default=200,           help='Number of hidden units per layer.')
# Optimization parameters.
parser.add_argument('-lr',        type=float, default=20,            help='Initial learning rate.')
parser.add_argument('-clip',      type=float, default=0.5,           help='Gradient clipping.')
parser.add_argument('-maxepoch',  type=int,   default=6,             help='Upper epoch limit.')
parser.add_argument('-batchsize', type=int,   default=20,            help='Batch size.')
parser.add_argument('-bptt',      type=int,   default=20,            help='Sequence length.')
# Device parameters.
parser.add_argument('-seed',      type=int,   default=1111,          help='Random seed.')
parser.add_argument('-cuda',      action='store_true',               help='Use CUDA.')
# Misc parameters.
parser.add_argument('-reportint', type=int,   default=1000,          help='Report interval.')
parser.add_argument('-save',      type=str,   default='model.pt',    help='Path to save the final model.')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

# Warn if a CUDA device is available but -cuda was not requested.
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with -cuda")

###############################################################################
# LOAD DATA
###############################################################################

corpus = data.Corpus(args.data)

# Lay the flat id stream out as a (nbatch * bptt) x bsz matrix: each column is
# one contiguous stream of text and each row is one time step across the batch.
def batchify(data, bsz, bptt):
    nbatch = int(math.floor(data.size(0) / bsz / bptt))
    data = data.narrow(0, 0, nbatch * bptt * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

train = batchify(corpus.train, args.batchsize, args.bptt)
valid = batchify(corpus.valid, 10, 1)
test  = batchify(corpus.test, 10, 1)
# Only a prefix of the training and validation data is used.
train = train[:10000]
valid = valid[:100]

bptt = args.bptt
bsz  = args.batchsize

###############################################################################
# MAKE MODEL
###############################################################################

initrange = 0.1

class RNNModel(nn.Container):
    """A container module with an encoder, an RNN (one of several flavors),
    and a decoder. Runs one RNN step at a time.
    """

    @staticmethod
    def name2module(name):
        if name == 'RNN':
            return RNN
        elif name == 'LSTM':
            return LSTM
        elif name == 'GRU':
            return GRU
        else:
            raise ValueError("Unknown RNN module: " + name)

    def __init__(self, rnnType, ntoken, ninp, nhid):
        rnnModule = RNNModel.name2module(rnnType)
        super(RNNModel, self).__init__(
            encoder=nn.sparse.Embedding(ntoken, ninp),
            rnn=rnnModule(ninp, nhid),
            decoder=nn.Linear(nhid, ntoken),
        )

        # FIXME: is this better than the standard init? probably
        # FIXME: we need better reset_parameters methods in stdlib
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def __call__(self, hidden, input):
        emb = self.encoder(input)
        hidden, output = self.rnn(hidden, emb)
        decoded = self.decoder(output)
        return hidden, decoded

    def initHidden(self, bsz):
        return self.rnn.initHidden(bsz)

model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

########################################
# TRAINING
########################################

lr = args.lr
clip = args.clip
reportinterval = args.reportint

# Perform the forward pass only.
def evaluate(model, data, criterion):
    loss = 0
    hidden = model.initHidden(data.size(1))
    # Loop over validation data.
    for i in range(0, data.size(0) - 1):
        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
        loss += criterion(output, Variable(data[i + 1], requires_grad=False)).data[0]

    return loss / data.size(0)

# Simple gradient clipping, using the total norm of the gradient.
def clipGradient(model, clip):
    totalnorm = 0
    for p in model.parameters():
        modulenorm = p.grad.norm()
        totalnorm += modulenorm ** 2
    totalnorm = math.sqrt(totalnorm)
    return min(1, clip / (totalnorm + 1e-6))

# Between bptt intervals, we want to maintain the hidden state data
# but don't want to backprop gradients across bptt intervals.
# So we have to rewrap the hidden state in a fresh Variable.
def repackageHidden(h):
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackageHidden(v) for v in h)

# Loop over epochs.
prev_loss = None
for epoch in range(1, args.maxepoch + 1):
    # Start each epoch with a fresh hidden state and zeroed gradients.
    hidden = model.initHidden(bsz)
    model.zero_grad()
    total_loss = 0
    loss = 0
    i = 0
    start_time = epoch_start_time = time.time()
    # Loop over the training data.
    while i < train.size(0) - 1:
        hidden, output = model(hidden, Variable(train[i], requires_grad=False))
        loss += criterion(output, Variable(train[i + 1], requires_grad=False))
        i += 1

        # At each bptt boundary, backprop the accumulated loss, take a clipped
        # SGD step, and detach the hidden state from the graph.
        if i % bptt == 0:
            loss.backward()

            clipped_lr = lr * clipGradient(model, args.clip)

            for p in model.parameters():
                p.data.sub_(p.grad.mul(clipped_lr))

            hidden = repackageHidden(hidden)
            model.zero_grad()
            total_loss += loss.data[0]
            loss = 0

        if i % reportinterval == 0:
            cur_loss = total_loss / reportinterval
            elapsed = time.time() - start_time
            print(
                ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
                    epoch, i, train.size(0), lr, elapsed * 1000 / reportinterval * bsz,
                    cur_loss, math.exp(cur_loss)
                ))
            total_loss = 0
            start_time = time.time()

    val_loss = evaluate(model, valid, criterion)

    print(
        '| end of epoch {:3d} | ms/batch {:5.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time) * 1000 / train.size(0), val_loss, math.exp(val_loss)
        ))

    # The annealing schedule: decay the learning rate if validation loss stops improving.
    if prev_loss and val_loss > prev_loss:
        lr = lr / 4

    prev_loss = val_loss

# Run on test data.
test_loss = evaluate(model, test, criterion)
print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)
    ))

if args.save != '':
    with open(args.save, 'wb') as f:
        torch.save(model, f)
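The least obvious step in main.py is batchify, which reshapes the flat id stream so that each column is one contiguous stream of text and each row is one time step across the batch; backpropagation is then truncated every bptt rows by rewrapping the hidden state in repackageHidden. The following standalone sketch is not part of the commit and uses made-up ids purely to show the layout:

# Sketch only: how batchify arranges a stream of 14 ids with bsz=2, bptt=3.
import math

import torch

ids = torch.LongTensor(list(range(14)))          # stand-in for corpus.train
bsz, bptt = 2, 3
nbatch = int(math.floor(ids.size(0) / bsz / bptt))
ids = ids.narrow(0, 0, nbatch * bptt * bsz)      # trim to 12 ids
print(ids.view(bsz, -1).t().contiguous())
# 6x2 matrix: column 0 holds ids 0..5, column 1 holds ids 6..11,
# so row i is time step i for both batch streams.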

word_language_model/rnn_modules.py

Lines changed: 90 additions & 0 deletions
###############################################################################
# Various RNN Modules
###############################################################################

import torch
import torch.nn as nn
from torch.autograd import Variable

# FIXME: add CUDNN


class RNN(nn.Container):

    def __init__(self, ninp, nhid):
        super(RNN, self).__init__(
            i2h=nn.Linear(ninp, nhid),
            h2h=nn.Linear(nhid, nhid),
            sigmoid=nn.Sigmoid(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        # Elman RNN step: the new hidden state doubles as the output.
        nexth = self.sigmoid(self.h2h(hidden) + self.i2h(input))
        return nexth, nexth

    def initHidden(self, bsz):
        return Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_())


class LSTM(nn.Container):

    def __init__(self, ninp, nhid):
        super(LSTM, self).__init__(
            i2h=nn.Linear(ninp, 4 * nhid),
            h2h=nn.Linear(nhid, 4 * nhid),
            sigmoid=nn.Sigmoid(),
            tanh=nn.Tanh(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        c, h = hidden
        # Compute all four gates in one matrix multiply, then split them.
        gates = self.h2h(h) + self.i2h(input)
        gates = gates.view(input.size(0), 4, self.nhid).transpose(0, 1)

        ingate     = self.sigmoid(gates[0])
        cellgate   = self.tanh(gates[1])
        forgetgate = self.sigmoid(gates[2])
        outgate    = self.sigmoid(gates[3])

        nextc = (forgetgate * c) + (ingate * cellgate)
        nexth = outgate * self.tanh(nextc)

        return (nextc, nexth), nexth

    def initHidden(self, bsz):
        return (Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_()),
                Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_()))


class GRU(nn.Container):

    def __init__(self, ninp, nhid):
        super(GRU, self).__init__(
            i2h=nn.Linear(ninp, 3 * nhid),
            h2h=nn.Linear(nhid, 3 * nhid),
            sigmoid=nn.Sigmoid(),
            tanh=nn.Tanh(),
        )
        self.ninp = ninp
        self.nhid = nhid

    def __call__(self, hidden, input):
        # Compute the three gate pre-activations in one multiply per projection,
        # then split them the same way the LSTM does.
        gi = self.i2h(input).view(input.size(0), 3, self.nhid).transpose(0, 1)
        gh = self.h2h(hidden).view(input.size(0), 3, self.nhid).transpose(0, 1)

        resetgate  = self.sigmoid(gi[0] + gh[0])
        updategate = self.sigmoid(gi[1] + gh[1])

        output = self.tanh(gi[2] + resetgate * gh[2])
        nexth = hidden + updategate * (output - hidden)

        return nexth, output

    def initHidden(self, bsz):
        return Variable(self.h2h.weight.data.new(bsz, self.nhid).zero_())
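All three modules share the same interface: initHidden(bsz) builds a zero state, and calling the module with (hidden, input) returns the new state plus the per-step output. The sketch below is not part of the commit; it assumes rnn_modules.py is importable under the same old, Variable-based PyTorch the commit targets, and drives a few LSTM steps on random stand-in embeddings:

# Sketch only: run one of the RNN modules step by step.
import torch
from torch.autograd import Variable

from rnn_modules import LSTM

bsz, ninp, nhid = 4, 8, 16
rnn = LSTM(ninp, nhid)
hidden = rnn.initHidden(bsz)              # (cell, hidden) pair of zeros
for _ in range(3):
    x = Variable(torch.randn(bsz, ninp))  # one time step of embeddings
    hidden, output = rnn(hidden, x)       # returns (new state, step output)
print(output.size())                      # (bsz, nhid)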
