###############################################################################
# Language Modeling on Penn Tree Bank
###############################################################################

import argparse
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable

from rnn_modules import *
import data

parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')

# Data parameters.
parser.add_argument('-data', type=str, default='./data/penn', help='Location of the data corpus.')
# Model parameters.
parser.add_argument('-model', type=str, default='RNN', help='Type of recurrent net. RNN, LSTM, or GRU.')
parser.add_argument('-emsize', type=int, default=200, help='Size of word embeddings.')
parser.add_argument('-nhid', type=int, default=200, help='Number of hidden units per layer.')
# Optimization parameters.
parser.add_argument('-lr', type=float, default=20, help='Initial learning rate.')
parser.add_argument('-clip', type=float, default=0.5, help='Gradient clipping.')
parser.add_argument('-maxepoch', type=int, default=6, help='Upper epoch limit.')
parser.add_argument('-batchsize', type=int, default=20, help='Batch size.')
parser.add_argument('-bptt', type=int, default=20, help='Sequence length.')
# Device parameters.
parser.add_argument('-seed', type=int, default=1111, help='Random seed.')
parser.add_argument('-cuda', action='store_true', help='Use CUDA.')
# Misc parameters.
parser.add_argument('-reportint', type=int, default=1000, help='Report interval.')
parser.add_argument('-save', type=str, default='model.pt', help='Path to save the final model.')
args = parser.parse_args()

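# Example invocation (assuming this script is saved as main.py; the file name
# is just a placeholder):
#   python main.py -data ./data/penn -model LSTM -cuda
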
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

# If a CUDA device is present but unused, warn the user.
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with -cuda")

###############################################################################
# LOAD DATA
###############################################################################

corpus = data.Corpus(args.data)

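# batchify() lays the corpus out as a (nbatch * bptt) x bsz matrix: each column
# holds one contiguous chunk of the corpus, so row i+1 contains the prediction
# targets for row i. The training loop below walks down the rows one step at a
# time and backpropagates every bptt rows.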
def batchify(data, bsz, bptt):
    nbatch = int(math.floor(data.size(0) / bsz / bptt))
    data = data.narrow(0, 0, nbatch * bptt * bsz)
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

| 58 | + |
| 59 | +train = batchify(corpus.train, args.batchsize, args.bptt) |
| 60 | +valid = batchify(corpus.valid, 10, 1) |
| 61 | +test = batchify(corpus.test, 10, 1) |
| 62 | +train = train[:10000] |
| 63 | +valid = valid[:100] |
| 64 | + |
| 65 | +bptt = args.bptt |
| 66 | +bsz = args.batchsize |
| 67 | + |
###############################################################################
# MAKE MODEL
###############################################################################

initrange = 0.1

class RNNModel(nn.Container):
    """A container module with an encoder, an RNN (one of several flavors),
    and a decoder. Runs one RNN step at a time.
    """

    @staticmethod
    def name2module(name):
        if name == 'RNN':
            return RNN
        elif name == 'LSTM':
            return LSTM
        elif name == 'GRU':
            return GRU
        else:
            raise ValueError("Unknown RNN module: " + name)

    def __init__(self, rnnType, ntoken, ninp, nhid):
        rnnModule = RNNModel.name2module(rnnType)
        super(RNNModel, self).__init__(
            encoder = nn.sparse.Embedding(ntoken, ninp),
            rnn = rnnModule(ninp, nhid),
            decoder = nn.Linear(nhid, ntoken),
        )

        # FIXME: is this better than the standard init? probably
        # FIXME: we need better reset_parameters methods in stdlib
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def __call__(self, hidden, input):
        emb = self.encoder(input)
        hidden, output = self.rnn(hidden, emb)
        decoded = self.decoder(output)
        return hidden, decoded

    def initHidden(self, bsz):
        return self.rnn.initHidden(bsz)

model = RNNModel(args.model, corpus.dic.ntokens(), args.emsize, args.nhid)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

###############################################################################
# TRAINING
###############################################################################

lr = args.lr
clip = args.clip
reportinterval = args.reportint

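# Training sketch: plain SGD with truncated backpropagation through time.
# Every bptt steps the accumulated loss is backpropagated, the update is
# rescaled so that the total gradient norm stays below `clip`, and the
# learning rate is divided by 4 at the end of any epoch where the validation
# loss stops improving.
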
# Perform the forward pass only.
def evaluate(model, data, criterion):
    loss = 0
    hidden = model.initHidden(data.size(1))
    # Loop over validation data.
    for i in range(0, data.size(0) - 1):
        hidden, output = model(hidden, Variable(data[i], requires_grad=False))
        loss += criterion(output, Variable(data[i+1], requires_grad=False)).data[0]

    # Average over the number of predicted positions.
    return loss / (data.size(0) - 1)

# Simple gradient clipping, using the total norm of the gradient.
# Returns the factor by which the learning rate should be scaled down.
def clipGradient(model, clip):
    totalnorm = 0
    for p in model.parameters():
        modulenorm = p.grad.norm()
        totalnorm += modulenorm ** 2
    totalnorm = math.sqrt(totalnorm)
    return min(1, clip / (totalnorm + 1e-6))

# Between bptt intervals, we want to maintain the hidden state data
# but don't want to backprop gradients across bptt intervals.
# So we have to rewrap the hidden state in a fresh Variable.
def repackageHidden(h):
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackageHidden(v) for v in h)

# Loop over epochs.
prev_loss = None
for epoch in range(1, args.maxepoch+1):
    # Start with an initial hidden state.
    hidden = model.initHidden(bsz)
    # Loop over the training data.
    loss = 0
    i = 0
    model.zero_grad()

    total_loss = 0
    start_time = epoch_start_time = time.time()
    while i < train.size(0) - 1:
        hidden, output = model(hidden, Variable(train[i], requires_grad=False))
        loss += criterion(output, Variable(train[i+1], requires_grad=False))
        i += 1

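        # Every bptt steps: backprop through the accumulated loss, scale the
        # update so the gradient norm stays below the clipping threshold, take
        # an SGD step, then rewrap the hidden state so gradients do not flow
        # across bptt boundaries.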
        if i % bptt == 0:
            loss.backward()

            clipped_lr = lr * clipGradient(model, args.clip)

            for p in model.parameters():
                p.data.sub_(p.grad.mul(clipped_lr))

            hidden = repackageHidden(hidden)
            model.zero_grad()
            total_loss += loss.data[0]
            loss = 0

        if i % reportinterval == 0:
            cur_loss = total_loss / reportinterval
            elapsed = time.time() - start_time
            print(
                ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                 + 'train loss {:5.2f} | train ppl {:8.2f}').format(
                    epoch, i, train.size(0), lr, elapsed * 1000 / reportinterval * bsz,
                    cur_loss, math.exp(cur_loss)
            ))
            total_loss = 0
            start_time = time.time()

    val_loss = evaluate(model, valid, criterion)

    print(
        '| end of epoch {:3d} | ms/batch {:5.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time) * 1000 / train.size(0), val_loss, math.exp(val_loss)
    ))

    # The annealing schedule: shrink the learning rate if the validation loss stopped improving.
    if prev_loss and val_loss > prev_loss:
        lr = lr / 4

    prev_loss = val_loss

# Run on test data.
test_loss = evaluate(model, test, criterion)
print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)
))

if args.save != '':
    with open(args.save, 'wb') as f:
        torch.save(model, f)