Language Model Exercises#

In these exercises you will extend and develop language models. We will use the code from the notes, which is provided here as the Python module `statnlpbook.lm`.

Setup 1: Load Libraries#

%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys, os
_snlp_book_dir = ".."
sys.path.append(_snlp_book_dir) 
from statnlpbook.lm import *
from statnlpbook.ohhla import *
# %cd .. 
import sys
sys.path.append("..")
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Input In [1], in <cell line: 7>()
      5 _snlp_book_dir = ".."
      6 sys.path.append(_snlp_book_dir) 
----> 7 from statnlpbook.lm import *
      8 from statnlpbook.ohhla import *
      9 # %cd .. 

ModuleNotFoundError: No module named 'statnlpbook'

$$ \newcommand{\prob}{p} \newcommand{\vocab}{V} \newcommand{\params}{\boldsymbol{\theta}} \newcommand{\param}{\theta} \DeclareMathOperator{\perplexity}{PP} \DeclareMathOperator{\argmax}{argmax} \newcommand{\train}{\mathcal{D}} \newcommand{\counts}[2]{\#_{#1}(#2) } $$

Setup 2: Load Data#

# Load every song found under the j_live directory of the OHHLA training data.
docs = load_all_songs("../data/ohhla/train/www.ohhla.com/anonymous/j_live/")
# The exercises expect exactly 50 documents; anything else means a broken download.
assert len(docs) == 50, "Your ohhla corpus is corrupted, please download it again!"
# First half of the documents is the training set, second half the test set.
trainDocs = docs[:len(docs) // 2]
testDocs = docs[len(docs) // 2:]
# Convert both document sets with `words` (defined in the statnlpbook package).
train, test = words(trainDocs), words(testDocs)

Task 1: Optimal Pseudo Count#

Plot the perplexity for Laplace smoothing on the given data as a function of alpha in the interval [0.001, 0.1] in steps of 0.001. Is it fair to assume that this is a convex function? Write a method that finds the optimal pseudo count alpha for Laplace smoothing on the given data, up to some predefined numerical precision epsilon, under the assumption that the perplexity is a convex function of alpha. How often did you have to call perplexity to find the optimum?

Tips: You don’t need 1st or 2nd order derivatives in this case, only the gradient descent direction. Think about recursively slicing up the problem.

# Prepare OOV-aware data for the tasks below.
# inject_OOVs / replace_OOVs come from statnlpbook; presumably they substitute the
# OOV marker for unseen words — confirm against the package.
oov_train = inject_OOVs(train)
# Vocabulary = the distinct tokens of the OOV-injected training data.
oov_vocab = set(oov_train)
oov_test = replace_OOVs(oov_vocab, test)
# Order-2 n-gram model built from the OOV-injected training data.
bigram = NGramLM(oov_train,2)

# Candidate pseudo counts for Task 1: 0.001, 0.002, ..., 0.1.
# range() excludes its stop value, so the stop must be 101 for the closed
# interval [0.001, 0.1] requested by the task to include its endpoint 0.1.
interval = [x / 1000.0 for x in range(1, 101)]
# Perplexity on oov_test of LaplaceLM wrapped around the bigram model with alpha = 1.0.
perplexity_at_1 = perplexity(LaplaceLM(bigram, alpha=1.0), oov_test)

def plot_perplexities(interval):
    """Plot the perplexity of LaplaceLM for every alpha in `interval`.

    Exercise stub: until the todo below is solved, a placeholder value of
    0.0 is plotted for each alpha.
    """
    # todo: replace the placeholder with the real perplexity for each alpha
    placeholder_values = [0.0 for _ in interval]
    plt.plot(interval, placeholder_values)
    
def find_optimal(low, high, epsilon=1e-6):
    """Return the optimal pseudo count alpha within [low, high] and its perplexity.

    Exercise stub: both the base case and the recursive case still return
    the placeholder 0.0. Every call prints `high` followed by `low`.
    """
    print(high, low)
    interval_is_small_enough = high - low < epsilon
    if interval_is_small_enough:
        return 0.0  # todo: base case, the interval is narrower than epsilon
    return 0.0  # todo: narrow the interval and recurse

# Run Task 1: draw the perplexity plot over `interval`, then search [0.0, 1.0].
# The last expression is left bare so the notebook displays its return value.
plot_perplexities(interval)        
find_optimal(0.0, 1.0)
1.0 0.0
0.0
../_images/language_models_7_2.png

Task 2: Sanity Check LM#

Implement a method that tests whether a language model provides a valid probability distribution.

def sanity_check(lm, *history):
    """Raise an AssertionError if `lm` is not a valid distribution given `history`.

    The probabilities of all words in the vocabulary are meant to sum to 1
    (within 1e-6). Exercise stub: the probability mass is still hard-coded
    to 1.0, so the assertion currently always passes.
    """
    probability_mass = 1.0  # todo: compute the actual mass from lm
    deviation = abs(probability_mass - 1.0)
    assert deviation < 1e-6, probability_mass

unigram = NGramLM(oov_train, 1)
# Bigram model that backs off to the unigram model with factor 0.1.
stupid = StupidBackoff(bigram, unigram, 0.1)
# Total mass StupidBackoff assigns to the vocabulary after the history 'the'.
print(sum(stupid.probability(word, 'the') for word in stupid.vocab))
sanity_check(stupid, 'the')
1.0647115579930904

Task 3: Subtract Count LM#

Develop and implement a language model that subtracts a count $d\in[0,1]$ from each non-zero count in the training set. Let’s first formalise this:

\begin{align} \#_{w=0}(h_n) &= \sum_{w \in V} \mathbf{1}[\counts{\train}{h_n,w} = 0]\\ \#_{w>0}(h_n) &= \sum_{w \in V} \mathbf{1}[\counts{\train}{h_n,w} > 0]\\ \prob(w|h_n) &= \begin{cases} \frac{\counts{\train}{h_n,w} - d}{\counts{\train}{h_n}} & \mbox{if }\counts{\train}{h_n,w} > 0 \\ \frac{???}{\counts{\train}{h_n}} & \mbox{otherwise} \end{cases} \end{align}

class SubtractCount(CountLM):
    """Count-based LM intended to subtract a constant `d` from every non-zero count.

    Exercise stub: `counts` still returns the placeholder 0.0 in both branches.
    """

    def __init__(self, base_lm, d):
        """Wrap `base_lm`, reusing its vocabulary, order and raw counts."""
        super().__init__(base_lm.vocab, base_lm.order)
        self.base_lm = base_lm
        self.d = d
        # Not good style: _counts is a protected member of base_lm.
        self._counts = base_lm._counts
        self.vocab = base_lm.vocab

    def counts(self, word_and_history):
        """Return the adjusted count for `word_and_history` (placeholder for now)."""
        was_observed = self._counts[word_and_history] > 0
        if was_observed:
            return 0.0  # todo: count for events seen in training
        return 0.0  # todo: count for events never seen in training

    def norm(self, history):
        """Return the normaliser of the wrapped model for `history`."""
        return self.base_lm.norm(history)
    
subtract_lm = SubtractCount(unigram, 0.1)
# Mass given to the OOV token plus the mass summed over the model's vocabulary.
oov_prob = subtract_lm.probability(OOV, 'the')
rest_prob = sum(subtract_lm.probability(word, 'the') for word in subtract_lm.vocab)
print(oov_prob + rest_prob)
sanity_check(subtract_lm, 'the')
# Left as a bare expression so the notebook displays the perplexity.
perplexity(subtract_lm, oov_test)
0.0
inf

Task 4: Normalisation of Stupid LM#

Develop and implement a version of the stupid backoff language model whose probabilities sum to 1.

class StupidBackoffNormalized(LanguageModel):
    """Stupid backoff variant whose probabilities are meant to sum to 1.

    Exercise stub: `probability` still returns the placeholder 0.0.
    """

    def __init__(self, main, backoff, alpha):
        """Store the main model, the backoff model and the factor `alpha`."""
        super().__init__(main.vocab, main.order)
        self.main, self.backoff, self.alpha = main, backoff, alpha

    def probability(self, word, *history):
        """Return the probability of `word` given `history` (placeholder for now)."""
        return 0.0  # todo
        
less_stupid = StupidBackoffNormalized(bigram, unigram, 0.1)
# Total mass the normalised model assigns to the vocabulary after the history 'the'.
print(sum(less_stupid.probability(word, 'the') for word in less_stupid.vocab))
sanity_check(less_stupid, 'the')
# Left as a bare expression so the notebook displays the perplexity.
perplexity(less_stupid, oov_test)
0.0
inf