# Czech Spell-checking using RNNs

In this lab, your goal is to train a model which decides whether you should write a 'y' or an 'i' in a Czech text.

## Data

The data come from the Czech Wikipedia. The text was split into sentences and cleaned so that it contains only characters from the Czech alphabet. All Ys were replaced by Is. The targets are encoded in the following way:

* 1 is __i__
* 2 is __y__
* 0 are all the remaining characters

In total there are:
* 500k training sentences
* 400 validation sentences

In [None]:
!curl http://ufallab.ms.mff.cuni.cz/~libovicky/yi.tgz | tar zxvf -

In [None]:
# Load the validation split: one sentence per line in val_text.txt and the
# matching string of target digits per line in val_target.txt.
# The encoding is given explicitly so that the Czech diacritics are decoded
# the same way regardless of the platform's default locale.
with open("val_text.txt", encoding="utf-8") as f_text, \
        open("val_target.txt", encoding="utf-8") as f_yi:
    val_text = [line.rstrip() for line in f_text]
    val_yis = [line.rstrip() for line in f_yi]
# Sanity check: every sentence needs its line of targets.
assert len(val_text) == len(val_yis)
print("{} validation sentences.".format(len(val_text)))

In [None]:
# Show the first five validation sentences, each followed by its target
# string and an empty line.
for sentence, yis in zip(val_text[:5], val_yis[:5]):
    print(sentence, yis, "", sep="\n")

In [None]:
# Load the training split: one sentence per line in train_text.txt and the
# matching string of target digits per line in train_target.txt.
# The encoding is given explicitly so that the Czech diacritics are decoded
# the same way regardless of the platform's default locale.
with open("train_text.txt", encoding="utf-8") as f_text, \
        open("train_target.txt", encoding="utf-8") as f_yi:
    train_text = [line.rstrip() for line in f_text]
    train_yis = [line.rstrip() for line in f_yi]
# Sanity check: every sentence needs its line of targets.
assert len(train_text) == len(train_yis)
print("{} training sentences.".format(len(train_text)))

## Rule-based baseline

Very simple rules based on grammar.

In [None]:
# Consonants after which this baseline always predicts 'y'.
Y_CONSONANTS = {"h", "k", "r"}


def apply_char_rules(sentence):
    """Label every character of a sentence with a simple rule-based guess.

    Args:
        sentence: Text in which every y has been replaced by i.

    Returns:
        A string with one digit per input character: "0" for anything that
        is not i/í, "2" (y) when the i/í follows one of Y_CONSONANTS or a
        "v" that starts a word, and "1" (i) otherwise.
    """
    labels = []
    two_back = ""
    one_back = ""
    for current in sentence:
        if current not in ("i", "í"):
            label = "0"
        elif one_back in Y_CONSONANTS:
            label = "2"
        elif one_back == "v" and two_back in (" ", ""):
            # "v" at the very beginning of a word (or of the sentence)
            label = "2"
        else:
            label = "1"
        labels.append(label)

        two_back, one_back = one_back, current
    return "".join(labels)

In [None]:
# Score the rule-based baseline on the validation data. Only positions where
# the rules emit a 1 or a 2 are counted.
correct = 0.
total = 0.
for sentence, yis in zip(val_text, val_yis):
    estimation = apply_char_rules(sentence)

    for est, cor in zip(estimation, yis):
        if est not in ("1", "2"):
            continue
        total += 1
        if est == cor:
            correct += 1

print("accuracy: {}".format(correct / total))

## Statistical baseline

Always use the most frequent spelling of each word seen in the training data.

In [None]:
import string
# Characters that separate words.
SPLITS = set(string.punctuation + string.whitespace)
# Maps a word (with every y written as i) to a dict from target string to the
# number of times that spelling was seen in the training data.
count_table = {}


def _most_likely_labels(word):
    """Return the target string estimated for a single word.

    Uses the most frequent spelling stored in count_table; words that are not
    in the table fall back to labelling every i/í as "1" (i).
    """
    entries = count_table.get(word)
    if entries:
        return max(entries, key=entries.get)
    return "".join("1" if c in ("i", "í") else "0" for c in word)


def evaluation():
    """Compute the accuracy of the most-frequent-spelling baseline.

    Reads the validation data from the globals val_text and val_yis and the
    statistics collected so far from count_table.

    Returns:
        Proportion of validation y/i positions that were guessed correctly,
        or 0.0 when the validation data contain no y/i at all.
    """
    correct = 0
    total = 0

    for sent, solutions in zip(val_text, val_yis):
        pieces = []
        # The buffer starts empty for every sentence so that an unfinished
        # word cannot leak into the next sentence and shift its estimates.
        word_buf = []
        for char in sent:
            if char in SPLITS:
                pieces.append(_most_likely_labels("".join(word_buf)))
                pieces.append("0")
                word_buf = []
            else:
                word_buf.append(char)
        # Flush the last word of a sentence that does not end with a
        # separator; otherwise its y/i positions would never be scored.
        if word_buf:
            pieces.append(_most_likely_labels("".join(word_buf)))
        estimated = "".join(pieces)

        for real, guess in zip(solutions, estimated):
            if real != "0":
                total += 1
                correct += int(real == guess)

    return correct / total if total else 0.0


def _record_word(word_buf, solution_buf):
    """Add one observed word and its target string to count_table.

    Words without any y/i (all targets are 0) are skipped.
    """
    if not sum(solution_buf):
        return
    word = "".join(word_buf)
    solution = "".join(str(x) for x in solution_buf)
    spellings = count_table.setdefault(word, {})
    spellings[solution] = spellings.get(solution, 0) + 1


# Collect the spelling statistics sentence by sentence and report the
# validation accuracy after every 1000 training sentences.
for i, (sent, solutions) in enumerate(zip(train_text, train_yis)):
    # Buffers are reset per sentence so that words never span two sentences.
    word_buf = []
    solution_buf = []
    for char, clazz in zip(sent, solutions):
        if char in SPLITS:
            _record_word(word_buf, solution_buf)
            word_buf = []
            solution_buf = []
        else:
            word_buf.append(char)
            solution_buf.append(int(clazz))
    # Flush the last word of a sentence that does not end with a separator.
    _record_word(word_buf, solution_buf)

    if i % 1000 == 0:
        acc = evaluation()
        print("{}\t{}".format(i, acc))


## Classification using RNN

Use bi-directional RNN to classify.

Please fill in all __TODOs__ in the following cell. They are spread over three functions that build the TensorFlow graph. Once you are done, you should be able to run the following two cells with the service code for training the model.

When you are done, you can play around with the model.

* add dropout between embeddings and RNN and between RNN and the classifier
* try LSTM / layer-normalized LSTM instead of GRU
* vary dimensionality of the model
* try a multilayer output projection with ReLU activation

In [None]:
from collections import namedtuple
import tensorflow as tf

# Input vocabulary: one entry per character. Note that the space character
# is listed twice in the second literal.
ALPHABET = list(
    "_aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxzž"
    "0123456789\"'?!.,% #@$(){}[]- ")
# Output labels, in index order.
TARGET_CLASSES = list("012")

# Bundle of the graph nodes that the training code needs to access.
Network = namedtuple(
    "Network", "input lengths targets predictions cost")


def build_network(embedding_size=32, gru_size=256):
    """RNN architecture for guessing y/i.

    Embed the sequence and call the RNN and the classifier on top of it.

    This is an exercise template: the parts marked TODO contain `...`
    placeholders that must be filled in before the graph can be built.

    Args:
        embedding_size: Dimensionality of character embeddings.
        gru_size: Dimensionality of the recurrent cell.

    Returns:
        A Network named tuple of tensors representing the network:
        placeholders for input characters (batch x sentence length),
        sequence lengths (batch) and targets (batch x sentence length);
        tensors with predictions (batch x sentence length x number of
        target classes) and cost (scalar).

    """
    # Integer placeholders; batch size and sentence length are left dynamic.
    seq_input = tf.placeholder(tf.int32, shape=[None, None])
    lengths = tf.placeholder(tf.int32, shape=[None])
    targets = tf.placeholder(tf.int32, shape=[None, None])

    # TODO fill the correct shape of the embeddings
    embeddings = tf.get_variable(
        "embeddings", shape=[..., ...])

    # TODO do embedding lookup
    embedded_sequence = ...
    rnn_output = _bidirectional_layer(embedded_sequence, lengths, gru_size)

    # The classifier is told to expect 2 * gru_size features because the
    # bidirectional layer is documented to return both directions together.
    logits_flatten, predictions = _classifier(rnn_output, 2 * gru_size, lengths)

    cost = _cost_function(logits_flatten, targets)

    return Network(seq_input, lengths, targets, predictions, cost)


def _bidirectional_layer(embedded_sequence, lengths, gru_size):
    """Apply a bidirectional RNN over an embedded sequence.

    This is an exercise template: every assignment of `...` below is a
    placeholder that must be replaced by the real TensorFlow call.

    Args:
        embedded_sequence: A 3D float tensor of shape (batch x sentence
            length x embedding)
        lengths: A 1D integer tensor of shape (batch)
        gru_size: Dimension of the GRU cell.

    Returns:
        A 3D float tensor with RNN output (batch x sentence length x 2 gru_size)
    """
    with tf.variable_scope("bidi_layer"):
        # TODO initialize GRU cell for forward and backward network
        cell_fw = ...
        cell_bw = ...
        # TODO run bidirectional dynamic rnn on embedded sequence
        (output_fw, output_bw), _ = ...

        # TODO concatenate the output tensors for both directions
        rnn_outputs = ...
    return rnn_outputs


def _classifier(rnn_output, rnn_out_size, lengths):
    """Project the RNN outputs to scores of the target classes.

    This is an exercise template: every assignment of `...` below is a
    placeholder that must be replaced by the real TensorFlow code.

    Args:
        rnn_output: Tensor with the RNN outputs; it is flattened into rows
            of width rnn_out_size before the projection.
        rnn_out_size: Size of the last dimension of rnn_output.
        lengths: A 1D integer tensor with sentence lengths; its maximum is
            used as the sentence dimension of the returned predictions.

    Returns:
        A pair (logits_flatten, predictions). predictions is logits_flatten
        reshaped to (batch x max(lengths) x number of target classes); no
        softmax or argmax is applied here.
    """
    with tf.variable_scope("Classifier"):
        # the RNN outputs should be reshaped into a 2D tensor in order to
        # apply the linear projection
        outputs_reshaped = tf.reshape(rnn_output, shape=[-1, rnn_out_size])

        # TODO define learnable variables for linear projection
        weight_matrix = ...
        bias = ...
        # TODO do the linear projection
        logits_flatten = ...

        # NOTE(review): this reshape assumes that the input batch is padded
        # exactly to the longest sentence in it - confirm against the caller.
        predictions = tf.reshape(
            logits_flatten, shape=[-1, tf.reduce_max(lengths), len(TARGET_CLASSES)])

    return logits_flatten, predictions


def _cost_function(logits_flatten, targets):
    """Compute the mean cross-entropy over the y/i positions only.

    Args:
        logits_flatten: Float tensor with one row of class scores per
            character position.
        targets: Integer tensor with the target class indices; it is
            flattened so that it lines up with the rows of logits_flatten.

    Returns:
        A scalar tensor with the cross-entropy averaged over the positions
        whose target is greater than 0. It is 0 when there is no such
        position.
    """
    with tf.variable_scope("Cost"):
        targets_flatten = tf.reshape(targets, shape=[-1])

        # mask with 1s with y/i and 0s elsewhere
        relevance_mask = tf.cast(tf.greater(targets_flatten, 0), tf.float32)

        # Keyword arguments are required by TensorFlow >= 1.0 and are also
        # accepted by the older releases that took (logits, labels).
        xent_per_char = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets_flatten, logits=logits_flatten)

        # The mask sum is a whole number, so clamping it to at least 1 only
        # changes the result for a batch without any y/i, where the
        # unclamped division would produce NaN.
        relevant_count = tf.maximum(tf.reduce_sum(relevance_mask), 1.0)

        # compute the cost only at the position with y/i
        cost = tf.reduce_sum(xent_per_char * relevance_mask) / relevant_count

    return cost

In [None]:
import itertools
import re
import sys
import numpy as np
import tensorflow as tf
from build_network import build_network, ALPHABET, TARGET_CLASSES

# Lookup tables from a symbol to its position in the respective list. If a
# symbol occurs more than once in the list, the last position is used.
ALPHABET_DICT = dict(
    (symbol, position) for position, symbol in enumerate(ALPHABET))
TARGET_DICT = dict(
    (label, position) for position, label in enumerate(TARGET_CLASSES))

# Longer texts are truncated to this many characters.
MAX_LEN = 200


def data_to_tensor(texts, dictionary):
    """Convert text data into numpy tensors of indices.

    Trailing whitespace is stripped from every text and the text is cut to
    at most MAX_LEN characters. Characters missing from the dictionary are
    mapped to index 0, which is also used for padding shorter texts.

    Args:
        texts: Iterable of strings to be converted.
        dictionary: Dictionary with indices for characters.

    Returns:
        Tuple of numpy array with the indices (texts x longest text) and
        numpy array with string lengths. For empty input both arrays have
        shape (0,).
    """
    text_indices = [[dictionary.get(c, 0) for c in text.rstrip()[:MAX_LEN]]
                    for text in texts]
    lengths = [len(indices) for indices in text_indices]
    # max() raises on an empty sequence; an empty batch must instead yield
    # empty arrays so that callers can detect the end of the data.
    max_len = max(lengths) if lengths else 0
    matrix = [indices + [0] * (max_len - len(indices))
              for indices in text_indices]
    return np.array(matrix), np.array(lengths)


def evaluation(logits, targets, lengths):
    """Measure how many y/i positions the model labelled correctly.

    Args:
        logits: Class scores from the model (batch x sentence length x
            number of classes).
        targets: Ground-truth class indices (batch x sentence length).
        lengths: Number of valid positions in every sentence; everything
            behind it is ignored.

    Returns:
        Accuracy - the share of positions with a target greater than 0 whose
        highest-scoring class equals the target.
    """
    best_classes = np.argmax(logits, axis=2)

    hits = 0.
    relevant = 0.

    for row_best, row_gold, size in zip(best_classes, targets, lengths):
        gold = row_gold[:size]
        is_yi = gold > 0
        relevant += np.sum(is_yi)
        hits += np.sum((row_best[:size] == gold) & is_yi)

    return hits / relevant


def get_train_op(model):
    """Prepare TF training operation with L2 regularization.

    The regularization term is 1e-6 times the sum of squared values of all
    trainable variables whose name does not contain "bias" or "Bias"; it is
    added to the model cost, which is then minimized by Adam with learning
    rate 1e-4.

    Args:
        model: The network; only its `cost` attribute is read.

    Returns:
        The TensorFlow operation performing one optimization step.
    """
    bias_regex = re.compile(r'[Bb]ias')
    # Biases are left out of the weight decay; every other trainable
    # variable contributes the sum of its squared entries exactly once.
    squared_norms = [tf.reduce_sum(v ** 2)
                     for v in tf.trainable_variables()
                     if not bias_regex.search(v.name)]

    l2_value = sum(squared_norms)
    l2_cost = 1e-6 * l2_value

    cost = model.cost + l2_cost
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(cost)
    return train_op

In [None]:
model = build_network()
print("The graph has been built.")

# The first 400 lines of the two files given on the command line serve as
# validation data; the files are closed as soon as they have been read.
with open(sys.argv[1], encoding="utf-8") as f_text, \
        open(sys.argv[2]) as f_cap:
    val_text, val_lengths = data_to_tensor(
        itertools.islice(f_text, 400), ALPHABET_DICT)
    val_cap, _ = data_to_tensor(itertools.islice(f_cap, 400), TARGET_DICT)
print("Validation data are ready.")

train_op = get_train_op(model)
print("Optimizer has been built.")

session = tf.Session(config=tf.ConfigProto(
    intra_op_parallelism_threads=2))
session.run(tf.initialize_all_variables())
print("Session initialized.")

batch_size = 50
batch_n = 0

# One pass over the training data in batches of batch_size sentences.
while True:
    # Batch numbering in the log starts at 1, slicing starts at sentence 0,
    # so the first batch_size sentences are used as well.
    start = batch_size * batch_n
    batch_texts = train_text[start:start + batch_size]
    if not batch_texts:
        # all training data consumed
        break
    batch_n += 1

    text_batch, batch_lengths = data_to_tensor(batch_texts, ALPHABET_DICT)
    cap_batch, _ = data_to_tensor(
        train_yis[start:start + batch_size], TARGET_DICT)

    _, predictions, cross_entropy = session.run(
        [train_op, model.predictions, model.cost],
        feed_dict={model.input: text_batch,
                   model.targets: cap_batch,
                   model.lengths: batch_lengths})
    accuracy = evaluation(
        predictions, cap_batch, batch_lengths)
    print("batch {}:\tacc: {:.4f}\txent: {:.4f}".format(
        batch_n, accuracy, cross_entropy))

    if batch_n % 10 == 0:
        # Validation run: no train_op, so the parameters are not updated.
        predictions, cross_entropy = session.run(
            [model.predictions, model.cost],
            feed_dict={model.input: val_text,
                       model.targets: val_cap,
                       model.lengths: val_lengths})
        # The validation sentences must be scored with their own lengths,
        # not with those of the last training batch.
        accuracy = evaluation(
            predictions, val_cap, val_lengths)

        print("")
        print("Valdidation after batch {}".format(batch_n))
        print("  accuracy:       {:.5f}".format(accuracy))
        print("  cross-entropy:  {:.5f}".format(cross_entropy))

        print("")