MorphoDataset

The MorphoDataset class loads a morphological dataset in a vertical format: each word is on a separate line with its form, lemma, and tag separated by tabs, and an empty line terminates every sentence. A short usage sketch follows the overview below.

  • The data consists of three datasets:
    • train
    • dev
    • test
  • Each dataset is a torch.utils.data.Dataset providing:
    • __len__: the number of sentences in the dataset
    • __getitem__: returns the requested sentence as an Element instance, which is a dictionary with keys "words"/"lemmas"/"tags", each mapping to a list of strings
    • words, lemmas, tags: instances of type Factor containing the following fields:
      • strings: a Python list containing input sentences, each being a list of strings (words/lemmas/tags)
      • string_vocab: a npfl138.Vocabulary object capable of mapping words to indices. It is constructed on the train set and shared by the dev and test sets
      • char_vocab: a npfl138.Vocabulary object capable of mapping characters to indices. It is constructed on the train set and shared by the dev and test sets
    • cle_batch: a method for creating inputs for character-level word embeddings. It takes a list of sentences, each being a list of string words, and produces a tuple of two tensors:
      • unique_words with shape [num_unique_words, max_word_length] containing each unique word as a sequence of character ids
      • words_indices with shape [num_sentences, max_sentence_length] containing for every word its index in unique_words
    • cle_batch_packed: the same as cle_batch, but returning PackedSequences instead of padded tensors

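A minimal usage sketch follows. The dataset name czech_pdt comes from the class docstring; the variable names are illustrative, and the call to Vocabulary.indices (the same method cle_batch uses internally on the character vocabulary) is assumed to map a list of strings to their ids.

from npfl138.datasets.morpho_dataset import MorphoDataset

# Load the dataset, downloading it on first use; max_sentences only limits
# how many sentences are read from each of train/dev/test.
morpho = MorphoDataset("czech_pdt", max_sentences=1000)

# A single element is a dictionary with "words", "lemmas" and "tags".
element = morpho.train[0]
print(len(morpho.train), element["words"], element["tags"])

# Map strings to integer ids using the vocabularies built on the train set.
word_ids = morpho.train.words.string_vocab.indices(element["words"])
tag_ids = morpho.train.tags.string_vocab.indices(element["tags"])
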
npfl138.datasets.morpho_dataset.MorphoDataset

Source code in npfl138/datasets/morpho_dataset.py
class MorphoDataset:
    PAD: int = 0
    """The index of the padding token in the vocabulary, always present."""
    UNK: int = 1
    """The index of the unknown token in the vocabulary, always present."""
    BOW: int = 2
    """A special beginning-of-word token, always present in character vocabularies."""
    EOW: int = 3
    """A special end-of-word token, always present in character vocabularies."""

    _URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    Element = TypedDict("Element", {"words": list[str], "lemmas": list[str], "tags": list[str]})
    """The type of a single dataset element, i.e., a single sentence."""

    class Factor:
        """A factor of the dataset, i.e., words, lemmas or tags."""
        string_vocab: Vocabulary
        """The word vocabulary of this factor."""
        char_vocab: Vocabulary
        """The character vocabulary of this factor."""
        strings: list[list[str]]
        """The input sentences of this factor, each being a list of strings."""

        def __init__(self) -> None:
            self.strings = []

        def finalize(self, train: Self | None = None) -> None:
            # Create vocabularies
            if train:
                self.string_vocab = train.string_vocab
                self.char_vocab = train.char_vocab
            else:
                strings = sorted(set(string for sentence in self.strings for string in sentence))
                self.string_vocab = Vocabulary(strings, add_unk=True)

                bow_eow = ["[BOW]", "[EOW]"]
                self.char_vocab = Vocabulary(bow_eow + sorted(set(char for string in strings for char in string)),
                                             add_unk=True)

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, data_file: BinaryIO, train: Self | None = None, max_sentences: int | None = None) -> None:
            # Create factors
            self._factors = (MorphoDataset.Factor(), MorphoDataset.Factor(), MorphoDataset.Factor())
            self._factors_tensors = None

            # Load the data
            self._size = 0
            in_sentence = False
            for line in data_file:
                line = line.decode("utf-8").rstrip("\r\n")
                if line:
                    if not in_sentence:
                        for factor in self._factors:
                            factor.strings.append([])
                        self._size += 1

                    columns = line.split("\t")
                    assert len(columns) == len(self._factors)
                    for column, factor in zip(columns, self._factors):
                        factor.strings[-1].append(column)

                    in_sentence = True
                else:
                    in_sentence = False
                    if max_sentences is not None and self._size >= max_sentences:
                        break

            # Finalize the mappings
            for i, factor in enumerate(self._factors):
                factor.finalize(train._factors[i] if train else None)

        @property
        def words(self) -> "MorphoDataset.Factor":
            """Factor containing the words of the dataset."""
            return self._factors[0]

        @property
        def lemmas(self) -> "MorphoDataset.Factor":
            """Factor containing the lemmas of the dataset."""
            return self._factors[1]

        @property
        def tags(self) -> "MorphoDataset.Factor":
            """Factor containing the tags of the dataset."""
            return self._factors[2]

        def __len__(self) -> int:
            """Return the number of sentences in the dataset."""
            return self._size

        def __getitem__(self, index: int) -> "MorphoDataset.Element":
            """Return the `index`-th element of the dataset as a dictionary."""
            return {"words": self.words.strings[index],
                    "lemmas": self.lemmas.strings[index],
                    "tags": self.tags.strings[index]}

        def cle_batch(self, words: list[list[str]]) -> tuple[torch.Tensor, torch.Tensor]:
            """Create a batch suitable for computation of character-level word embeddings.

            Parameters:
              words: A batch of sentences, each being a list of string words.

            Returns:
              unique_words: A tensor with shape `[num_unique_words, max_word_length]`
                containing each unique word as a sequence of character ids.
              words_indices: A tensor with shape `[num_sentences, max_sentence_length]`
                containing for every word from the batch its index in `unique_words`.
            """
            unique_strings = list(set(word for sentence in words for word in sentence))
            unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
            unique_words = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor([MorphoDataset.UNK])]
                + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], batch_first=True)
            words_indices = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], batch_first=True)
            return unique_words, words_indices

        def cle_batch_packed(self, words: list[list[str]]) -> tuple[torch.nn.utils.rnn.PackedSequence,
                                                                    torch.nn.utils.rnn.PackedSequence]:
            """Create a batch suitable for computation of character-level word embeddings.

            This function is very similar to `cle_batch`, but it returns packed sequences instead
            of padded sequences.

            Parameters:
              words: A batch of sentences, each being a list of string words.

            Returns:
              unique_words: A PackedSequence containing each unique word as
                a sequence of character ids.
              words_indices: A PackedSequence containing for every word from
                the batch its index in `unique_words`.
            """
            unique_strings = list(set(word for sentence in words for word in sentence))
            unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
            unique_words = torch.nn.utils.rnn.pack_sequence(
                [torch.tensor([MorphoDataset.UNK])]
                + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], False)
            words_indices = torch.nn.utils.rnn.pack_sequence(
                [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], False)
            return unique_words, words_indices

    def __init__(self, dataset, max_sentences=None):
        """Load the `dataset` dataset, downloading it if necessary.

        Parameters:
          dataset: The name of the dataset, for example `czech_pdt`.
          max_sentences: The maximum number of sentences to load.
        """
        path = "{}.zip".format(dataset)
        if not os.path.exists(path):
            print("Downloading dataset {}...".format(dataset), file=sys.stderr)
            urllib.request.urlretrieve("{}/{}".format(self._URL, path), filename="{}.tmp".format(path))
            os.rename("{}.tmp".format(path), path)

        with zipfile.ZipFile(path, "r") as zip_file:
            for dataset in ["train", "dev", "test"]:
                with zip_file.open("{}_{}.txt".format(os.path.splitext(path)[0], dataset), "r") as dataset_file:
                    setattr(self, dataset, self.Dataset(
                        dataset_file, train=getattr(self, "train", None), max_sentences=max_sentences))

    train: Dataset
    """The training dataset."""
    dev: Dataset
    """The development dataset."""
    test: Dataset
    """The test dataset."""

    # Evaluation infrastructure.
    @staticmethod
    def evaluate(gold_dataset: "MorphoDataset.Factor", predictions: Sequence[str]) -> float:
        """Evaluate the `predictions` against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions in percentages.
        """
        gold_sentences = gold_dataset.strings

        predicted_sentences, in_sentence = [], False
        for line in predictions:
            line = line.rstrip("\n")
            if not line:
                in_sentence = False
            else:
                if not in_sentence:
                    predicted_sentences.append([])
                    in_sentence = True
                predicted_sentences[-1].append(line)

        if len(predicted_sentences) != len(gold_sentences):
            raise RuntimeError("The predictions contain different number of sentences than gold data: {} vs {}".format(
                len(predicted_sentences), len(gold_sentences)))

        correct, total = 0, 0
        for i, (predicted_sentence, gold_sentence) in enumerate(zip(predicted_sentences, gold_sentences)):
            if len(predicted_sentence) != len(gold_sentence):
                raise RuntimeError("Predicted sentence {} has different number of words than gold: {} vs {}".format(
                    i + 1, len(predicted_sentence), len(gold_sentence)))
            correct += sum(predicted == gold for predicted, gold in zip(predicted_sentence, gold_sentence))
            total += len(predicted_sentence)

        return 100 * correct / total

    @staticmethod
    def evaluate_file(gold_dataset: "MorphoDataset.Factor", predictions_file: TextIO) -> float:
        """Evaluate the file with predictions against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions in percentages.
        """
        predictions = predictions_file.readlines()
        return MorphoDataset.evaluate(gold_dataset, predictions)

PAD class-attribute instance-attribute

PAD: int = 0

The index of the padding token in the vocabulary, always present.

UNK class-attribute instance-attribute

UNK: int = 1

The index of the unknown token in the vocabulary, always present.

BOW class-attribute instance-attribute

BOW: int = 2

A special beginning-of-word token, always present in character vocabularies.

EOW class-attribute instance-attribute

EOW: int = 3

A special end-of-word token, always present in character vocabularies.

Element class-attribute instance-attribute

Element = TypedDict(
    "Element", {"words": list[str], "lemmas": list[str], "tags": list[str]}
)

The type of a single dataset element, i.e., a single sentence.

Factor

A factor of the dataset, i.e., words, lemmas or tags.

Source code in npfl138/datasets/morpho_dataset.py
class Factor:
    """A factor of the dataset, i.e., words, lemmas or tags."""
    string_vocab: Vocabulary
    """The word vocabulary of this factor."""
    char_vocab: Vocabulary
    """The character vocabulary of this factor."""
    strings: list[list[str]]
    """The input sentences of this factor, each being a list of strings."""

    def __init__(self) -> None:
        self.strings = []

    def finalize(self, train: Self | None = None) -> None:
        # Create vocabularies
        if train:
            self.string_vocab = train.string_vocab
            self.char_vocab = train.char_vocab
        else:
            strings = sorted(set(string for sentence in self.strings for string in sentence))
            self.string_vocab = Vocabulary(strings, add_unk=True)

            bow_eow = ["[BOW]", "[EOW]"]
            self.char_vocab = Vocabulary(bow_eow + sorted(set(char for string in strings for char in string)),
                                         add_unk=True)

string_vocab instance-attribute

string_vocab: Vocabulary

The word vocabulary of this factor.

char_vocab instance-attribute

char_vocab: Vocabulary

The character vocabulary of this factor.

strings instance-attribute

strings: list[list[str]] = []

The input sentences of this factor, each being a list of strings.

Dataset

Bases: Dataset

Source code in npfl138/datasets/morpho_dataset.py
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data_file: BinaryIO, train: Self | None = None, max_sentences: int | None = None) -> None:
        # Create factors
        self._factors = (MorphoDataset.Factor(), MorphoDataset.Factor(), MorphoDataset.Factor())
        self._factors_tensors = None

        # Load the data
        self._size = 0
        in_sentence = False
        for line in data_file:
            line = line.decode("utf-8").rstrip("\r\n")
            if line:
                if not in_sentence:
                    for factor in self._factors:
                        factor.strings.append([])
                    self._size += 1

                columns = line.split("\t")
                assert len(columns) == len(self._factors)
                for column, factor in zip(columns, self._factors):
                    factor.strings[-1].append(column)

                in_sentence = True
            else:
                in_sentence = False
                if max_sentences is not None and self._size >= max_sentences:
                    break

        # Finalize the mappings
        for i, factor in enumerate(self._factors):
            factor.finalize(train._factors[i] if train else None)

    @property
    def words(self) -> "MorphoDataset.Factor":
        """Factor containing the words of the dataset."""
        return self._factors[0]

    @property
    def lemmas(self) -> "MorphoDataset.Factor":
        """Factor containing the lemmas of the dataset."""
        return self._factors[1]

    @property
    def tags(self) -> "MorphoDataset.Factor":
        """Factor containing the tags of the dataset."""
        return self._factors[2]

    def __len__(self) -> int:
        """Return the number of sentences in the dataset."""
        return self._size

    def __getitem__(self, index: int) -> "MorphoDataset.Element":
        """Return the `index`-th element of the dataset as a dictionary."""
        return {"words": self.words.strings[index],
                "lemmas": self.lemmas.strings[index],
                "tags": self.tags.strings[index]}

    def cle_batch(self, words: list[list[str]]) -> tuple[torch.Tensor, torch.Tensor]:
        """Create a batch suitable for computation of character-level word embeddings.

        Parameters:
          words: A batch of sentences, each being a list of string words.

        Returns:
          unique_words: A tensor with shape `[num_unique_words, max_word_length]`
            containing each unique word as a sequence of character ids.
          words_indices: A tensor with shape `[num_sentences, max_sentence_length]`
            containing for every word from the batch its index in `unique_words`.
        """
        unique_strings = list(set(word for sentence in words for word in sentence))
        unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
        unique_words = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor([MorphoDataset.UNK])]
            + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], batch_first=True)
        words_indices = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], batch_first=True)
        return unique_words, words_indices

    def cle_batch_packed(self, words: list[list[str]]) -> tuple[torch.nn.utils.rnn.PackedSequence,
                                                                torch.nn.utils.rnn.PackedSequence]:
        """Create a batch suitable for computation of character-level word embeddings.

        This function is very similar to `cle_batch`, but it returns packed sequences instead
        of padded sequences.

        Parameters:
          words: A batch of sentences, each being a list of string words.

        Returns:
          unique_words: A PackedSequence containing each unique word as
            a sequence of character ids.
          words_indices: A PackedSequence containing for every word from
            the batch its index in `unique_words`.
        """
        unique_strings = list(set(word for sentence in words for word in sentence))
        unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
        unique_words = torch.nn.utils.rnn.pack_sequence(
            [torch.tensor([MorphoDataset.UNK])]
            + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], False)
        words_indices = torch.nn.utils.rnn.pack_sequence(
            [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], False)
        return unique_words, words_indices

words property

words: Factor

Factor containing the words of the dataset.

lemmas property

lemmas: Factor

Factor containing the lemmas of the dataset.

tags property

tags: Factor

Factor containing the tags of the dataset.

__len__

__len__() -> int

Return the number of sentences in the dataset.

Source code in npfl138/datasets/morpho_dataset.py
def __len__(self) -> int:
    """Return the number of sentences in the dataset."""
    return self._size

__getitem__

__getitem__(index: int) -> Element

Return the index-th element of the dataset as a dictionary.

Source code in npfl138/datasets/morpho_dataset.py
def __getitem__(self, index: int) -> "MorphoDataset.Element":
    """Return the `index`-th element of the dataset as a dictionary."""
    return {"words": self.words.strings[index],
            "lemmas": self.lemmas.strings[index],
            "tags": self.tags.strings[index]}

cle_batch

cle_batch(words: list[list[str]]) -> tuple[Tensor, Tensor]

Create a batch suitable for computation of character-level word embeddings.

Parameters:

  • words (list[list[str]]) –

    A batch of sentences, each being a list of string words.

Returns:

  • unique_words ( Tensor ) –

    A tensor with shape [num_unique_words, max_word_length] containing each unique word as a sequence of character ids.

  • words_indices ( Tensor ) –

    A tensor with shape [num_sentences, max_sentence_length] containing for every word from the batch its index in unique_words.

Source code in npfl138/datasets/morpho_dataset.py
def cle_batch(self, words: list[list[str]]) -> tuple[torch.Tensor, torch.Tensor]:
    """Create a batch suitable for computation of character-level word embeddings.

    Parameters:
      words: A batch of sentences, each being a list of string words.

    Returns:
      unique_words: A tensor with shape `[num_unique_words, max_word_length]`
        containing each unique word as a sequence of character ids.
      words_indices: A tensor with shape `[num_sentences, max_sentence_length]`
        containing for every word from the batch its index in `unique_words`.
    """
    unique_strings = list(set(word for sentence in words for word in sentence))
    unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
    unique_words = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor([MorphoDataset.UNK])]
        + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], batch_first=True)
    words_indices = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], batch_first=True)
    return unique_words, words_indices

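As a hedged illustration of how these two tensors can be consumed (continuing the sketch above; the embedding and GRU sizes 32 and 64 are arbitrary, len() on the Vocabulary is assumed, and a real model would mask or pack the padded character positions), one can compute one character-level vector per unique word and gather it back to sentence positions:

import torch

# Sketch: embed the characters of every unique word, run a GRU over them,
# and gather the final state for every word of every sentence in the batch.
sentences = [morpho.train[i]["words"] for i in range(8)]
unique_words, words_indices = morpho.train.cle_batch(sentences)

char_embedding = torch.nn.Embedding(len(morpho.train.words.char_vocab), 32,
                                    padding_idx=MorphoDataset.PAD)
char_gru = torch.nn.GRU(32, 64, batch_first=True)
_, state = char_gru(char_embedding(unique_words))  # state: [1, num_unique_words, 64]
cle = state[0][words_indices]                      # [num_sentences, max_sentence_length, 64]
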
cle_batch_packed

cle_batch_packed(
    words: list[list[str]],
) -> tuple[PackedSequence, PackedSequence]

Create a batch suitable for computation of character-level word embeddings.

This function is very similar to cle_batch, but it returns packed sequences instead of padded sequences.

Parameters:

  • words (list[list[str]]) –

    A batch of sentences, each being a list of string words.

Returns:

  • unique_words ( PackedSequence ) –

    A PackedSequence containing each unique word as a sequence of character ids.

  • words_indices ( PackedSequence ) –

    A PackedSequence containing for every word from the batch its index in unique_words.

Source code in npfl138/datasets/morpho_dataset.py
def cle_batch_packed(self, words: list[list[str]]) -> tuple[torch.nn.utils.rnn.PackedSequence,
                                                            torch.nn.utils.rnn.PackedSequence]:
    """Create a batch suitable for computation of character-level word embeddings.

    This function is very similar to `cle_batch`, but it returns packed sequences instead
    of padded sequences.

    Parameters:
      words: A batch of sentences, each being a list of string words.

    Returns:
      unique_words: A PackedSequence containing each unique word as
        a sequence of character ids.
      words_indices: A PackedSequence containing for every word from
        the batch its index in `unique_words`.
    """
    unique_strings = list(set(word for sentence in words for word in sentence))
    unique_string_map = {word: index + 1 for index, word in enumerate(unique_strings)}
    unique_words = torch.nn.utils.rnn.pack_sequence(
        [torch.tensor([MorphoDataset.UNK])]
        + [torch.tensor(self.words.char_vocab.indices(word)) for word in unique_strings], False)
    words_indices = torch.nn.utils.rnn.pack_sequence(
        [torch.tensor([unique_string_map[word] for word in sentence]) for sentence in words], False)
    return unique_words, words_indices

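A corresponding sketch for the packed variant (again continuing the example above; the PackedSequence is rebuilt around the embedded character ids so the GRU processes only real characters, and the sizes remain illustrative):

import torch

# Sketch: embed the packed character ids, run a GRU directly over the packed
# sequence, then pad words_indices back to gather per-sentence word vectors.
unique_words, words_indices = morpho.train.cle_batch_packed(sentences)

char_embedding = torch.nn.Embedding(len(morpho.train.words.char_vocab), 32)
embedded = torch.nn.utils.rnn.PackedSequence(
    char_embedding(unique_words.data), unique_words.batch_sizes,
    unique_words.sorted_indices, unique_words.unsorted_indices)
_, state = torch.nn.GRU(32, 64)(embedded)            # state: [1, num_unique_words, 64]

padded_indices, _ = torch.nn.utils.rnn.pad_packed_sequence(words_indices, batch_first=True)
cle = state[0][padded_indices]                       # padding indices map to the [UNK]-only row
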
__init__

__init__(dataset, max_sentences=None)

Load the dataset dataset, downloading it if necessary.

Parameters:

  • dataset

    The name of the dataset, for example czech_pdt.

  • max_sentences

    The maximum number of sentences to load.

Source code in npfl138/datasets/morpho_dataset.py
def __init__(self, dataset, max_sentences=None):
    """Load the `dataset` dataset, downloading it if necessary.

    Parameters:
      dataset: The name of the dataset, for example `czech_pdt`.
      max_sentences: The maximum number of sentences to load.
    """
    path = "{}.zip".format(dataset)
    if not os.path.exists(path):
        print("Downloading dataset {}...".format(dataset), file=sys.stderr)
        urllib.request.urlretrieve("{}/{}".format(self._URL, path), filename="{}.tmp".format(path))
        os.rename("{}.tmp".format(path), path)

    with zipfile.ZipFile(path, "r") as zip_file:
        for dataset in ["train", "dev", "test"]:
            with zip_file.open("{}_{}.txt".format(os.path.splitext(path)[0], dataset), "r") as dataset_file:
                setattr(self, dataset, self.Dataset(
                    dataset_file, train=getattr(self, "train", None), max_sentences=max_sentences))

train instance-attribute

train: Dataset

The training dataset.

dev instance-attribute

dev: Dataset

The development dataset.

test instance-attribute

test: Dataset

The test dataset.

evaluate staticmethod

evaluate(gold_dataset: Factor, predictions: Sequence[str]) -> float

Evaluate the predictions against the gold dataset.

Returns:

  • accuracy ( float ) –

    The accuracy of the predictions in percentages.

Source code in npfl138/datasets/morpho_dataset.py
@staticmethod
def evaluate(gold_dataset: "MorphoDataset.Factor", predictions: Sequence[str]) -> float:
    """Evaluate the `predictions` against the gold dataset.

    Returns:
      accuracy: The accuracy of the predictions in percentages.
    """
    gold_sentences = gold_dataset.strings

    predicted_sentences, in_sentence = [], False
    for line in predictions:
        line = line.rstrip("\n")
        if not line:
            in_sentence = False
        else:
            if not in_sentence:
                predicted_sentences.append([])
                in_sentence = True
            predicted_sentences[-1].append(line)

    if len(predicted_sentences) != len(gold_sentences):
        raise RuntimeError("The predictions contain different number of sentences than gold data: {} vs {}".format(
            len(predicted_sentences), len(gold_sentences)))

    correct, total = 0, 0
    for i, (predicted_sentence, gold_sentence) in enumerate(zip(predicted_sentences, gold_sentences)):
        if len(predicted_sentence) != len(gold_sentence):
            raise RuntimeError("Predicted sentence {} has different number of words than gold: {} vs {}".format(
                i + 1, len(predicted_sentence), len(gold_sentence)))
        correct += sum(predicted == gold for predicted, gold in zip(predicted_sentence, gold_sentence))
        total += len(predicted_sentence)

    return 100 * correct / total

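For illustration, tagging predictions can be assembled in the expected line-oriented format (one tag per line, with an empty line after every sentence; predicted_tags below is a hypothetical list of per-sentence tag lists aligned with morpho.dev) and evaluated directly:

# Sketch: flatten per-sentence predictions into the line format evaluate expects.
lines = []
for sentence_tags in predicted_tags:   # predicted_tags: list[list[str]]
    lines.extend(sentence_tags)
    lines.append("")                   # an empty line terminates the sentence
accuracy = MorphoDataset.evaluate(morpho.dev.tags, lines)
print("Tag accuracy: {:.2f}%".format(accuracy))
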
evaluate_file staticmethod

evaluate_file(gold_dataset: Factor, predictions_file: TextIO) -> float

Evaluate the file with predictions against the gold dataset.

Returns:

  • accuracy ( float ) –

    The accuracy of the predictions in percentages.

Source code in npfl138/datasets/morpho_dataset.py
@staticmethod
def evaluate_file(gold_dataset: "MorphoDataset.Factor", predictions_file: TextIO) -> float:
    """Evaluate the file with predictions against the gold dataset.

    Returns:
      accuracy: The accuracy of the predictions in percentages.
    """
    predictions = predictions_file.readlines()
    return MorphoDataset.evaluate(gold_dataset, predictions)
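
The same predictions, written to a file one tag per line with empty lines between sentences, can be scored with evaluate_file (the file name is illustrative and reuses the lines list from the previous sketch):

with open("dev_tag_predictions.txt", "w", encoding="utf-8") as predictions_file:
    print(*lines, sep="\n", file=predictions_file)
with open("dev_tag_predictions.txt", "r", encoding="utf-8") as predictions_file:
    accuracy = MorphoDataset.evaluate_file(morpho.dev.tags, predictions_file)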