TextClassificationDataset

The TextClassificationDataset class represents a text classification dataset; a minimal usage sketch is shown after the overview below.

  • Loads a text classification dataset in a vertical format: one document per line, with the label and the document text separated by a tab.
  • The data consists of three datasets:
    • train
    • dev
    • test
  • Each dataset is a torch.utils.data.Dataset providing
    • __len__: the number of documents in the dataset
    • __getitem__: returns the requested document as an Element instance, which is a dictionary with keys "document" and "label", each being a string
    • data: a dictionary of type Elements, with keys "documents" and "labels", each being a list of strings
    • label_vocab: an npfl138.Vocabulary instance with the label mapping
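
A minimal usage sketch, assuming the package is installed and a dataset of the given name is served at the course URL (the name "czech_facebook" is purely illustrative):

from npfl138.datasets.text_classification_dataset import TextClassificationDataset

dataset = TextClassificationDataset("czech_facebook")  # illustrative dataset name

print(len(dataset.train))  # number of training documents
print(dataset.train[0])    # {"document": "...", "label": "..."}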

npfl138.datasets.text_classification_dataset.TextClassificationDataset

Source code in npfl138/datasets/text_classification_dataset.py
# Imports used by the listing below (elided in the rendered excerpt;
# the Vocabulary import path is reconstructed, not shown in the source).
import os
import sys
import urllib.request
import zipfile
from typing import BinaryIO, Self, Sequence, TextIO, TypedDict

import torch

from ..vocabulary import Vocabulary


class TextClassificationDataset:
    Element = TypedDict("Element", {"document": str, "label": str})
    """The type of a single dataset element, i.e., a single document and its label."""
    Elements = TypedDict("Elements", {"documents": list[str], "labels": list[str]})
    """The type of the whole dataset, i.e., a corpus of documents."""

    _URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, data_file: BinaryIO, train: Self | None = None) -> None:
            # Load the data
            self._data: "TextClassificationDataset.Elements" = {
                "documents": [],
                "labels": [],
            }
            for line in data_file:
                line = line.decode("utf-8").rstrip("\r\n")
                label, document = line.split("\t", maxsplit=1)

                self._data["documents"].append(document)
                self._data["labels"].append(label)

            # Create or copy the label mapping
            if train:
                self._label_vocab = train._label_vocab
            else:
                self._label_vocab = Vocabulary(sorted(set(self._data["labels"])))

        @property
        def data(self) -> "TextClassificationDataset.Elements":
            """Return the whole dataset as a `TextClassificationDataset.Elements` object."""
            return self._data

        @property
        def label_vocab(self) -> Vocabulary:
            """The label vocabulary of the dataset."""
            return self._label_vocab

        def __len__(self) -> int:
            """Return the number of documents in the dataset."""
            return len(self._data["labels"])

        def __getitem__(self, index: int) -> "TextClassificationDataset.Element":
            """Return the `index`-th element of the dataset as a dictionary."""
            return {key.removesuffix("s"): value[index] for key, value in self._data.items()}

    def __init__(self, name: str) -> None:
        """Create the dataset from the given filename, downloading it if necessary."""
        path = "{}.zip".format(name)
        if not os.path.exists(path):
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve("{}/{}".format(self._URL, path), filename="{}.tmp".format(path))
            os.rename("{}.tmp".format(path), path)

        with zipfile.ZipFile(path, "r") as zip_file:
            for dataset in ["train", "dev", "test"]:
                with zip_file.open("{}_{}.txt".format(os.path.splitext(path)[0], dataset), "r") as dataset_file:
                    setattr(self, dataset, self.Dataset(dataset_file, train=getattr(self, "train", None)))

    train: Dataset
    """The training dataset."""
    dev: Dataset
    """The development dataset."""
    test: Dataset
    """The test dataset."""

    # Evaluation infrastructure.
    @staticmethod
    def evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float:
        """Evaluate the `predictions` against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions, as a percentage.
        """
        gold = gold_dataset.data["labels"]

        if len(predictions) != len(gold):
            raise RuntimeError("The predictions have a different size than the gold data: {} vs {}".format(
                len(predictions), len(gold)))

        correct = sum(gold[i] == predictions[i] for i in range(len(gold)))
        return 100 * correct / len(gold)

    @staticmethod
    def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
        """Evaluate the file with predictions against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions, as a percentage.
        """
        predictions = [line.rstrip("\r\n") for line in predictions_file]
        return TextClassificationDataset.evaluate(gold_dataset, predictions)

Element class-attribute instance-attribute

Element = TypedDict('Element', {'document': str, 'label': str})

The type of a single dataset element, i.e., a single document and its label.

Elements class-attribute instance-attribute

Elements = TypedDict('Elements', {'documents': list[str], 'labels': list[str]})

The type of the whole dataset, i.e., a corpus of documents.
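
Continuing the usage sketch above, the two types look like this for a concrete index (all values shown are made up):

element = dataset.train[0]
# element == {"document": "some document text", "label": "positive"}

corpus = dataset.train.data
# corpus == {"documents": ["some document text", ...], "labels": ["positive", ...]}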

Dataset

Bases: torch.utils.data.Dataset

data property

data: Elements

Return the whole dataset as a TextClassificationDataset.Elements object.

label_vocab property

label_vocab: Vocabulary

The label vocabulary of the dataset.
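
A sketch of mapping between label strings and integer ids; the index and string methods are an assumption about the npfl138.Vocabulary API, which is not shown in this excerpt:

label = dataset.train[0]["label"]
label_id = dataset.train.label_vocab.index(label)  # assumed Vocabulary method
assert dataset.train.label_vocab.string(label_id) == label  # assumed inverse lookup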

__len__

__len__() -> int

Return the number of documents in the dataset.

__getitem__

__getitem__(index: int) -> Element

Return the index-th element of the dataset as a dictionary.
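
Since each element is a dictionary of plain strings, the default DataLoader collation would not produce label tensors. A minimal collate_fn sketch, reusing the assumed Vocabulary.index method from above:

import torch

def collate(batch):
    # Keep documents as a list of strings; convert labels to a tensor of ids.
    documents = [element["document"] for element in batch]
    labels = torch.tensor(
        [dataset.train.label_vocab.index(element["label"]) for element in batch])
    return documents, labels

loader = torch.utils.data.DataLoader(dataset.train, batch_size=16, collate_fn=collate)
documents, labels = next(iter(loader))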

__init__

__init__(name: str) -> None

Create the dataset from the given filename, downloading it if necessary.
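
Based on the loading code, the downloaded {name}.zip archive is expected to contain one tab-separated file per split; for a hypothetical dataset called "example":

import zipfile

with zipfile.ZipFile("example.zip", "r") as zip_file:
    print(zip_file.namelist())
# Expected: ['example_train.txt', 'example_dev.txt', 'example_test.txt'],
# each containing one "label<TAB>document" line per document.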

train instance-attribute

train: Dataset

The training dataset.

dev instance-attribute

dev: Dataset

The development dataset.

test instance-attribute

test: Dataset

The test dataset.

evaluate staticmethod

evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float

Evaluate the predictions against the gold dataset.

Returns:

  • accuracy (float) – The accuracy of the predictions, as a percentage.
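
For instance, evaluating constant predictions on the development set (the label "positive" is assumed to exist in the dataset, and dataset comes from the usage sketch above):

predictions = ["positive"] * len(dataset.dev)  # assumed label value
accuracy = TextClassificationDataset.evaluate(dataset.dev, predictions)
print("Dev accuracy: {:.2f}%".format(accuracy))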

evaluate_file staticmethod

evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float

Evaluate the file with predictions against the gold dataset.

Returns:

  • accuracy (float) – The accuracy of the predictions, as a percentage.
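
For instance, with a hypothetical predictions file containing one label per line, in the same order as the gold development data:

with open("dev_predictions.txt", "r", encoding="utf-8") as predictions_file:  # hypothetical file
    accuracy = TextClassificationDataset.evaluate_file(dataset.dev, predictions_file)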
