TextClassificationDataset

The TextClassificationDataset class represents a text classification dataset; a minimal usage sketch is shown after the overview below.

  • Loads a text classification dataset in a vertical format: one document per line, with the label and the document text separated by a tab.
  • The data consists of three datasets:
    • train
    • dev
    • test
  • Each dataset is a torch.utils.data.Dataset providing
    • __len__: the number of documents in the dataset
    • __getitem__: returns the requested document as an Element instance, which is a dictionary with keys "document" and "label", each being a string
    • data: a dictionary of type Elements, with keys "documents" and "labels", each being a list of strings
    • label_vocab: an npfl138.Vocabulary instance with the label mapping
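
A minimal usage sketch, assuming the package is installed and a dataset of the given name is served at the course URL (the name "czech_facebook" is purely illustrative):

from npfl138.datasets.text_classification_dataset import TextClassificationDataset

dataset = TextClassificationDataset("czech_facebook")  # illustrative dataset name

print(len(dataset.train))  # number of training documents
print(dataset.train[0])    # {"document": "...", "label": "..."}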

npfl138.datasets.text_classification_dataset.TextClassificationDataset

Source code in npfl138/datasets/text_classification_dataset.py
# Imports used by the listing below (elided in the rendered excerpt;
# the Vocabulary import path is reconstructed, not shown in the source).
import os
import sys
import urllib.request
import zipfile
from typing import BinaryIO, Self, Sequence, TextIO, TypedDict

import torch

from ..vocabulary import Vocabulary


class TextClassificationDataset:
    Element = TypedDict("Element", {"document": str, "label": str})
    """The type of a single dataset element, i.e., a single document and its label."""
    Elements = TypedDict("Elements", {"documents": list[str], "labels": list[str]})
    """The type of the whole dataset, i.e., a corpus of documents."""

    _URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, data_file: BinaryIO, train: Self | None = None) -> None:
            # Load the data
            self._data: "TextClassificationDataset.Elements" = {
                "documents": [],
                "labels": [],
            }
            for line in data_file:
                line = line.decode("utf-8").rstrip("\r\n")
                label, document = line.split("\t", maxsplit=1)

                self._data["documents"].append(document)
                self._data["labels"].append(label)

            # Create or copy the label mapping
            if train:
                self._label_vocab = train._label_vocab
            else:
                self._label_vocab = Vocabulary(sorted(set(self._data["labels"])))

        @property
        def data(self) -> "TextClassificationDataset.Elements":
            """Return the whole dataset as a `TextClassificationDataset.Elements` object."""
            return self._data

        @property
        def label_vocab(self) -> Vocabulary:
            """The label vocabulary of the dataset."""
            return self._label_vocab

        def __len__(self) -> int:
            """Return the number of documents in the dataset."""
            return len(self._data["labels"])

        def __getitem__(self, index: int) -> "TextClassificationDataset.Element":
            """Return the `index`-th element of the dataset as a dictionary."""
            return {key.removesuffix("s"): value[index] for key, value in self._data.items()}

    def __init__(self, name: str) -> None:
        """Create the dataset from the given filename, downloading it if necessary."""
        path = "{}.zip".format(name)
        if not os.path.exists(path):
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve("{}/{}".format(self._URL, path), filename="{}.tmp".format(path))
            os.rename("{}.tmp".format(path), path)

        with zipfile.ZipFile(path, "r") as zip_file:
            for dataset in ["train", "dev", "test"]:
                with zip_file.open("{}_{}.txt".format(os.path.splitext(path)[0], dataset), "r") as dataset_file:
                    setattr(self, dataset, self.Dataset(dataset_file, train=getattr(self, "train", None)))

    train: Dataset
    """The training dataset."""
    dev: Dataset
    """The development dataset."""
    test: Dataset
    """The test dataset."""

    # Evaluation infrastructure.
    @staticmethod
    def evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float:
        """Evaluate the `predictions` against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions, as a percentage.
        """
        gold = gold_dataset.data["labels"]

        if len(predictions) != len(gold):
            raise RuntimeError("The predictions have a different size than the gold data: {} vs {}".format(
                len(predictions), len(gold)))

        correct = sum(gold[i] == predictions[i] for i in range(len(gold)))
        return 100 * correct / len(gold)

    @staticmethod
    def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
        """Evaluate the file with predictions against the gold dataset.

        Returns:
          accuracy: The accuracy of the predictions, as a percentage.
        """
        predictions = [line.rstrip("\r\n") for line in predictions_file]
        return TextClassificationDataset.evaluate(gold_dataset, predictions)

Element class-attribute instance-attribute

Element = TypedDict('Element', {'document': str, 'label': str})

The type of a single dataset element, i.e., a single document and its label.

Elements class-attribute instance-attribute

Elements = TypedDict('Elements', {'documents': list[str], 'labels': list[str]})

The type of the whole dataset, i.e., a corpus of documents.
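
Continuing the usage sketch above, the two types look like this for a concrete index (all values shown are made up):

element = dataset.train[0]
# element == {"document": "some document text", "label": "positive"}

corpus = dataset.train.data
# corpus == {"documents": ["some document text", ...], "labels": ["positive", ...]}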

Dataset

Bases: torch.utils.data.Dataset

data property

data: Elements

Return the whole dataset as a TextClassificationDataset.Elements object.

label_vocab property

label_vocab: Vocabulary

The label vocabulary of the dataset.
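
A sketch of mapping between label strings and integer ids; the index and string methods are an assumption about the npfl138.Vocabulary API, which is not shown in this excerpt:

label = dataset.train[0]["label"]
label_id = dataset.train.label_vocab.index(label)  # assumed Vocabulary method
assert dataset.train.label_vocab.string(label_id) == label  # assumed inverse lookup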

__len__

__len__() -> int

Return the number of documents in the dataset.

__getitem__

__getitem__(index: int) -> Element

Return the index-th element of the dataset as a dictionary.
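
Since each element is a dictionary of plain strings, the default DataLoader collation would not produce label tensors. A minimal collate_fn sketch, reusing the assumed Vocabulary.index method from above:

import torch

def collate(batch):
    # Keep documents as a list of strings; convert labels to a tensor of ids.
    documents = [element["document"] for element in batch]
    labels = torch.tensor(
        [dataset.train.label_vocab.index(element["label"]) for element in batch])
    return documents, labels

loader = torch.utils.data.DataLoader(dataset.train, batch_size=16, collate_fn=collate)
documents, labels = next(iter(loader))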

__init__

__init__(name: str) -> None

Create the dataset from the given filename, downloading it if necessary.
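
Based on the loading code, the downloaded {name}.zip archive is expected to contain one tab-separated file per split; for a hypothetical dataset called "example":

import zipfile

with zipfile.ZipFile("example.zip", "r") as zip_file:
    print(zip_file.namelist())
# Expected: ['example_train.txt', 'example_dev.txt', 'example_test.txt'],
# each containing one "label<TAB>document" line per document.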

train instance-attribute

train: Dataset

The training dataset.

dev instance-attribute

dev: Dataset

The development dataset.

test instance-attribute

test: Dataset

The test dataset.

evaluate staticmethod

evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float

Evaluate the predictions against the gold dataset.

Returns:

  • accuracy (float) – The accuracy of the predictions, as a percentage.
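
For instance, evaluating constant predictions on the development set (the label "positive" is assumed to exist in the dataset, and dataset comes from the usage sketch above):

predictions = ["positive"] * len(dataset.dev)  # assumed label value
accuracy = TextClassificationDataset.evaluate(dataset.dev, predictions)
print("Dev accuracy: {:.2f}%".format(accuracy))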

evaluate_file staticmethod

evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float

Evaluate the file with predictions against the gold dataset.

Returns:

  • accuracy (float) – The accuracy of the predictions, as a percentage.
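
For instance, with a hypothetical predictions file containing one label per line, in the same order as the gold development data:

with open("dev_predictions.txt", "r", encoding="utf-8") as predictions_file:  # hypothetical file
    accuracy = TextClassificationDataset.evaluate_file(dataset.dev, predictions_file)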
