Skip to content

UppercaseData

Loads the Uppercase data.

  • The UppercaseData consists of three Datasets: train, dev, and test.
  • When loading, you need to specify window and alphabet_size. If alphabet_size is nonzero, it specifies the maximum number of alphabet characters, in which case that many most frequent characters will be used, and all others will be remapped to "<unk>".
  • Features are generated using a sliding window of a given size, i.e., for a character, we include left window characters, the character itself, and right window characters; 2 * window + 1 in total.
  • Each dataset (train/dev/test) has the following members:
    • __len__: the length of the text;
    • text: the original text (of course lowercased in case of the test set);
    • alphabet: an alphabet used by windows;
    • windows: a PyTorch Tensor with shape [size, 2 * window + 1] containing windows with indices of input lowercased characters;
    • labels: a PyTorch Tensor with shape [size] containing 0/1 indicating whether the character of the corresponding window is lowercase/uppercase.

npfl138.datasets.uppercase_data.UppercaseData

Source code in npfl138/datasets/uppercase_data.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
class UppercaseData:
    LABELS: int = 2

    URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2526/datasets/uppercase_data.zip"

    class Dataset:
        def __init__(self, data: str, window: int, alphabet: int | list[str]) -> None:
            self._window = window
            self._text = data
            self._size = len(data)

            # Create alphabet_map
            alphabet_map = {"<pad>": 0, "<unk>": 1}
            if not isinstance(alphabet, int):
                # Check that <pad> and <unk> are present at the beginning
                if alphabet[:2] == ["<pad>", "<unk>"]:
                    alphabet = alphabet[2:]
                else:
                    print("UppercaseData warning: The alphabet should start with <pad> and <unk>, prepending them.")

                for index, letter in enumerate(alphabet, len(alphabet_map)):
                    if letter in alphabet_map:
                        raise ValueError(f"UppercaseData: Duplicated character '{letter}' in the alphabet.")
                    alphabet_map[letter] = index
            else:
                # Find most frequent characters
                freqs = {}
                for char in self._text.lower():
                    freqs[char] = freqs.get(char, 0) + 1

                most_frequent = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
                for i, (char, freq) in enumerate(most_frequent, len(alphabet_map)):
                    alphabet_map[char] = i
                    if alphabet and len(alphabet_map) >= alphabet:
                        break

            # Remap lowercased input characters using the alphabet_map and create labels
            lcletters = np.zeros(self._size + 2 * window, dtype=np.int64)
            labels = np.zeros(self._size, dtype=np.int64)
            for i in range(self._size):
                char = self._text[i].lower()
                if char not in alphabet_map:
                    char = "<unk>"
                lcletters[i + window] = alphabet_map[char]
                labels[i] = self._text[i].isupper()

            self._windows = torch.from_numpy(lcletters).unfold(0, 2 * window + 1, 1)
            self._labels = torch.from_numpy(labels)

            # Compute alphabet
            self._alphabet = [None] * len(alphabet_map)
            for key, value in alphabet_map.items():
                self._alphabet[value] = key

        def __len__(self) -> int:
            """Return the number of elements in the dataset."""
            return self._size

        @property
        def text(self) -> str:
            """The original text of the dataset."""
            return self._text

        @property
        def alphabet(self) -> list[str]:
            """An alphabet used by `windows`."""
            return self._alphabet

        @property
        def windows(self) -> torch.Tensor:
            """A Tensor with shape `[size, 2 * window + 1]` and dtype `torch.int64`
            containing windows with indices of input lowercased characters.
            """
            return self._windows

        @property
        def labels(self) -> torch.Tensor:
            """A Tensor with shape `[size]` and dtype `torch.int64` containing zeros and ones
            indicating whether the character of the corresponding window is lowercase or uppercase.
            """
            return self._labels

    def __init__(self, window: int, alphabet_size: int = 0) -> None:
        """Load the UppercaseData dataset, downloading it if necessary.

        Parameters:
          window: The size of the sliding window of left and right characters to use for generating features.
          alphabet_size: If nonzero, the maximum number of alphabet characters (the most frequent ones will
            be used, others are remapped go "<unk>"); if zero, all characters are used.
        """
        path = os.path.basename(self.URL)
        if not os.path.exists(path):
            print(f"Downloading dataset {path}...", file=sys.stderr)
            urllib.request.urlretrieve(self.URL, filename=f"{path}.tmp")
            os.rename(f"{path}.tmp", path)

        with zipfile.ZipFile(path, "r") as zip_file:
            for dataset in ["train", "dev", "test"]:
                with zip_file.open(f"{os.path.splitext(path)[0]}_{dataset}.txt", "r") as dataset_file:
                    data = dataset_file.read().decode("utf-8")
                setattr(self, dataset, self.Dataset(
                    data,
                    window,
                    alphabet=alphabet_size if dataset == "train" else self.train.alphabet,
                ))

    train: Dataset
    """The training dataset."""
    dev: Dataset
    """The development dataset."""
    test: Dataset
    """The test dataset

    Warning:
        The test dataset is lowercased.
    """

    # Evaluation infrastructure.
    @staticmethod
    def evaluate(gold_dataset: Dataset, predictions: str) -> float:
        """Evaluate the `predictions` against the gold dataset.

        Returns:
          accuracy
        """
        gold = gold_dataset.text

        if len(predictions) < len(gold):
            raise RuntimeError(f"The predictions are shorter than gold data: {len(predictions)} vs {len(gold)}.")

        correct = 0
        for i in range(len(gold)):
            # Note that just the lower() condition is not enough, for example
            # u03c2 and u03c3 have both u03c2 as an uppercase character.
            if predictions[i].lower() != gold[i].lower() and predictions[i].upper() != gold[i].upper():
                raise RuntimeError("The predictions and gold data differ on position {}: {} vs {}.".format(
                    i, repr(predictions[i:i + 20].lower()), repr(gold[i:i + 20].lower())))

            correct += gold[i] == predictions[i]
        return correct / len(gold)

    @staticmethod
    def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
        """Evaluate the file with predictions against the gold dataset.

        Returns:
          accuracy
        """
        predictions = predictions_file.read()
        return UppercaseData.evaluate(gold_dataset, predictions)

Dataset

Source code in npfl138/datasets/uppercase_data.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class Dataset:
    def __init__(self, data: str, window: int, alphabet: int | list[str]) -> None:
        self._window = window
        self._text = data
        self._size = len(data)

        # Create alphabet_map
        alphabet_map = {"<pad>": 0, "<unk>": 1}
        if not isinstance(alphabet, int):
            # Check that <pad> and <unk> are present at the beginning
            if alphabet[:2] == ["<pad>", "<unk>"]:
                alphabet = alphabet[2:]
            else:
                print("UppercaseData warning: The alphabet should start with <pad> and <unk>, prepending them.")

            for index, letter in enumerate(alphabet, len(alphabet_map)):
                if letter in alphabet_map:
                    raise ValueError(f"UppercaseData: Duplicated character '{letter}' in the alphabet.")
                alphabet_map[letter] = index
        else:
            # Find most frequent characters
            freqs = {}
            for char in self._text.lower():
                freqs[char] = freqs.get(char, 0) + 1

            most_frequent = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
            for i, (char, freq) in enumerate(most_frequent, len(alphabet_map)):
                alphabet_map[char] = i
                if alphabet and len(alphabet_map) >= alphabet:
                    break

        # Remap lowercased input characters using the alphabet_map and create labels
        lcletters = np.zeros(self._size + 2 * window, dtype=np.int64)
        labels = np.zeros(self._size, dtype=np.int64)
        for i in range(self._size):
            char = self._text[i].lower()
            if char not in alphabet_map:
                char = "<unk>"
            lcletters[i + window] = alphabet_map[char]
            labels[i] = self._text[i].isupper()

        self._windows = torch.from_numpy(lcletters).unfold(0, 2 * window + 1, 1)
        self._labels = torch.from_numpy(labels)

        # Compute alphabet
        self._alphabet = [None] * len(alphabet_map)
        for key, value in alphabet_map.items():
            self._alphabet[value] = key

    def __len__(self) -> int:
        """Return the number of elements in the dataset."""
        return self._size

    @property
    def text(self) -> str:
        """The original text of the dataset."""
        return self._text

    @property
    def alphabet(self) -> list[str]:
        """An alphabet used by `windows`."""
        return self._alphabet

    @property
    def windows(self) -> torch.Tensor:
        """A Tensor with shape `[size, 2 * window + 1]` and dtype `torch.int64`
        containing windows with indices of input lowercased characters.
        """
        return self._windows

    @property
    def labels(self) -> torch.Tensor:
        """A Tensor with shape `[size]` and dtype `torch.int64` containing zeros and ones
        indicating whether the character of the corresponding window is lowercase or uppercase.
        """
        return self._labels

__len__

__len__() -> int

Return the number of elements in the dataset.

Source code in npfl138/datasets/uppercase_data.py
97
98
99
def __len__(self) -> int:
    """Give the size of the dataset, i.e., the value stored in `_size`."""
    return self._size

text property

text: str

The original text of the dataset.

alphabet property

alphabet: list[str]

An alphabet used by windows.

windows property

windows: Tensor

A Tensor with shape [size, 2 * window + 1] and dtype torch.int64 containing windows with indices of input lowercased characters.

labels property

labels: Tensor

A Tensor with shape [size] and dtype torch.int64 containing zeros and ones indicating whether the character of the corresponding window is lowercase or uppercase.

__init__

__init__(window: int, alphabet_size: int = 0) -> None

Load the UppercaseData dataset, downloading it if necessary.

Parameters:

  • window (int) –

    The size of the sliding window of left and right characters to use for generating features.

  • alphabet_size (int, default: 0 ) –

    If nonzero, the maximum number of alphabet characters (the most frequent ones will be used, others are remapped to "<unk>"); if zero, all characters are used.

Source code in npfl138/datasets/uppercase_data.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def __init__(self, window: int, alphabet_size: int = 0) -> None:
    """Load the UppercaseData dataset, downloading it first when the archive is missing.

    Parameters:
      window: The number of characters on each side of the central one in every window.
      alphabet_size: The maximum number of alphabet characters when nonzero (the most
        frequent ones are kept, the others are remapped to "<unk>"); zero keeps all characters.
    """
    path = os.path.basename(self.URL)
    if not os.path.exists(path):
        print(f"Downloading dataset {path}...", file=sys.stderr)
        # Download under a temporary name and rename once finished, so that an
        # interrupted download is not mistaken for a complete archive.
        partial = f"{path}.tmp"
        urllib.request.urlretrieve(self.URL, filename=partial)
        os.rename(partial, path)

    with zipfile.ZipFile(path, "r") as zip_file:
        for dataset in ["train", "dev", "test"]:
            member = f"{os.path.splitext(path)[0]}_{dataset}.txt"
            with zip_file.open(member, "r") as dataset_file:
                data = dataset_file.read().decode("utf-8")

            # Only the train dataset derives an alphabet; dev and test reuse it.
            if dataset == "train":
                alphabet = alphabet_size
            else:
                alphabet = self.train.alphabet
            setattr(self, dataset, self.Dataset(data, window, alphabet=alphabet))

train instance-attribute

train: Dataset

The training dataset.

dev instance-attribute

dev: Dataset

The development dataset.

test instance-attribute

test: Dataset

The test dataset

Warning

The test dataset is lowercased.

evaluate staticmethod

evaluate(gold_dataset: Dataset, predictions: str) -> float

Evaluate the predictions against the gold dataset.

Returns:

  • float – accuracy

Source code in npfl138/datasets/uppercase_data.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
@staticmethod
def evaluate(gold_dataset: Dataset, predictions: str) -> float:
    """Evaluate the `predictions` against the gold dataset.

    Returns:
      accuracy
    """
    gold = gold_dataset.text

    if len(predictions) < len(gold):
        raise RuntimeError(f"The predictions are shorter than gold data: {len(predictions)} vs {len(gold)}.")

    correct = 0
    for i in range(len(gold)):
        # Note that just the lower() condition is not enough, for example
        # u03c2 and u03c3 have both u03c2 as an uppercase character.
        if predictions[i].lower() != gold[i].lower() and predictions[i].upper() != gold[i].upper():
            raise RuntimeError("The predictions and gold data differ on position {}: {} vs {}.".format(
                i, repr(predictions[i:i + 20].lower()), repr(gold[i:i + 20].lower())))

        correct += gold[i] == predictions[i]
    return correct / len(gold)

evaluate_file staticmethod

evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float

Evaluate the file with predictions against the gold dataset.

Returns:

  • float – accuracy

Source code in npfl138/datasets/uppercase_data.py
184
185
186
187
188
189
190
191
192
@staticmethod
def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
    """Read all predictions from the given file and evaluate them against the gold dataset.

    Returns:
      accuracy
    """
    return UppercaseData.evaluate(gold_dataset, predictions_file.read())