156 | class UppercaseData:
LABELS: int = 2
_URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/uppercase_data.zip"
class Dataset:
def __init__(self, data: str, window: int, alphabet: int | list[str], label_dtype: torch.dtype) -> None:
self._window = window
self._text = data
self._size = len(data)
# Create alphabet_map
alphabet_map = {"<pad>": 0, "<unk>": 1}
if not isinstance(alphabet, int):
# Check that <pad> and <unk> are present at the beginning
if alphabet[:2] == ["<pad>", "<unk>"]:
alphabet = alphabet[2:]
else:
print("UppercaseData warning: The alphabet should start with <pad> and <unk>, prepending them.")
for index, letter in enumerate(alphabet, len(alphabet_map)):
if letter in alphabet_map:
raise ValueError("UppercaseData: Duplicated character '{}' in the alphabet.".format(letter))
alphabet_map[letter] = index
else:
# Find most frequent characters
freqs = {}
for char in self._text.lower():
freqs[char] = freqs.get(char, 0) + 1
most_frequent = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
for i, (char, freq) in enumerate(most_frequent, len(alphabet_map)):
alphabet_map[char] = i
if alphabet and len(alphabet_map) >= alphabet:
break
# Remap lowercased input characters using the alphabet_map and create labels
lcletters = np.zeros(self._size + 2 * window, dtype=np.int64)
labels = np.zeros(self._size, dtype=np.float32)
for i in range(self._size):
char = self._text[i].lower()
if char not in alphabet_map:
char = "<unk>"
lcletters[i + window] = alphabet_map[char]
labels[i] = self._text[i].isupper()
self._windows = torch.from_numpy(lcletters).unfold(0, 2 * window + 1, 1)
self._labels = torch.from_numpy(labels).to(dtype=label_dtype)
# Compute alphabet
self._alphabet = [None] * len(alphabet_map)
for key, value in alphabet_map.items():
self._alphabet[value] = key
@property
def size(self) -> int:
return self._size
@property
def text(self) -> str:
return self._text
@property
def alphabet(self) -> list[str]:
return self._alphabet
@property
def windows(self) -> torch.Tensor:
return self._windows
@property
def labels(self) -> torch.Tensor:
return self._labels
def __init__(self, window: int, alphabet_size: int = 0, label_dtype: torch.dtype = torch.float32) -> None:
path = os.path.basename(self._URL)
if not os.path.exists(path):
print("Downloading dataset {}...".format(path), file=sys.stderr)
urllib.request.urlretrieve(self._URL, filename="{}.tmp".format(path))
os.rename("{}.tmp".format(path), path)
with zipfile.ZipFile(path, "r") as zip_file:
for dataset in ["train", "dev", "test"]:
with zip_file.open("{}_{}.txt".format(os.path.splitext(path)[0], dataset), "r") as dataset_file:
data = dataset_file.read().decode("utf-8")
setattr(self, dataset, self.Dataset(
data,
window,
alphabet=alphabet_size if dataset == "train" else self.train.alphabet,
label_dtype=label_dtype,
))
train: Dataset
dev: Dataset
test: Dataset
# Evaluation infrastructure.
@staticmethod
def evaluate(gold_dataset: Dataset, predictions: str) -> float:
gold = gold_dataset.text
if len(predictions) < len(gold):
raise RuntimeError("The predictions are shorter than gold data: {} vs {}.".format(
len(predictions), len(gold)))
correct = 0
for i in range(len(gold)):
# Note that just the lower() condition is not enough, for example
# u03c2 and u03c3 have both u03c2 as an uppercase character.
if predictions[i].lower() != gold[i].lower() and predictions[i].upper() != gold[i].upper():
raise RuntimeError("The predictions and gold data differ on position {}: {} vs {}.".format(
i, repr(predictions[i:i + 20].lower()), repr(gold[i:i + 20].lower())))
correct += gold[i] == predictions[i]
return 100 * correct / len(gold)
@staticmethod
def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
predictions = predictions_file.read()
return UppercaseData.evaluate(gold_dataset, predictions)