
CommonVoiceCs

npfl138.datasets.common_voice_cs.CommonVoiceCs

Source code in npfl138/datasets/common_voice_cs.py
class CommonVoiceCs:
    PAD: int = 0
    """The index of the padding token in the vocabulary."""

    MFCC_DIM: int = 13
    """The dimensionality of the MFCC features."""

    LETTERS: int = 48
    """The number of letters used in the dataset."""
    LETTER_NAMES: list[str] = [
        "[PAD]",
        " ", "a", "á", "ä", "b", "c", "č", "d", "ď", "e", "é", "è", "ě", "f", "g", "h",
        "i", "í", "ï", "j", "k", "l", "m", "n", "ň", "o", "ó", "ö", "p", "q", "r", "ř",
        "s", "š", "t", "ť", "u", "ú", "ů", "ü", "v", "w", "x", "y", "ý", "z", "ž",
    ]
    """The list of letter strings used in the dataset."""

    Element = TypedDict("Element", {"mfccs": torch.Tensor, "sentence": str})
    """The type of a single dataset element."""

    URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    class Dataset(TFRecordDataset):
        def __init__(self, path: str, size: int, decode_on_demand: bool) -> None:
            super().__init__(path, size, decode_on_demand)

        def __len__(self) -> int:
            """Return the number of elements in the dataset."""
            return super().__len__()

        def __getitem__(self, index: int) -> "CommonVoiceCs.Element":
            """Return the `index`-th element of the dataset."""
            return super().__getitem__(index)

        def _tfrecord_decode(self, data: dict, indices: dict, index: int) -> "CommonVoiceCs.Element":
            return {
                "mfccs": data["mfccs"][indices["mfccs"][index]:indices["mfccs"][index + 1]].view(
                    -1, CommonVoiceCs.MFCC_DIM),
                "sentence": data["sentence"][
                    indices["sentence"][index]:indices["sentence"][index + 1]].numpy().tobytes().decode("utf-8"),
            }

    def __init__(self, decode_on_demand: bool = False) -> None:
        "Load the CommonVoiceCs dataset, downloading it if necessary."
        for dataset, size in [("train", 9_773), ("dev", 904), ("test", 3_240)]:
            path = "common_voice_cs.{}.tfrecord".format(dataset)
            if not os.path.exists(path):
                print("Downloading file {}...".format(path), file=sys.stderr)
                urllib.request.urlretrieve("{}/{}".format(self.URL, path), filename="{}.tmp".format(path))
                os.rename("{}.tmp".format(path), path)

            setattr(self, dataset, self.Dataset(path, size, decode_on_demand))

        self._letters_vocab = Vocabulary(self.LETTER_NAMES[1:])

    train: Dataset
    """The training dataset."""
    dev: Dataset
    """The development dataset."""
    test: Dataset
    """The test dataset."""

    @property
    def letters_vocab(self) -> Vocabulary:
        """The [npfl138.Vocabulary][] object of the letters used in the dataset."""
        return self._letters_vocab

    # Methods for generating MFCC features.
    def load_audio(self, path: str, target_sample_rate: int | None = None) -> tuple[torch.Tensor, int]:
        """Load an audio file and return the audio tensor and sample rate.

        Optionally resample the audio to the target sample rate.
        """
        audio, sample_rate = torchaudio.load(path)
        if target_sample_rate is not None and target_sample_rate != sample_rate:
            audio = torchaudio.functional.resample(audio, sample_rate, target_sample_rate)
            sample_rate = target_sample_rate
        return torch.mean(audio, dim=0), sample_rate

    def mfcc_extract(self, audio: torch.Tensor, sample_rate: int = 16_000) -> torch.Tensor:
        """Extract MFCC features from an audio tensor.

        This function can be used to extract MFCC features from any audio,
        allowing speech recognition to be performed on arbitrary recordings.
        """
        assert sample_rate == 16_000, "Only 16k sample rate is supported"

        if not hasattr(self, "_mfcc_fn"):
            # Compute a 1024-point STFT with frames of 64 ms and 75% overlap.
            # Then warp the linear scale spectrograms into the mel-scale.
            # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
            # Finally, compute MFCCs from log-mel-spectrograms and take the first
            # `CommonVoiceCs.MFCC_DIM=13` of them.
            self._mfcc_fn = torchaudio.transforms.MFCC(
                sample_rate=16_000, n_mfcc=self.MFCC_DIM, log_mels=True,
                melkwargs={"n_fft": 1024, "win_length": 1024, "hop_length": 256,
                           "f_min": 80., "f_max": 7600., "n_mels": 80, "center": False}
            )

        # Compute MFCCs of shape `[sequence_length, CommonVoiceCs.MFCC_DIM=13]`.
        mfccs = self._mfcc_fn(audio).permute(1, 0)

        # Scale the first MFCC coefficient by sqrt(2) for consistency with the CommonVoiceCs dataset.
        mfccs[:, 0] *= 2**0.5
        return mfccs

    # The EditDistanceMetric
    EditDistanceMetric = metrics.EditDistance
    """The edit distance metric used for evaluation."""

    # Evaluation infrastructure
    @staticmethod
    def evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float:
        """Evaluate the `predictions` against the gold dataset.

        Returns:
          edit_distance: The average edit distance of the predictions in percentages.
        """
        gold = [example["sentence"] for example in gold_dataset]

        if len(predictions) != len(gold):
            raise RuntimeError("The predictions are of different size than gold data: {} vs {}".format(
                len(predictions), len(gold)))

        return 100 * metrics.EditDistance().update(predictions, gold).compute()

    @staticmethod
    def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
        """Evaluate the file with predictions against the gold dataset.

        Returns:
          edit_distance: The average edit distance of the predictions in percentages.
        """
        predictions = []
        for line in predictions_file:
            predictions.append(line.rstrip("\n"))
        return CommonVoiceCs.evaluate(gold_dataset, predictions)
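
A minimal usage sketch follows; the splits, element fields, and shapes are taken from the source above:

from npfl138.datasets.common_voice_cs import CommonVoiceCs

common_voice = CommonVoiceCs()  # downloads the three TFRecord files on first use

example = common_voice.train[0]  # a `CommonVoiceCs.Element`
print(example["mfccs"].shape)    # torch.Size([sequence_length, 13])
print(example["sentence"])       # the gold transcription as a `str`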

PAD class-attribute instance-attribute

PAD: int = 0

The index of the padding token in the vocabulary.

MFCC_DIM class-attribute instance-attribute

MFCC_DIM: int = 13

The dimensionality of the MFCC features.

LETTERS class-attribute instance-attribute

LETTERS: int = 48

The number of letters used in the dataset.

LETTER_NAMES class-attribute instance-attribute

LETTER_NAMES: list[str] = [
    "[PAD]",
    " ", "a", "á", "ä", "b", "c", "č", "d", "ď", "e", "é", "è", "ě", "f", "g", "h",
    "i", "í", "ï", "j", "k", "l", "m", "n", "ň", "o", "ó", "ö", "p", "q", "r", "ř",
    "s", "š", "t", "ť", "u", "ú", "ů", "ü", "v", "w", "x", "y", "ý", "z", "ž",
]

The list of letter strings used in the dataset.

Element class-attribute instance-attribute

Element = TypedDict('Element', {'mfccs': Tensor, 'sentence': str})

The type of a single dataset element.
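
Because both fields have variable length, batching requires padding. A hypothetical collate sketch (the `collate` helper is not part of the class, and it assumes the npfl138.Vocabulary API provides an `indices` method mapping strings to integer ids):

import torch
from npfl138.datasets.common_voice_cs import CommonVoiceCs

def collate(batch, letters_vocab):
    # Pad the variable-length MFCC sequences with zero frames.
    mfccs = torch.nn.utils.rnn.pad_sequence(
        [example["mfccs"] for example in batch], batch_first=True)
    mfccs_lengths = torch.tensor([len(example["mfccs"]) for example in batch])
    # Encode the sentences as letter indices, padding with `CommonVoiceCs.PAD=0`.
    sentences = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(letters_vocab.indices(list(example["sentence"]))) for example in batch],
        batch_first=True, padding_value=CommonVoiceCs.PAD)
    return mfccs, mfccs_lengths, sentences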

Dataset

Bases: TFRecordDataset

Source code in npfl138/datasets/common_voice_cs.py
class Dataset(TFRecordDataset):
    def __init__(self, path: str, size: int, decode_on_demand: bool) -> None:
        super().__init__(path, size, decode_on_demand)

    def __len__(self) -> int:
        """Return the number of elements in the dataset."""
        return super().__len__()

    def __getitem__(self, index: int) -> "CommonVoiceCs.Element":
        """Return the `index`-th element of the dataset."""
        return super().__getitem__(index)

    def _tfrecord_decode(self, data: dict, indices: dict, index: int) -> "CommonVoiceCs.Element":
        return {
            "mfccs": data["mfccs"][indices["mfccs"][index]:indices["mfccs"][index + 1]].view(
                -1, CommonVoiceCs.MFCC_DIM),
            "sentence": data["sentence"][
                indices["sentence"][index]:indices["sentence"][index + 1]].numpy().tobytes().decode("utf-8"),
        }
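
Since the class implements `__len__` and `__getitem__` (and assuming `TFRecordDataset` behaves like a map-style `torch.utils.data.Dataset`), it can be wrapped directly in a `torch.utils.data.DataLoader`; a custom collate function is needed because the elements are dicts with variable-length tensors, as in the batching sketch above:

import torch

common_voice = CommonVoiceCs()
loader = torch.utils.data.DataLoader(
    common_voice.train, batch_size=16, shuffle=True,
    collate_fn=lambda batch: batch)  # identity collate: a list of Elements
for batch in loader:
    ...  # `batch` is a list of 16 `CommonVoiceCs.Element` dicts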

__len__

__len__() -> int

Return the number of elements in the dataset.

Source code in npfl138/datasets/common_voice_cs.py
def __len__(self) -> int:
    """Return the number of elements in the dataset."""
    return super().__len__()

__getitem__

__getitem__(index: int) -> Element

Return the index-th element of the dataset.

Source code in npfl138/datasets/common_voice_cs.py
def __getitem__(self, index: int) -> "CommonVoiceCs.Element":
    """Return the `index`-th element of the dataset."""
    return super().__getitem__(index)

__init__

__init__(decode_on_demand: bool = False) -> None

Load the CommonVoiceCs dataset, downloading it if necessary.

Source code in npfl138/datasets/common_voice_cs.py
def __init__(self, decode_on_demand: bool = False) -> None:
    "Load the CommonVoiceCs dataset, downloading it if necessary."
    for dataset, size in [("train", 9_773), ("dev", 904), ("test", 3_240)]:
        path = "common_voice_cs.{}.tfrecord".format(dataset)
        if not os.path.exists(path):
            print("Downloading file {}...".format(path), file=sys.stderr)
            urllib.request.urlretrieve("{}/{}".format(self.URL, path), filename="{}.tmp".format(path))
            os.rename("{}.tmp".format(path), path)

        setattr(self, dataset, self.Dataset(path, size, decode_on_demand))

    self._letters_vocab = Vocabulary(self.LETTER_NAMES[1:])

train instance-attribute

train: Dataset

The training dataset.

dev instance-attribute

dev: Dataset

The development dataset.

test instance-attribute

test: Dataset

The test dataset.

letters_vocab property

letters_vocab: Vocabulary

The npfl138.Vocabulary object of the letters used in the dataset.
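
For example, sentences can be converted to letter indices and back through this vocabulary (assuming the npfl138.Vocabulary API provides `indices` and `strings` lookups):

common_voice = CommonVoiceCs()
ids = common_voice.letters_vocab.indices(list("příliš žluťoučký kůň"))
text = "".join(common_voice.letters_vocab.strings(ids))  # round-trips the sentence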

load_audio

load_audio(
    path: str, target_sample_rate: int | None = None
) -> tuple[Tensor, int]

Load an audio file and return the audio tensor and sample rate.

Optionally resample the audio to the target sample rate.

Source code in npfl138/datasets/common_voice_cs.py
def load_audio(self, path: str, target_sample_rate: int | None = None) -> tuple[torch.Tensor, int]:
    """Load an audio file and return the audio tensor and sample rate.

    Optionally resample the audio to the target sample rate.
    """
    audio, sample_rate = torchaudio.load(path)
    if target_sample_rate is not None and target_sample_rate != sample_rate:
        audio = torchaudio.functional.resample(audio, sample_rate, target_sample_rate)
        sample_rate = target_sample_rate
    return torch.mean(audio, dim=0), sample_rate
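
A usage sketch ("recording.wav" is a placeholder path; the returned audio is mono, averaged over channels):

common_voice = CommonVoiceCs()
audio, sample_rate = common_voice.load_audio("recording.wav", target_sample_rate=16_000)
print(audio.shape, sample_rate)  # torch.Size([num_samples]) 16000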

mfcc_extract

mfcc_extract(audio: Tensor, sample_rate: int = 16000) -> Tensor

Extract MFCC features from an audio tensor.

This function can be used to extract MFCC features from any audio, allowing speech recognition to be performed on arbitrary recordings.

Source code in npfl138/datasets/common_voice_cs.py
def mfcc_extract(self, audio: torch.Tensor, sample_rate: int = 16_000) -> torch.Tensor:
    """Extract MFCC features from an audio tensor.

    This function can be used to extract MFCC features from any audio,
    allowing speech recognition to be performed on arbitrary recordings.
    """
    assert sample_rate == 16_000, "Only 16k sample rate is supported"

    if not hasattr(self, "_mfcc_fn"):
        # Compute a 1024-point STFT with frames of 64 ms and 75% overlap.
        # Then warp the linear scale spectrograms into the mel-scale.
        # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
        # Finally, compute MFCCs from log-mel-spectrograms and take the first
        # `CommonVoiceCs.MFCC_DIM=13` of them.
        self._mfcc_fn = torchaudio.transforms.MFCC(
            sample_rate=16_000, n_mfcc=self.MFCC_DIM, log_mels=True,
            melkwargs={"n_fft": 1024, "win_length": 1024, "hop_length": 256,
                       "f_min": 80., "f_max": 7600., "n_mels": 80, "center": False}
        )

    # Compute MFCCs of shape `[sequence_length, CommonVoiceCs.MFCC_DIM=13]`.
    mfccs = self._mfcc_fn(audio).permute(1, 0)

    # Scale the first MFCC coefficient by sqrt(2) for consistency with the CommonVoiceCs dataset.
    mfccs[:, 0] *= 2**0.5
    return mfccs
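
Combined with `load_audio`, this produces features in the same format as the dataset elements ("recording.wav" is again a placeholder):

common_voice = CommonVoiceCs()
audio, sample_rate = common_voice.load_audio("recording.wav", target_sample_rate=16_000)
mfccs = common_voice.mfcc_extract(audio, sample_rate)
print(mfccs.shape)  # torch.Size([sequence_length, 13])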

EditDistanceMetric class-attribute instance-attribute

EditDistanceMetric = EditDistance

The edit distance metric used for evaluation.
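
Mirroring its use in `evaluate` below, the metric can also be updated incrementally during validation:

metric = CommonVoiceCs.EditDistanceMetric()
metric.update(["ahoj svete"], ["ahoj světe"])  # predicted batch, then gold batch
print(100 * metric.compute())  # average edit distance in percentages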

evaluate staticmethod

evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float

Evaluate the predictions against the gold dataset.

Returns:

  • edit_distance (float) – The average edit distance of the predictions in percentages.

Source code in npfl138/datasets/common_voice_cs.py
@staticmethod
def evaluate(gold_dataset: Dataset, predictions: Sequence[str]) -> float:
    """Evaluate the `predictions` against the gold dataset.

    Returns:
      edit_distance: The average edit distance of the predictions in percentages.
    """
    gold = [example["sentence"] for example in gold_dataset]

    if len(predictions) != len(gold):
        raise RuntimeError("The predictions are of different size than gold data: {} vs {}".format(
            len(predictions), len(gold)))

    return 100 * metrics.EditDistance().update(predictions, gold).compute()
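
For example, evaluating the gold dev sentences against themselves yields a zero edit distance:

common_voice = CommonVoiceCs()
predictions = [example["sentence"] for example in common_voice.dev]
print(CommonVoiceCs.evaluate(common_voice.dev, predictions))  # 0.0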

evaluate_file staticmethod

evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float

Evaluate the file with predictions against the gold dataset.

Returns:

  • edit_distance (float) – The average edit distance of the predictions in percentages.

Source code in npfl138/datasets/common_voice_cs.py
@staticmethod
def evaluate_file(gold_dataset: Dataset, predictions_file: TextIO) -> float:
    """Evaluate the file with predictions against the gold dataset.

    Returns:
      edit_distance: The average edit distance of the predictions in percentages.
    """
    predictions = []
    for line in predictions_file:
        predictions.append(line.rstrip("\n"))
    return CommonVoiceCs.evaluate(gold_dataset, predictions)
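
A usage sketch, assuming a hypothetical predictions.txt containing one predicted sentence per line in dev-set order:

common_voice = CommonVoiceCs()
with open("predictions.txt", "r", encoding="utf-8") as predictions_file:
    print(CommonVoiceCs.evaluate_file(common_voice.dev, predictions_file))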