Skip to content

TTSDataset

The TTSDataset is a collection of (text, mel_spectrogram) pairs.

  • The data consists of just one dataset:
    • train
  • The train dataset is a torch.utils.data.Dataset instance providing
    • __len__: number of utterances in the dataset;
    • __getitem__: return the requested utterance as a dictionary with keys:
      • "text": the text of an utterance as a string,
      • "mel_spectrogram": the mel spectrogram of an utterance with shape [length, n_mels];
    • char_vocab: an npfl138.Vocabulary instance with the character mapping.

npfl138.datasets.tts_dataset.TTSDataset

Source code in npfl138/datasets/tts_dataset.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class TTSDataset:
    """A text-to-speech dataset of (text, mel spectrogram) pairs.

    The dataset consists of a single split, `train`, which is created by the
    constructor; the data are downloaded and unpacked first when the TSV index
    is not present locally.
    """
    PAD: int = 0
    """The index of the padding token in the vocabulary."""

    Element = TypedDict("Element", {"text": str, "mel_spectrogram": torch.Tensor})
    """The type of a single dataset element, i.e., a single utterance."""

    _URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, path: str, sample_rate: int, window_length: int, hop_length: int, mels: int) -> None:
            """Read the utterance index and prepare the mel spectrogram transform.

            Parameters:
              path: A UTF-8 TSV file with one utterance per line in the form
                `audio_path<TAB>text`; blank lines are ignored.
              sample_rate: The sample rate of the mel spectrogram transform; audio
                stored with a different rate is resampled to it on access.
              window_length: Used both as the FFT size and the window length.
              hop_length: The hop length of the transform.
              mels: The number of mel bins.

            Raises:
              ValueError: If a non-blank line does not consist of exactly two
                tab-separated fields.
            """
            # Load the data; only the audio paths are stored, the audio itself is read lazily.
            self._utterances = []
            with open(path, "r", encoding="utf-8") as tsv_file:
                for line in tsv_file:
                    line = line.rstrip("\r\n")
                    if not line:  # tolerate blank (e.g., trailing) lines instead of failing to unpack
                        continue
                    audio_path, text = line.split("\t")
                    self._utterances.append((text, audio_path))

            # Create the character mapping from all characters occurring in the texts
            self._char_vocab = Vocabulary(sorted(set("".join(text for text, _ in self._utterances))), add_unk=True)

            # Create the MelSpectrogram transform
            self._transform = torchaudio.transforms.MelSpectrogram(
                sample_rate=sample_rate,
                n_fft=window_length,
                win_length=window_length,
                hop_length=hop_length,
                n_mels=mels,
                power=1,  # Vocos-style mel spectrogram; others like BigVGAN use power=2
            )

        def __len__(self) -> int:
            """Return the number of utterances in the dataset."""
            return len(self._utterances)

        def __getitem__(self, index: int) -> "TTSDataset.Element":
            """Return the `index`-th element of the dataset as a dictionary.

            The audio file is loaded on every access, averaged over its channels,
            resampled to the sample rate of the transform when needed, and converted
            to a log-compressed mel spectrogram with the time axis first.
            """
            text, path = self._utterances[index]
            audio, sample_rate = torchaudio.load(path, normalize=True)  # load audio file
            audio = audio.mean(dim=0)  # convert to mono
            if sample_rate != self._transform.sample_rate:  # resample if necessary
                audio = torchaudio.functional.resample(audio, sample_rate, self._transform.sample_rate)
            mel_spectrogram = self._transform(audio).permute(1, 0)  # mel spectrogram
            # Clamp before the logarithm so that silent frames do not produce -inf.
            mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-7))  # dynamic range compression
            return {
                "text": text,
                "mel_spectrogram": mel_spectrogram,
            }

        @property
        def char_vocab(self) -> Vocabulary:
            """The character vocabulary of the dataset."""
            return self._char_vocab

    def __init__(self, name: str, sample_rate: int, window_length: int, hop_length: int, mels: int) -> None:
        """Load the dataset from the given filename, downloading it if necessary.

        Parameters:
          name: The dataset name; `{name}.tsv` is the expected index file and
            `{name}.zip` is the archive fetched when the index is missing.
          sample_rate: Passed to the `Dataset` constructor.
          window_length: Passed to the `Dataset` constructor.
          hop_length: Passed to the `Dataset` constructor.
          mels: Passed to the `Dataset` constructor.
        """
        path = "{}.tsv".format(name)
        if not os.path.exists(path):
            zip_path = "{}.zip".format(name)
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            try:
                # `_URL` already ends with a slash; strip it so the joined URL has no "//".
                urllib.request.urlretrieve("{}/{}".format(self._URL.rstrip("/"), zip_path), filename=zip_path)
                with zipfile.ZipFile(zip_path) as zip_file:
                    zip_file.extractall()
            finally:
                # Remove the archive even when the download or the extraction failed,
                # so that no partial or corrupted file is left behind.
                if os.path.exists(zip_path):
                    os.remove(zip_path)

        self.train = self.Dataset(path, sample_rate, window_length, hop_length, mels)

    train: Dataset
    """The training dataset."""

PAD class-attribute instance-attribute

PAD: int = 0

The index of the padding token in the vocabulary.

Element class-attribute instance-attribute

Element = TypedDict('Element', {'text': str, 'mel_spectrogram': Tensor})

The type of a single dataset element, i.e., a single utterance.

Dataset

Bases: Dataset

Source code in npfl138/datasets/tts_dataset.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class Dataset(torch.utils.data.Dataset):
    def __init__(self, path: str, sample_rate: int, window_length: int, hop_length: int, mels: int) -> None:
        """Read the utterance index and prepare the mel spectrogram transform.

        Parameters:
          path: A UTF-8 TSV file with one utterance per line in the form
            `audio_path<TAB>text`; blank lines are ignored.
          sample_rate: The sample rate of the mel spectrogram transform; audio
            stored with a different rate is resampled to it on access.
          window_length: Used both as the FFT size and the window length.
          hop_length: The hop length of the transform.
          mels: The number of mel bins.

        Raises:
          ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
        """
        # Load the data; only the audio paths are stored, the audio itself is read lazily.
        self._utterances = []
        with open(path, "r", encoding="utf-8") as tsv_file:
            for line in tsv_file:
                line = line.rstrip("\r\n")
                if not line:  # tolerate blank (e.g., trailing) lines instead of failing to unpack
                    continue
                audio_path, text = line.split("\t")
                self._utterances.append((text, audio_path))

        # Create the character mapping from all characters occurring in the texts
        self._char_vocab = Vocabulary(sorted(set("".join(text for text, _ in self._utterances))), add_unk=True)

        # Create the MelSpectrogram transform
        self._transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=window_length,
            win_length=window_length,
            hop_length=hop_length,
            n_mels=mels,
            power=1,  # Vocos-style mel spectrogram; others like BigVGAN use power=2
        )

    def __len__(self) -> int:
        """Return the number of utterances in the dataset."""
        return len(self._utterances)

    def __getitem__(self, index: int) -> "TTSDataset.Element":
        """Return the `index`-th element of the dataset as a dictionary.

        The audio file is loaded on every access, averaged over its channels,
        resampled to the sample rate of the transform when needed, and converted
        to a log-compressed mel spectrogram with the time axis first.
        """
        text, path = self._utterances[index]
        audio, sample_rate = torchaudio.load(path, normalize=True)  # load audio file
        audio = audio.mean(dim=0)  # convert to mono
        if sample_rate != self._transform.sample_rate:  # resample if necessary
            audio = torchaudio.functional.resample(audio, sample_rate, self._transform.sample_rate)
        mel_spectrogram = self._transform(audio).permute(1, 0)  # mel spectrogram
        # Clamp before the logarithm so that silent frames do not produce -inf.
        mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-7))  # dynamic range compression
        return {
            "text": text,
            "mel_spectrogram": mel_spectrogram,
        }

    @property
    def char_vocab(self) -> Vocabulary:
        """The character vocabulary of the dataset."""
        return self._char_vocab

__len__

__len__() -> int

Return the number of utterances in the dataset.

Source code in npfl138/datasets/tts_dataset.py
61
62
63
def __len__(self) -> int:
    """Report how many utterances the dataset holds."""
    utterance_count = len(self._utterances)
    return utterance_count

__getitem__

__getitem__(index: int) -> Element

Return the index-th element of the dataset as a dictionary.

Source code in npfl138/datasets/tts_dataset.py
65
66
67
68
69
70
71
72
73
74
75
76
77
def __getitem__(self, index: int) -> "TTSDataset.Element":
    """Fetch the utterance at position `index` as a text / log-mel spectrogram dictionary."""
    transcript, audio_path = self._utterances[index]

    # Read the waveform from disk and average its channels into a single one.
    waveform, source_rate = torchaudio.load(audio_path, normalize=True)
    waveform = waveform.mean(dim=0)

    # Bring the waveform to the sample rate the mel transform was built for.
    target_rate = self._transform.sample_rate
    if source_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, source_rate, target_rate)

    # Compute the mel spectrogram, swap its two axes, and apply dynamic range
    # compression; the clamp keeps the logarithm finite for zero magnitudes.
    magnitudes = self._transform(waveform).permute(1, 0)
    log_mel = torch.log(torch.clamp(magnitudes, min=1e-7))

    return {"text": transcript, "mel_spectrogram": log_mel}

char_vocab property

char_vocab: Vocabulary

The character vocabulary of the dataset.

__init__

__init__(
    name: str, sample_rate: int, window_length: int, hop_length: int, mels: int
) -> None

Load the dataset from the given filename, downloading it if necessary.

Source code in npfl138/datasets/tts_dataset.py
84
85
86
87
88
89
90
91
92
93
94
95
def __init__(self, name: str, sample_rate: int, window_length: int, hop_length: int, mels: int) -> None:
    """Load the dataset from the given filename, downloading it if necessary.

    Parameters:
      name: The dataset name; `{name}.tsv` is the expected index file and
        `{name}.zip` is the archive fetched when the index is missing.
      sample_rate: Passed to the `Dataset` constructor.
      window_length: Passed to the `Dataset` constructor.
      hop_length: Passed to the `Dataset` constructor.
      mels: Passed to the `Dataset` constructor.
    """
    path = "{}.tsv".format(name)
    if not os.path.exists(path):
        zip_path = "{}.zip".format(name)
        print("Downloading dataset {}...".format(name), file=sys.stderr)
        try:
            # Strip a trailing slash from the base URL so the joined URL has no "//".
            urllib.request.urlretrieve("{}/{}".format(self._URL.rstrip("/"), zip_path), filename=zip_path)
            with zipfile.ZipFile(zip_path) as zip_file:
                zip_file.extractall()
        finally:
            # Remove the archive even when the download or the extraction failed,
            # so that no partial or corrupted file is left behind.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    self.train = self.Dataset(path, sample_rate, window_length, hop_length, mels)

train instance-attribute

train: Dataset

The training dataset.