Skip to content

Vocabulary

npfl138.Vocabulary

A class for managing mapping between strings and indices.

The vocabulary is initialized with a list of strings. It provides:

  • __len__: the number of strings in the vocabulary,
  • __iter__: the iterator over strings in the vocabulary,
  • string(index: int) -> str: the string for a given vocabulary index,
  • strings(indices: Sequence[int]) -> list[str]: the list of strings for the given indices,
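  • index(string: str) -> int | None: the index of a given string in the vocabulary, or Vocabulary.UNK (None unless add_unk=True) for an unknown string,
  • indices(strings: Sequence[str]) -> list[int | None]: the list of indices for given strings, with the same fallback for unknown strings.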
Source code in npfl138/vocabulary.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
class Vocabulary:
    """ A class for managing mapping between strings and indices.

    The vocabulary is initialized with a list of strings. It provides:

    - `__len__`: the number of strings in the vocabulary,
    - `__iter__`: the iterator over strings in the vocabulary,
    - `string(index: int) -> str`: the string for a given vocabulary index,
    - `strings(indices: Sequence[int]) -> list[str]`: the list of strings for the given indices,
    - `index(string: str) -> int`: the index of a given string in the vocabulary,
    - `indices(strings: Sequence[str]) -> list[int]`: the list of indices for given strings.
    """
    PAD: int = 0
    """The index of the padding token."""
    UNK: int = 1
    """The index of the unknown token, if present."""

    def __init__(self, strings: Sequence[str], add_unk: bool = False) -> None:
        """Initializes the vocabulary with the given list of strings.

        The `Vocabulary.PAD` is always the first token in the vocabulary;
        `Vocabulary.UNK` is the second token but only when `add_unk=True`.
        """
        self._strings = ["[PAD]"] + (["[UNK]"] if add_unk else [])
        self._strings.extend(strings)
        self._string_map = {string: index for index, string in enumerate(self._strings)}
        if not add_unk:
            self.UNK = None

    def __len__(self) -> int:
        """Returns the number of strings in the vocabulary."""
        return len(self._strings)

    def __iter__(self) -> Iterable[str]:
        """Returns an iterator over strings in the vocabulary."""
        return iter(self._strings)

    def string(self, index: int) -> str:
        """Returns the string for a given vocabulary index."""
        return self._strings[index]

    def strings(self, indices: Sequence[int]) -> list[str]:
        """Returns the list of strings for the given indices."""
        return [self._strings[index] for index in indices]

    def index(self, string: str) -> int | None:
        """Returns the index of a given string in the vocabulary."""
        return self._string_map.get(string, self.UNK)

    def indices(self, strings: Sequence[str]) -> list[int | None]:
        """Returns the list of indices for given strings."""
        return [self._string_map.get(string, self.UNK) for string in strings]

PAD class-attribute instance-attribute

PAD: int = 0

The index of the padding token.

UNK class-attribute instance-attribute

UNK: int = 1

The index of the unknown token, if present.

__init__

__init__(strings: Sequence[str], add_unk: bool = False) -> None

Initializes the vocabulary with the given list of strings.

The Vocabulary.PAD is always the first token in the vocabulary; Vocabulary.UNK is the second token but only when add_unk=True.

Source code in npfl138/vocabulary.py
26
27
28
29
30
31
32
33
34
35
36
def __init__(self, strings: Sequence[str], add_unk: bool = False) -> None:
    """Builds the vocabulary from the given strings.

    The padding token `"[PAD]"` is always stored first; the unknown token
    `"[UNK]"` is stored second only when `add_unk=True`. When `add_unk` is
    false, the instance attribute `UNK` is set to `None`.
    """
    self._strings = ["[PAD]", "[UNK]"] if add_unk else ["[PAD]"]
    self._strings += strings
    # Pairs are inserted in index order, so a repeated string ends up mapped
    # to the position of its last occurrence.
    self._string_map = dict(zip(self._strings, range(len(self._strings))))
    if add_unk:
        return
    # No unknown token: lookups of missing strings should yield None.
    self.UNK = None

__len__

__len__() -> int

Returns the number of strings in the vocabulary.

Source code in npfl138/vocabulary.py
38
39
40
def __len__(self) -> int:
    """Gives the total number of strings stored in the vocabulary."""
    stored = self._strings
    return len(stored)

__iter__

__iter__() -> Iterable[str]

Returns an iterator over strings in the vocabulary.

Source code in npfl138/vocabulary.py
42
43
44
def __iter__(self) -> Iterable[str]:
    """Gives an iterator over the stored strings in index order."""
    stored = self._strings
    return iter(stored)

string

string(index: int) -> str

Returns the string for a given vocabulary index.

Source code in npfl138/vocabulary.py
46
47
48
def string(self, index: int) -> str:
    """Gives the string stored at the given vocabulary index."""
    stored = self._strings
    return stored[index]

strings

strings(indices: Sequence[int]) -> list[str]

Returns the list of strings for the given indices.

Source code in npfl138/vocabulary.py
50
51
52
def strings(self, indices: Sequence[int]) -> list[str]:
    """Gives the strings stored at the given vocabulary indices, in order."""
    stored = self._strings
    selected = []
    for position in indices:
        selected.append(stored[position])
    return selected

index

index(string: str) -> int | None

Returns the index of a given string in the vocabulary.

Source code in npfl138/vocabulary.py
54
55
56
def index(self, string: str) -> int | None:
    """Returns the index of a given string in the vocabulary."""
    return self._string_map.get(string, self.UNK)

indices

indices(strings: Sequence[str]) -> list[int | None]

Returns the list of indices for given strings.

Source code in npfl138/vocabulary.py
58
59
60
def indices(self, strings: Sequence[str]) -> list[int | None]:
    """Returns the list of indices for given strings."""
    return [self._string_map.get(string, self.UNK) for string in strings]