Skip to content

MorphoAnalyzer

npfl138.datasets.morpho_analyzer.MorphoAnalyzer

Loads morphological analyses stored in a vertical format.

The analyzer provides only a method get(word: str) returning a list of analyses, each containing two fields lemma and tag. If an analysis of the word is not found, an empty list is returned.

Source code in npfl138/datasets/morpho_analyzer.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class MorphoAnalyzer:
    """Loads morphological analyses stored in a vertical format.

    The analyzer provides only a method `get(word: str)` returning a list
    of analyses, each containing two fields `lemma` and `tag`.
    If an analysis of the word is not found, an empty list is returned.
    """

    _URL: str = "https://ufal.mff.cuni.cz/~straka/courses/npfl138/2425/datasets/"

    class LemmaTag:
        """A class representing a morphological analysis of a word."""
        def __init__(self, lemma: str, tag: str) -> None:
            self.lemma = lemma
            """A lemma of the word."""
            self.tag = tag
            """A tag of the word."""

        def __repr__(self) -> str:
            return "(lemma: {}, tag: {})".format(self.lemma, self.tag)

    def __init__(self, dataset: str) -> None:
        """Loads the morphological analyses from the specified dataset.

        The archive `<dataset>.zip` is looked up in the current working
        directory and downloaded from `_URL` when it is not there. Inside it,
        the member `<dataset>.txt` is read as UTF-8, one word per line, with
        tab-separated columns: the word form followed by alternating lemma
        and tag columns.

        Parameters:
          dataset: Name of the dataset, i.e., the archive name without `.zip`.
        """
        path = "{}.zip".format(dataset)
        if not os.path.exists(path):
            print("Downloading dataset {}...".format(dataset), file=sys.stderr)
            # `_URL` already ends with a slash; strip it so that the joined URL
            # does not contain an empty path segment (`datasets//name.zip`).
            url = "{}/{}".format(self._URL.rstrip("/"), path)
            # Download to a temporary name and rename afterwards, so that an
            # interrupted download is never mistaken for a complete archive.
            urllib.request.urlretrieve(url, filename="{}.tmp".format(path))
            os.rename("{}.tmp".format(path), path)

        self.analyses = {}
        with zipfile.ZipFile(path, "r") as zip_file:
            with zip_file.open("{}.txt".format(dataset), "r") as analyses_file:
                for line in analyses_file:
                    # Strip both LF and CRLF line endings; otherwise a stray
                    # "\r" would end up inside the last tag of every line.
                    line = line.decode("utf-8").rstrip("\r\n")
                    if not line:
                        # A blank line carries no word form; do not register "".
                        continue
                    columns = line.split("\t")

                    # Columns 1, 3, 5, ... are lemmas and 2, 4, 6, ... are tags;
                    # `zip` drops a trailing lemma that has no matching tag.
                    self.analyses[columns[0]] = [
                        self.LemmaTag(lemma, tag)
                        for lemma, tag in zip(columns[1::2], columns[2::2])
                    ]

    def get(self, word: str) -> list[LemmaTag]:
        """Returns a (possibly empty) list of morphological analyses for the given word."""
        return self.analyses.get(word, [])

LemmaTag

A class representing a morphological analysis of a word.

Source code in npfl138/datasets/morpho_analyzer.py
22
23
24
25
26
27
28
29
30
31
class LemmaTag:
    """A single morphological analysis of a word: a lemma paired with a tag."""
    def __init__(self, lemma: str, tag: str) -> None:
        self.lemma = lemma
        """The lemma assigned to the word by this analysis."""
        self.tag = tag
        """The tag assigned to the word by this analysis."""

    def __repr__(self) -> str:
        # Human-readable form used when printing lists of analyses.
        return f"(lemma: {self.lemma}, tag: {self.tag})"

lemma instance-attribute

lemma = lemma

A lemma of the word.

tag instance-attribute

tag = tag

A tag of the word.

__init__

__init__(dataset: str) -> None

Loads the morphological analyses from the specified dataset.

Source code in npfl138/datasets/morpho_analyzer.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(self, dataset: str) -> None:
    """Loads the morphological analyses from the specified dataset.

    The archive `<dataset>.zip` is looked up in the current working
    directory and downloaded from `self._URL` when it is not there. Inside
    it, the member `<dataset>.txt` is read as UTF-8, one word per line, with
    tab-separated columns: the word form followed by alternating lemma and
    tag columns. The result is stored in `self.analyses`, a dictionary
    mapping each word form to a list of `self.LemmaTag` instances.

    Parameters:
      dataset: Name of the dataset, i.e., the archive name without `.zip`.
    """
    path = "{}.zip".format(dataset)
    if not os.path.exists(path):
        print("Downloading dataset {}...".format(dataset), file=sys.stderr)
        # The base URL already ends with a slash; strip it so that the joined
        # URL does not contain an empty path segment (`datasets//name.zip`).
        url = "{}/{}".format(self._URL.rstrip("/"), path)
        # Download to a temporary name and rename afterwards, so that an
        # interrupted download is never mistaken for a complete archive.
        urllib.request.urlretrieve(url, filename="{}.tmp".format(path))
        os.rename("{}.tmp".format(path), path)

    self.analyses = {}
    with zipfile.ZipFile(path, "r") as zip_file:
        with zip_file.open("{}.txt".format(dataset), "r") as analyses_file:
            for line in analyses_file:
                # Strip both LF and CRLF line endings; otherwise a stray
                # "\r" would end up inside the last tag of every line.
                line = line.decode("utf-8").rstrip("\r\n")
                if not line:
                    # A blank line carries no word form; do not register "".
                    continue
                columns = line.split("\t")

                # Columns 1, 3, 5, ... are lemmas and 2, 4, 6, ... are tags;
                # `zip` drops a trailing lemma that has no matching tag.
                self.analyses[columns[0]] = [
                    self.LemmaTag(lemma, tag)
                    for lemma, tag in zip(columns[1::2], columns[2::2])
                ]

get

get(word: str) -> list[LemmaTag]

Returns a (possibly empty) list of morphological analyses for the given word.

Source code in npfl138/datasets/morpho_analyzer.py
53
54
55
def get(self, word: str) -> list[LemmaTag]:
    """Looks up `word` and returns its analyses, or an empty list for an unknown word."""
    known = self.analyses
    if word in known:
        return known[word]
    # Unknown words yield a fresh empty list rather than an error.
    return []