#!btred -e autostart # -*- cperl -*- # PADT -- modified by Otakar Smrz # b/n-tred macro converting PDT 2.0 a-layer trees into CoNLL line-oriented data format # written by Zdenek Zabokrtsky, zabokrtsky@ufal.mff.cuni.cz #include "../lib/Analytic.mak" package TredMacro; use Encode::Arabic::Buckwalter ':xml'; use strict; our $root; our @test = map { decode "buckwalter", $_ } qw '~ ~iy ~a ~A'; our @reps = map { decode "buckwalter", $_ } qw 'niy iy ya nA'; our %miss; our %form; our %data = ( 'OanA_PRON_1S' => 'OanA', '~a_PRON_1S' => 'OanA', '~iy_PRON_1S' => 'OanA', 'iy_POSS_PRON_1S' => 'OanA', 'iy_PRON_1S' => 'OanA', 'niy_IVSUFF_DO:1S' => 'OanA', 'niy_PRON_1S' => 'OanA', 'niy_PVSUFF_DO:1S' => 'OanA', 'y_PRON_1S' => 'OanA', 'ya_POSS_PRON_1S' => 'OanA', 'ya_PRON_1S' => 'OanA', 'ka_POSS_PRON_2MS' => 'Oanota', 'ka_PRON_2MS' => 'Oanota', 'ki_PRON_2FS' => 'Oanoti', 'huwa_PRON_3MS' => 'huwa', 'hu_IVSUFF_DO:3MS' => 'huwa', 'hu_POSS_PRON_3MS' => 'huwa', 'hu_PRON_3MS' => 'huwa', 'hu_PVSUFF_DO:3MS' => 'huwa', 'hi_IVSUFF_DO:3MS' => 'huwa', 'hi_POSS_PRON_3MS' => 'huwa', 'hi_PRON_3MS' => 'huwa', 'hiya_PRON_3FS' => 'hiya', 'hA_IVSUFF_DO:3FS' => 'hiya', 'hA_POSS_PRON_3FS' => 'hiya', 'hA_PRON_3FS' => 'hiya', 'hA_PVSUFF_DO:3FS' => 'hiya', 'naHonu_PRON_1P' => 'naHonu', 'nA_IVSUFF_DO:1P' => 'naHonu', 'nA_POSS_PRON_1P' => 'naHonu', 'nA_PRON_1P' => 'naHonu', 'nA_PVSUFF_DO:1P' => 'naHonu', 'unA_IVSUFF_DO:1P' => 'naHonu', '~A_PRON_1P' => 'naHonu', 'kum_POSS_PRON_2MP' => 'Oanotum', 'kum_PRON_2MP' => 'Oanotum', 'kum_PVSUFF_DO:2MP' => 'Oanotum', 'hum_IVSUFF_DO:3MP' => 'hum', 'hum_POSS_PRON_3MP' => 'hum', 'hum_PRON_3MP' => 'hum', 'hum_PVSUFF_DO:3MP' => 'hum', 'him_POSS_PRON_3MP' => 'hum', 'him_PRON_3MP' => 'hum', 'hun~a_POSS_PRON_3FP' => 'hun~a', 'hun~a_PRON_3FP' => 'hun~a', 'hun~a_PVSUFF_DO:3FP' => 'hun~a', 'hin~a_POSS_PRON_3FP' => 'hun~a', 'hin~a_PRON_3FP' => 'hun~a', 'humA_IVSUFF_DO:3D' => 'humA', 'humA_POSS_PRON_3D' => 'humA', 'humA_PRON_3D' => 'humA', 'humA_PVSUFF_DO:3D' => 'humA', 'himA_IVSUFF_DO:3D' => 'humA', 'himA_POSS_PRON_3D' => 'humA', 'himA_PRON_3D' => 'humA', ); our %look = ( 'POSS_PRON_1S' => 'PRON', 'POSS_PRON_2MP' => 'PRON', 'POSS_PRON_2MS' => 'PRON', 'POSS_PRON_3FP' => 'PRON', 'PRON_1S' => 'PRON', 'PRON_2FS' => 'PRON', 'PRON_2MP' => 'PRON', 'PRON_3FP' => 'PRON', 'PVSUFF_DO:1P' => 'PRON', 'PVSUFF_DO:1S' => 'PRON', 'PVSUFF_DO:2MP' => 'PRON', 'PVSUFF_DO:3FP' => 'PRON', 'IVSUFF_DO:1P' => 'PRON', 'IVSUFF_DO:1S' => 'PRON', 'IVSUFF_DO:3D' => 'PRON', 'IVSUFF_DO:3FS' => 'PRON', 'IVSUFF_DO:3MP' => 'PRON', 'IVSUFF_DO:3MS' => 'PRON', 'POSS_PRON_1P' => 'PRON', 'POSS_PRON_3D' => 'PRON', 'POSS_PRON_3FS' => 'PRON', 'POSS_PRON_3MP' => 'PRON', 'POSS_PRON_3MS' => 'PRON', 'PRON_1P' => 'PRON', 'PRON_2MS' => 'PRON', 'PRON_3D' => 'PRON', 'PRON_3FS' => 'PRON', 'PRON_3MP' => 'PRON', 'PRON_3MS' => 'PRON', 'PVSUFF_DO:3D' => 'PRON', 'PVSUFF_DO:3FS' => 'PRON', 'PVSUFF_DO:3MP' => 'PRON', 'PVSUFF_DO:3MS' => 'PRON', ); sub look { my ($prefix, $suffix) = @_; my ($buck, $utf8); $miss{$suffix}++ unless exists $look{$suffix}; if ($suffix =~ /^(?:[CIP]VSUFF_DO|POSS_PRON|PRON)/ ) { $buck = encode 'buckwalter', $prefix; $form{$buck . '_' . $suffix}++ unless $data{$buck . '_' . $suffix}; $utf8 = decode 'buckwalter', $data{$buck . '_' . $suffix}; return $utf8 . '_1'; } else { return $prefix . '_' . $suffix; } } sub scores { my $text = shift; $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/\_/g; return $text; } sub autostart { open F, '>:encoding(utf8)', FileName() . '.conll'; select F; do { foreach my $node (sort { $a->{'ord'} <=> $b->{'ord'} } $root->descendants()) { my $id = $node->{'ord'}; my $form = $node->{'form'} || $node->{'origf'}; my $lemma = $node->{'lemma'} || '_'; my $tag = $node->{'x_comment'} || $node->{'tag'} || '_'; my $cpostag = substr $tag, 0, 1; my $postag = substr $tag, 0, 2; my @feats = (); my $head = $node->parent()->{'ord'}; my $deprel = $node->{'afun'} ne '???' && $node->{'afun'} || '_'; my $gloss = scores($node->{'x_gloss'}); if ($form ne '') { warn "??\t" . ThisAddress($node) . "\n" if $form =~ /^$test[0]/; if ($form eq $test[1]) { $form = $reps[1]; if ($tag ne 'S----1-S2-') { $form = $reps[0]; warn "><\t" . ThisAddress($node) . "\n"; } } elsif ($form eq $test[2]) { $form = $reps[2]; } elsif ($form eq $test[3]) { $form = $reps[3]; } } else { warn "!!\t" . ThisAddress($node) . "\n"; } if ($lemma ne '_') { $lemma =~ s/\_\?$//; $lemma =~ s/\_EMPH\_PART$/\_EMPHATIC\_PART/; $lemma =~ s/\_RESULT\_CLAUSE\_PARTICLE$/\_EMPHATIC\_PART/; $lemma =~ s/\_SUBJUNC$/\_SUB/; $lemma =~ s/\_SUB\_CONJ$/\_1/; $lemma =~ s/\_REL_PRON$/\_1/; $lemma =~ s/^([^_]+)\_([A-Z][A-Z\_0-9\:]*)$/look($1, $2)/e; warn "??\t" . ThisAddress($node) . "\n" if $lemma =~ /^$test[0]/; } if ($lemma eq '') { $lemma = '?'; warn "!!\t" . ThisAddress($node) . "\n"; } if ($tag ne '_') { my @names = qw 'Mood Voice Person Gender Number Case Defin'; my @posit = split //, $tag; splice @posit, 4, 1; for (my $i = -@names; $i < 0; $i++) { push @feats, $names[$i] . '=' . $posit[$i] unless $posit[$i] eq '-'; } } if ($node->{'parallel'} =~ /^(?:Co|Ap)$/) { push @feats, 'MemberOf=' . $node->{'parallel'}; } if (Analytic::isClauseHead($node)) { push @feats, 'ClauseHead=' . ( $node->{'arabclause'} =~ /^P/ ? $node->{'arabclause'} : 'Pred' ); } if ($node->{'arabspec'} eq 'Ref') { push @feats, 'GramCoref=True'; } if ($node->{'origf'} ne '') { push @feats, 'InputForm=' . $node->{'origf'}; } if ($gloss ne '') { push @feats, 'TokenGloss=' . $gloss; } @feats = ('_') unless @feats; my $feats = join '|', @feats; print "" . ( join "\t", $id, $form, $lemma, $cpostag, $postag, $feats, $head, $deprel, '_', '_' ) . "\n"; } print "\n" if $root->children(); } while TredMacro::NextTree(); close F; ChangingFile(0); } END { select STDOUT; printf "%5d\t%s\n", $miss{$_}, $_ foreach sort { $miss{$a} <=> $miss{$b} } keys %miss; print "\n"; printf "%5d\t%s\n", $form{$_}, $_ foreach sort { $form{$a} <=> $form{$b} } keys %form; }