#!btred -e all() # -*- mode: cperl; coding: utf-8; -*- #encoding utf-8 =head1 old2pml.btred Convert data from legacy formats of PDT 1.0 to PML-formats of PDT 2.0 =head1 SYNOPSIS btred -I old2pml.btred [ btred-options ] [ -o script-options -- ] files ... or ntred [ ntred-options ] -i files ... ntred -I old2pml.btred [ ntred-options ] -- script-options =head1 OUTPUT The output files are named as the input files as follows: the input filename is first transformed based on the command-line options passed to C. If the resulting filename contains a C<.gz> suffix, it is stripped. Finally, according to the layer of annotation, one of the suffixes .a, .m, and .w is appended. =head1 DESCRIPTION This script converts data from PDT 1.0 format (csts,fs,...) to the PML-format. For analytically annotated data, the script creates adata, mdata and wdata instances. For data with morphological annotation only, it creates instances of mdata and wdata. =head2 BTRED AND NTRED OPTIONS The following btred/ntred flags can be used: =over 4 =item C<-e 'all()'> create all possible PML instances (default) =item C<-e adata> create only adata PML instance =item C<-e mdata> create only mdata PML instance =item C<-e wdata> create only wdata PML instance =item C<-a|-r|-s|-p> affect filenames of the output files =item C<-X govMD_mcc> (only btred) when converting from CSTS, build the a-tree structure based on the values of the CSTS tag =back =head2 SCRIPT OPTIONS The following SCRIPT OPTIONS can be used: =over 4 =item C<--gzip> create output files compressed by gzip =item C<--with-MM> add all as m-layer alternatives (morphological analysis) =item C<--with-MD> add all as m-layer alternatives (tagger output) =item C<--select-MD> name do not include into m-layer (human annotation); instead include and use this annotation as the one referred to from the a-layer =item C<--include-MD> regexp Selective inclusion of tagger output. Include to the m-layer if the value of the src attribute matches a given regular expression. =item C<--with-afun> attr Use values of the attribute C as analytical function. E.g., for CSTS files, C<--with-afun afunMD_xy> reads afun from . =item C<--desc-m|--desc-a> description =item C<--vers-m|--vers-a> version info Use given values as values for and in of a particular layer. If not specified, defaults to Manual annotation and no version_info. =item C<--desc-MD|--desc-MM> name1:description;name2:description;... =item C<--vers-MD|--desc-MM> name1:version info;name2:version_info;... Use given values for where nameX is name1, name2, ... and corresponds to names in the src attribute of CSTS or =back =head1 AUTHOR Petr Pajas Copyright 2005 Petr Pajas, All rights reserved. =cut package TredMacro; #ifndef NTRED #include #endif use IO; use XML::Writer; # these flags control whether MMl and MDl csts tags are included use vars qw($with_MM $with_MD $select_MD $include_MD $gzip %x %schema); use constant { LM => 'LM', AM => 'AM', PML_NS => "http://ufal.mff.cuni.cz/pdt/pml/" }; %schema = ( a => 'adata_schema.xml', m => 'mdata_schema.xml', w => 'wdata_schema.xml', ); $with_afun='afun'; $desc_m = 'Manual annotation'; $desc_a = 'Manual annotation'; use Getopt::Long; GetOptions("--gzip" => \$gzip, "with-MM" => \$with_MM, "with-MD" => \$with_MD, "include-MD=s" => \$include_MD, "with-afun=s" => \$with_afun, "select-MD=s" => \$select_MD, "desc-MM=s" => \$desc_MM, "vers-MM=s" => \$vers_MM, "desc-MD=s" => \$desc_MD, "vers-MD=s" => \$vers_MD, "desc-m=s" => \$desc_m, "vers-m=s" => \$vers_m, "x2m=s" => \@{$x{m}}, "x2a=s" => \@{$x{a}}, "x2w=s" => \@{$x{w}}, "desc-a=s" => \$desc_a, "vers-a=s" => \$vers_a, "schema-a=s" => \$schema{a}, "schema-m=s" => \$schema{m}, "schema-w=s" => \$schema{w}, ) || die "Command-line error: invalid script option!\nSee perldoc old2pml.btred for usage.\n"; for my $k (keys %x) { $x{$k} = { map { /(.*?)=(.*)/ ? ( $1 => $2 ) : ( $_ => $_ ) } map { split /,/ } @{$x{$k}} }; } if ($with_MD and $include_MD ne "") { die "Command-line error: --with-MD and --include-MD are exclusive\n"; } $include_MD='.*' if ($with_MD); sub uniq { my %a; @a{@_}=@_; values %a } my %e = qw( { { } } à à < < ] ] \ \ ˆ ^ [ [ _ _ $ $ > > @ @ | | & & ¯on; ˇ ` ` # # * * % % ); sub ent { my ($text)=@_; return $text unless $text=~/\&/; while (my($k,$v)=each %e) { $text=~s/\Q$k\E/$v/g; } return $text; } sub x_filename { my ($x,$full)=@_; my $fn = main::save_transform_filename(FileName()); $fn =~ s/\.gz$//g; $fn =~ s{^.*/}{}g unless $full; return $fn.".".$x.($gzip ? '.gz' : ''); } sub create_writer { my ($layer)=@_; undef $w; my $file = x_filename($layer,1); # my $output = IOBackend::open_backend($output,'w','utf-8'); # open(my $output,"> ".$file); # binmode($output,":encoding(utf-8)"); $w = new XML::Writer(OUTPUT => Treex::PML::IO::open_backend($file,'w','utf-8'), DATA_MODE => 1, DATA_INDENT => 1); print "Writing $file\n"; } sub close_writer { Treex::PML::IO::close_backend($w->getOutput); undef $w; } sub value2xml { my ($name, $value, $seq, @atrs)=@_; $w->startTag($name,@atrs); if ($value =~ /\|/ and $seq ne "") { foreach my $v (split /\|/,$value) { $w->startTag($seq); $w->characters($v); $w->endTag($seq); } } else { $w->characters($value); } $w->endTag($name); } sub atr2xml { my ($node, $atr, $xmlatr, $seq,@atrs) = @_; my $val = $node->{$atr}; if (FS()->isList($atr)) { $val = join '|', grep { not /^(---|-|NA|NIL|\?\?\?)$/ } split /\|/, $node->{$atr}; } unless ($val eq "") { $xmlatr = $atr if ($xmlatr eq ""); value2xml($xmlatr,$val,$seq,@atrs); } } sub _id { my ($id)=@_; $id=~y{:/}{-_}; $id; } sub node_id { my ($node)=@_; my $id; if ($node->{AID} ne "") { $id = $node->{AID}; } else { my $sent_id = $node->root->{ID1}; $sent_id =~ s{.*/}{}; if ($node->{ord} eq "") { $id = $node->{AID} = $sent_id."W".$node->{sentord}; } else { $id = $node->{AID} = $sent_id."W".$node->{ord}; } } $id=~y/:/-/; $id; } sub anode2xml { my ($node)=@_; if ($node->parent) { value2xml("m.rf",'m#m-'.node_id($node). ($select_MD eq "" ? "" : "-T".$select_MD)); if ($node->{$with_afun} ne "" and $node->{$with_afun} ne "-" and $node->{$with_afun} ne "---" and $node->{$with_afun} ne "???") { my ($afun,$suffix)=split /_/,$node->{$with_afun}; value2xml('afun',$afun); if ($suffix eq "Pa") { value2xml('is_parenthesis_root',1); } elsif ($suffix) { value2xml('is_member',1); } } } else { value2xml("s.rf",'m#m-'._id($node->{ID1})); } atr2xml($node,'ord','ord'); if (exists $x{a} and $node->parent) { foreach my $k (keys %{$x{a}}) { value2xml($x{a}{$k},$node->{'x_'.$k}); } } # node children if ($node->firstson) { $w->startTag("children"); foreach my $child ($node->children) { $w->startTag(LM, id => "a-".node_id($child)); anode2xml($child); $w->endTag(LM); } $w->endTag("children"); } } sub w_rf { my ($n,$last_w)=@_; my $change; my $next_w; # generate w.rf (this step reproduces operation of wnode2xml if ($n->{origf} =~ /\|/ or $n->{origfkind}=~/\|/) { my @origf = split(/\|/,$n->{origf}); my @kind = split(/\|/,$n->{origfkind}); my $suffix = 'x1'; my $non_del = 0; my $LM=0; $w->startTag('w.rf'); foreach my $w (ListRegroupElements(\@origf,\@kind)) { $non_del=1 if ($w->[1] ne 'del'); unless ($w->[1] eq 'del') { value2xml(LM,'w#w-'.node_id($n).$suffix); $LM=1; } $suffix++; } if ($non_del) { $change = 'spell'; } else { if ($LM) { # one more token generated from 'form' itself value2xml(LM,'w#w-'.node_id($n)); } else { $w->characters('w#w-'.node_id($n)); } } $w->endTag('w.rf'); } elsif ($n->{origfkind} eq 'del') { # AID-x1: deleted token (there will be no reference to it from m-layer) # AID: this is what we will refer to on the m-layer value2xml('w.rf','w#w-'.node_id($n)); } elsif ($n->{origfkind} eq 'ins') { # this token is completely generated on m-layer, won't create anything # no reference $change = 'insert'; } elsif ($n->{origfkind} eq 'spell') { if ($n->{origf} ne '' and $n->{form} ne $n->{origf} and $n->{formtype}=~/^gen|\.gen/) { value2xml('w.rf','w#w-'.node_id($n)); $change = 'spell'; $next_w = ['spell',$n->{AID}]; } else { warn "ERROR1: node $n->{AID} has formtype=$n->{formtype}, form=$n->{form}, origf=$n->{origf}, and origf=spell\n"; } } elsif ($n->{origf} eq '') { if ($n->{formtype}=~/^gen|\.gen/ and $last_w and $last_w->[1] ne '') { value2xml('w.rf','w#w-'.$last_w->[1]); $change = $last_w->[0]; $next_w = $last_w if $change eq 'spell'; } else { if ($last_w) { warn "ERROR6: node $n->{AID} has formtype=$n->{formtype} but empty origf\n"; } else { warn "ERROR2: node $n->{AID} has formtype=$n->{formtype}, empty origf, and we don't know ID of the previous form\n"; } } } elsif ($n->{origfkind} eq 'ctcd') { if ($n->{origf} ne '') { value2xml('w.rf','w#w-'.node_id($n)); $change = 'ctcd'; $next_w = ['ctcd',node_id($n)]; } else { warn "ERROR3: node $n->{AID} has origfkind=ctcd and empty origf\n"; } } elsif ($n->{origfkind} eq 'num.orig') { value2xml('w.rf','w#w-'.node_id($n)); if ($n->{formtype}=~/gen/) { $change = 'num_normalization'; } } elsif ($n->{origfkind} eq '' or $n->{origfkind} eq 'same') { if ($n->{formtype}=~/^gen|\.gen/ and $last_w) { warn "ERROR4: node $n->{AID} has formtype=$n->{formtype} and non-empty origf (and is preceded by @$last_w)\n"; } else { value2xml('w.rf','w#w-'.node_id($n)); } } else { warn "ERROR5: unexpected origfkind=$n->{origfkind} (form=$n->{form}, origf=$n->{origf}, formtype=$n->{formtype}, origfkind=$n->{origfkind})\n"; } return ($change,$next_w); } sub mnode2xml { my ($node,$last_w) = @_; # MR attributes my ($change,$next_w); my @anots; if ($select_MD eq "") { if ($node->{tag} ne "" or $node->{lemma} ne "") { push @anots,["manual", "m-".node_id($node), $node->{tag}, $node->{lemma}, $node->{wt}, undef]; } } if ($include_MD ne "" or $select_MD ne "") { foreach my $atr (grep { /^lemmaMD_(\Q$select_MD\E|$include_MD)$/ } Attributes()) { my ($suffix) = ($atr =~ /_(.*)$/); if ($node->{'tagMD_'.$suffix} ne "" or $node->{'lemmaMD_'.$suffix} ne "") { push @anots, ["tagger-".$suffix, "m-".node_id($node)."-T$suffix", $node->{'tagMD_'.$suffix}, $node->{'lemmaMD_'.$suffix}, $node->{'wMDt_'.$suffix}, $node->{'wMDl_'.$suffix}]; } } } if ($with_MM) { foreach my $atr (grep { /^tagMM_/ } Attributes()) { my ($suffix) = ($atr =~ /_(.*)$/); my @t = split /\|/,$node->{'tagMM_'.$suffix}; my @l = split /\|/,$node->{'lemmaMM_'.$suffix}; my $i=1; while (@t) { push @anots, ["analysis-".$suffix,"m-".node_id($node)."-A$suffix:$i", shift @t, shift @l, undef, undef]; $i++; } } } $w->startTag('m') if (@anots>1); foreach my $anot (@anots) { $w->startTag(@anots>1 ? AM : 'm', id => $anot->[1]); value2xml('src.rf',$anot->[0]); ($change,$next_w) = w_rf($node,$last_w); value2xml('form',ent($node->{form})); value2xml('form_change',$change) if ($change); $w->startTag('lemma',($anot->[5] ne "" ? (weight => $anot->[5]) :())); $w->characters(ent($anot->[3])); $w->endTag('lemma'); $w->startTag('tag',($anot->[4] ne "" ? (weight => $anot->[4]) :())); $w->characters($anot->[2]); $w->endTag('tag'); if (exists $x{m}) { foreach my $k (keys %{$x{m}}) { value2xml($x{m}{$k},$node->{'x_'.$k}); } } $w->endTag(@anots>1 ? AM : 'm'); } $w->endTag('m') if (@anots>1); return $next_w; } sub w_element { my ($node,$token,$id,$nospace) = @_; # w-attributes $w->startTag('w', id => 'w-'._id($id)); value2xml('token',ent($token)); value2xml('no_space_after',$nospace) if $nospace == 1; if ($node->{pronunciation} ne '') { foreach (split /\|/,$node->{pronunciation}) { $w->startTag("othermarkup",origin => 'P'); $w->characters($_); $w->endTag("othermarkup"); } } if (exists $x{w}) { foreach my $k (keys %{$x{w}}) { value2xml($x{w}{$k},$node->{'x_'.$k}); } } $w->endTag('w'); } sub wnode2xml { my ($n,$next) = @_; # we ignore before generated tokens since # such a was added and doesn't belong to the w-layer # and should in theory be on the m-layer $nospace = $next->{origfkind} eq 'ins' ? 0 : $n->{nospace}; if ($n->{origf} =~ /\|/ or $n->{origfkind}=~/\|/) { my @origf = split(/\|/,$n->{origf}); my @kind = split(/\|/,$n->{origfkind}); my $suffix = 'x1'; my $non_del = 0; foreach my $word (ListRegroupElements(\@origf,\@kind)) { $non_del=1 if ($word->[1] ne 'del'); w_element($n,$word->[0],$n->{AID}.$suffix,0); $suffix++; } unless ($non_del) { # one more token generated from 'form' itself w_element($n,$n->{form},$n->{AID},$nospace); } } elsif ($n->{origfkind} eq 'del') { # deleted token (there will be no reference to it from m-layer) w_element($n,$n->{origf},$n->{AID}.'x1',$nospace); # this is what we will refer to on the m-layer w_element($n,$n->{form},$n->{AID},$nospace); } elsif ($n->{origfkind} eq 'ins') { # this token is completely generated on m-layer, won't create anything } elsif ($n->{origf} ne '') { w_element($n,$n->{origf},$n->{AID},$nospace); } if ($node->{gappost} ne "") { $w->startTag("othermarkup",origin => 'csts/doc/s'); if ($node->{gappost}=~/cdata($node->{gappost}); } else { $w->characters($node->{gappost}); } $w->endTag("othermarkup"); } } sub _write_m_annot_info { my ($tag, $id, $desc, $vers) = @_; $w->startTag($tag, id => $id); if ($desc ne "") { $w->startTag("desc"); $w->characters($desc); $w->endTag("desc"); } if ($vers ne "") { $w->startTag("version_info"); $w->characters($vers); $w->endTag("version_info"); } $w->endTag($tag); } sub _get_m_annot_info { my ($prefix,$desc,$vers,@keys)=@_; my (%desc,%vers); foreach my $part (split /\s*;\s*/, $desc) { my ($name,$val) = split /\s*:\s*/, $part, 2; $desc{$name}=$val; } foreach my $part (split /;\s*/, $vers) { my ($name,$val) = split /\s*:\s*/, $part, 2; $vers{$name}=$val; } return map { [$prefix.$_, $desc{$_}, $vers{$_}] } @keys; } sub write_m_annotation_info { my @annot_info; if ($select_MD eq "") { push @annot_info, ["manual",$desc_m,$vers_m]; } if ($include_MD ne "" or $select_MD ne "") { my %md; foreach my $atr (grep { /^tagMD_/ } Attributes()) { if ($atr =~ /^tagMD_(\Q$select_MD\E|$include_MD)$/) { $md{$1}=1; } } push @annot_info, _get_m_annot_info('tagger-',$desc_MD,$vers_MD, sort keys %md); } if ($with_MM) { my %mm; foreach my $atr (grep { /^tagMM_/ } Attributes()) { $mm{$1}=1 if ($atr =~ /^tagMM_(.*)$/); } push @annot_info, _get_m_annot_info('analysis-',$desc_MM,$vers_MM, sort keys %mm); } if (@annot_info == 1) { _write_m_annot_info("annotation_info",@{$annot_info[0]}); } elsif (@annot_info > 1) { $w->startTag("annotation_info"); for (@annot_info) { _write_m_annot_info("LM",@$_); } $w->endTag("annotation_info"); } } sub write_mdata { $w->xmlDecl("utf-8"); $w->startTag("mdata", xmlns => PML_NS); $w->startTag("head"); $w->emptyTag("schema", href => $schema{m}); $w->startTag("references"); $w->emptyTag("reffile", id => 'w', name=> "wdata", href => x_filename("w")); $w->endTag("references"); $w->endTag("head"); $w->startTag("meta"); value2xml('lang',$root->{cstslang} || 'cs'); write_m_annotation_info(); $w->endTag("meta"); my $first = 1; my $last_w; do {{ $w->startTag("s", id => 'm-'._id($root->{ID1})); foreach my $node (sort { $a->{sentord} <=> $b->{sentord} } grep { $_->{TID} eq "" } $root->descendants) { node_id($node); $last_w = mnode2xml($node,$last_w); } $w->endTag("s"); } } while (NextTree()); $w->endTag("mdata"); $w->end(); } sub write_wdata { $w->xmlDecl("utf-8"); $w->startTag("wdata", xmlns => PML_NS); $w->startTag("head"); $w->emptyTag("schema", href => $schema{w}); $w->endTag("head"); $w->startTag("meta"); if ($root->{cstslang}) { value2xml('lang',$root->{cstslang} || 'cs'); } value2xml('original_format',"csts"); if ($root->{cstsmarkup}) { $w->startTag("othermeta",origin => 'csts/h/markup'); my $markup = $root->{cstsmarkup}; $markup=~s/cdata($markup."\n"); $w->endTag("othermeta"); } if ($root->{cstssource}) { $w->startTag("othermeta",origin => 'csts/h/source'); my $markup = $root->{cstssource}; $markup=~s/cdata($markup."\n"); $w->endTag("othermeta"); } $w->endTag("meta"); my $first = 1; do {{ if ($root->{doc} ne "") { unless ($first) { $w->endTag("para"); $w->endTag("doc"); } my $orig_id = $root->{doc}."-".$root->{docid}; my $id = $root->{doc}; $id =~ s{.*/}{}; $id.='-'.$root->{docid}; $w->startTag("doc", id => 'w-'._id($id), source_id => $orig_id); } elsif ($first) { $w->startTag("doc", continues => 1); } if ($root->{doc} ne "" or $first) { $w->startTag("docmeta"); if ($root->{docmarkup}) { $w->startTag("othermeta",origin => 'csts/doc/a'); my $markup = $root->{docmarkup}; $markup=~s/cdata($markup."\n"); $w->endTag("othermeta"); } if ($root->{docprolog}) { $w->startTag("othermeta",origin => 'csts/doc/a'); my $markup = $root->{docprolog}; $markup=~s/cdata($markup."\n"); $w->endTag("othermeta"); } $w->endTag("docmeta"); } if ($root->{para} ne "") { $w->endTag('para') unless ($first or $root->{doc}); $w->startTag("para"); $w->startTag("othermarkup",origin => 'csts/doc/p/@n'); $w->characters($root->{para}); $w->endTag("othermarkup"); } $first = 0; use Data::Dumper; my @wnodes = sort { $a->{sentord} <=> $b->{sentord} } grep { $_->{TID} eq "" } $root->descendants; for (my $i=0; $i<@wnodes; $i++) { node_id($wnodes[$i]); wnode2xml($wnodes[$i],$wnodes[$i+1]); } }} while (NextTree()); unless ($first) { $w->endTag("para"); # $w->endTag("chap"); $w->endTag("doc"); } $w->endTag("wdata"); $w->end(); } sub write_adata { $w->xmlDecl("utf-8"); $w->startTag("adata", xmlns => PML_NS); $w->startTag("head"); $w->emptyTag("schema", href => $schema{a}); $w->startTag("references"); $w->emptyTag("reffile", id => 'm', name=> "mdata", href => x_filename("m")); $w->emptyTag("reffile", id => 'w', name=> "wdata", href => x_filename("w")); $w->endTag("references"); $w->endTag("head"); if ($desc_a.$vers_a ne "") { $w->startTag("meta"); $w->startTag("annotation_info"); if ($desc_a ne "") { $w->startTag("desc"); $w->characters($desc_a); $w->endTag("desc"); } if ($vers_a ne "") { $w->startTag("version_info"); $w->characters($vers_a); $w->endTag("version_info"); } $w->endTag("annotation_info"); $w->endTag("meta"); } $w->startTag("trees"); my $doc=0; do {{ my $node = $root; $w->startTag(LM, id => "a-"._id($root->{ID1})); anode2xml($root); $w->endTag(LM); } } while (NextTree()); $w->endTag("trees"); $w->endTag("adata"); $w->end(); } sub adata { GotoTree(0); return unless $root; create_writer('a'); write_adata(); close_writer(); } sub mdata { GotoTree(0); return unless $root; create_writer('m'); write_mdata(); close_writer(); } sub wdata { GotoTree(0); return unless $root; create_writer('w'); write_wdata(); close_writer(); } sub all { if ($root) { writeln("Converting ".FileName()); adata() if first { $_->firstson and $_->firstson->{ord} ne "" } GetTrees(); mdata(); wdata(); } else { writeln("No trees - skipping ".FileName()); } }