#!/usr/local/bin/perl # Converts phrase trees in Mike Collins' format into CSTS dependency trees. # Mike throws away lots of information for each word so of course only the # part that has survived is on output. while(<>) { # Assumption: a line contains exactly one tree. $tree = $_; # Convert words into CSTS but still retain bracktes. $tree =~ s|([^\s]+)/(>?[^\s]+)|$1$2|g; # Change the head marking to something more compatible with CSTS. $tree =~ s/>([^\s]+)/$1head/g; # Distinguish real words from punctuation. $tree =~ s/([^<]*)Z/$1Z/g; # Make the word order explicit by adding the attribute ord. @fragments = split(/\s+/, $tree); $ord = 1; for($i=0; $i<=$#fragments; $i++) { if($fragments[$i] =~ m/<[fd]>.*/) { $fragments[$i] =~ s/(.*)/$1$ord /; # For heads, switch the positions of and . $fragments[$i] =~ s/(.*)(head)(\d+)/$1$3$2/; $ord++; } } $tree = join(" ", @fragments) . "\n"; # Mark head of each phrase. (On the input: the tag is preceded by ">".) while($tree =~ m/$(.)P([^()]*)$/) { $phrtype = $1; $phrcont = $2; # Find the head of the phrase. $phrcont =~ m/(\d+)head/; $headord = $1; # Mark all other words within the phrase as the dependents of the head. $phrcont =~ s/(<[fd]>[^<]*[^<]*\d+)\s/$1$headord /g; # Mark all heads of nested phrases as dependents of the head. $phrcont =~ s/subphrase/$headord/g; # Rewrite head with subphrase so that this head does not compete # with the head of the superphrase. $phrcont =~ s/head/subphrase/; # Write the phrase back to the tree. $tree =~ s/$(.)P([^()]*)$/<$phrtype>$phrcont<\/$phrtype>/; } # Make the head of the outermost phrase depend on the root. $tree =~ s/subphrase/0/; # Convert "-LRB-" and "-RRB-" back to "(" and ")". $tree =~ s/-LRB-/$/g; $tree =~ s/-RRB-/$/g; # Split tree to words. @words = split(/\s+/, $tree); # Print the result except of the phrase markers. print "~~\n"; for($i=0; $i<=$#words; $i++) { if($words[$i] =~ m/<[fd]>.*/) { print $words[$i], "\n"; } } }~~