#!/usr/local/bin/perl

# Converts phrase trees in Mike Collins' format into CSTS dependency trees.
# Mike throws away lots of information for each word so of course only the
# part that has survived is on output.

while(<>)
{
    # Assumption: a line contains exactly one tree.
    $tree = $_;
    # Convert words into CSTS but still retain bracktes.
    $tree =~ s|([^\s]+)/(>?[^\s]+)|<f>$1<l><t>$2|g;
    # Change the head marking to something more compatible with CSTS.
    $tree =~ s/<t>>([^\s]+)/<t>$1<g>head/g;
    # Distinguish real words from punctuation.
    $tree =~ s/<f>([^<]*)<l><t>Z/<d>$1<l><t>Z/g;
    # Make the word order explicit by adding the attribute ord.
    @fragments = split(/\s+/, $tree);
    $ord = 1;
    for($i=0; $i<=$#fragments; $i++)
    {
	if($fragments[$i] =~ m/<[fd]>.*/)
	{
	    $fragments[$i] =~ s/(.*)/$1<r>$ord  /;
	    # For heads, switch the positions of <r> and <g>.
	    $fragments[$i] =~ s/(.*)(<g>head)(<r>\d+)/$1$3$2/;
	    $ord++;
	}
    }
    $tree = join(" ", @fragments) . "\n";
    # Mark head of each phrase. (On the input: the tag is preceded by ">".)
    while($tree =~ m/\((.)P([^()]*)\)/)
    {
	$phrtype = $1;
	$phrcont = $2;
	# Find the head of the phrase.
	$phrcont =~ m/<r>(\d+)<g>head/;
	$headord = $1;
	# Mark all other words within the phrase as the dependents of the head.
	$phrcont =~ s/(<[fd]>[^<]*<l><t>[^<]*<r>\d+)\s/$1<g>$headord /g;
	# Mark all heads of nested phrases as dependents of the head.
	$phrcont =~ s/<g>subphrase/<g>$headord/g;
	# Rewrite <g>head with <g>subphrase so that this head does not compete
	# with the head of the superphrase.
	$phrcont =~ s/<g>head/<g>subphrase/;
	# Write the phrase back to the tree.
	$tree =~ s/\((.)P([^()]*)\)/<$phrtype>$phrcont<\/$phrtype>/;
    }
    # Make the head of the outermost phrase depend on the root.
    $tree =~ s/<g>subphrase/<g>0/;
    # Convert "-LRB-" and "-RRB-" back to "(" and ")".
    $tree =~ s/-LRB-/\(/g;
    $tree =~ s/-RRB-/\)/g;
    # Split tree to words.
    @words = split(/\s+/, $tree);
    # Print the result except of the phrase markers.
    print "<s>\n";
    for($i=0; $i<=$#words; $i++)
    {
	if($words[$i] =~ m/<[fd]>.*/)
	{
	    print $words[$i], "\n";
	}
    }
}

