#!/usr/bin/perl
# $Id: pml_simplify,v 1.5 2006/05/05 13:55:54 pajas Exp $
# pml_simplify                   pajas@ufal.ms.mff.cuni.cz

use strict;

$|=1;   # flush on write

use XML::LibXML;
use XML::LibXML::Common qw(:w3c :encoding);

use File::Spec;
use Pod::Usage;

use constant {
  PML_SCHEMA_NS => "http://ufal.mff.cuni.cz/pdt/pml/schema/",
  EMPTY => q(),
};

my $old_args = "@ARGV";
use Getopt::Long;
Getopt::Long::Configure ("bundling");
my %opts;
GetOptions(\%opts,
	'path|p=s@',
	'no-format|f',
	'no-comments|c',
	'no-import|i',
	'no-derive|d',
	'debug|D',
	'help|h',
	'usage|u',
	'man',
       ) or $opts{usage}=1;

my $DEBUG = $opts{debug};
sub _DEBUG { print STDERR @_,"\n" if $DEBUG }


if ($opts{usage}) {
  pod2usage(-msg => 'pml_simplify');
}
if ($opts{help}) {
  pod2usage(-exitstatus => 0, -verbose => 1);
}
if ($opts{man}) {
  pod2usage(-exitstatus => 0, -verbose => 2);
}

my $pml_schema = shift || '-';
my $output = shift || '-';

my $PARSER = XML::LibXML->new();
$PARSER->keep_blanks($opts{'no-format'} ? 1 : 0);
$PARSER->line_numbers(1);
$PARSER->clean_namespaces(1);

my $dom = read_schema($pml_schema,{});

unless ($opts{'no-comments'}) {
  my $from = $pml_schema eq '-' ? "<STDIN>" : $pml_schema;
  my $comment1 = $dom->createComment("\n  Created by pml_simplify on ".localtime().
				     "\n  Command-line: pml_simlify $old_args\n");
  $dom->insertBefore($comment1,$dom->documentElement);
}
# reparse (for formatting and namespace cleanup) and dump
unless ($opts{'no-format'}) {
  _DEBUG("Reformatting result");
  $dom = $PARSER->parse_string( $dom->toString(1) );
}
_DEBUG("Saving to $output");
$dom->toFile($output, $opts{'no-format'} ? 0 : 1);
exit 0;

############################################################

# return 1 if a given argument looks like an URL (something://)
sub _is_url {
  return ($_[0] =~ m(^\s*[[:alnum:]]+://)) ? 1 : 0;
}

# distinguish relative filename from absolute paths and URLs
sub _is_absolute {
  my ($path) = @_;
  return (_is_url($path) or File::Spec->file_name_is_absolute($path));
}

# lookup a file in a given list of paths
sub FindIn {
  my ($filename,$paths)=@_;
  if (ref $paths and !_is_absolute($filename)) {
    for my $dir (@$paths) {
      my $f = File::Spec->catfile($dir,$filename);
      return $f if -f $f;
    }
  }
  return $filename;
}

# resolve relative file-names and URLs
sub ResolvePath {
  my ($orig, $href,$paths)=@_;
  return $href if _is_absolute($href);
  if (_is_url($orig)) {
    my $res = FindIn($href,$paths);
    if ($res ne $href) {
      return $res;
    } else {
      $orig =~ s{/[^/]*$}{};
      return $orig.'/'.$href;
    }
  } else {
    my ($vol,$dir) = File::Spec->splitpath(File::Spec->rel2abs($orig));
    my $rel = File::Spec->rel2abs($href,File::Spec->catfile($vol,$dir));
    if ( -f $rel ) {
      return $rel;
    } else {
      return FindIn($href,$paths);
    }
  }
}

# check if schema revision matches <import> revision flags
sub check_revision {
  my ($schema,$param)=@_;
    
  my $error = $param->{revision_error} || 'Error: wrong schema revision of %f: %e';
  my ($revision) = $schema->documentElement->getChildrenByTagNameNS(PML_SCHEMA_NS,'revision');
  my $revision_no = $revision ? $revision->textContent : 0;
  
  if (defined($param->{revision}) and  $param->{revision} ne EMPTY and
	_cmp_revisions($revision_no,$param->{revision})!=0) {
    die $error."required $param->{revision}, got $revision_no";
  } else {
    if (defined($param->{minimal_revision}) and $param->{minimal_revision} ne EMPTY and
	  _cmp_revisions($revision_no,$param->{minimal_revision})<0) {
      die $error."required at least $param->{minimal_revision}, got $revision_no";
    }
    if (defined($param->{maximal_revision}) and $param->{maximal_revision} ne EMPTY and
	  _cmp_revisions($revision_no,$param->{maximal_revision})>0) {
      die $error."required at most $param->{maximal_revision}, got $revision_no";
    }
  }
}

# compare schema revision numbers
sub _cmp_revisions {
  my ($revision1,$revision2)=@_;
  
  my @revision1 = split(/\./,$revision1);
  my @revision2 = split(/\./,$revision2);
  my $cmp=0;
  while ($cmp==0 and (@revision1 or @revision2)) {
    $cmp = (shift(@revision1) <=> shift(@revision2));
  }
  return $cmp;
}

# hash subelements by a given attribute or by content
sub _hash_by {
  my ($pml,$what,$by)=@_;
  my %hash;
  foreach my $type ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,$what)) {
    my $name = defined($by) ? $type->getAttribute($by) : $type->textContent;
    $hash{$name} = $type;
  }
  return \%hash;
}

# traverse a type declaration and collect types referred to
# type="type-name" declarations
sub _get_referred_types {
  my ($type,$named_types,$referred_types) = @_;
  if (ref($type)) {
    foreach my $ref ($type->findnodes(q(./descendant-or-self::*
					  [ contains(" list alt structure sequence element attribute member ",
						     concat(" ",local-name()," "))
					   ]/@type))) {
      my $ref_type = $ref->getValue;
      if ($ref_type ne EMPTY and !exists( $referred_types->{$ref} )) {
	if ( $named_types->{ $ref_type } ) {
	  $referred_types->{ $ref_type } = $named_types->{ $ref_type };
	  _get_referred_types( $ref_type, $named_types, $referred_types );
	} else {
	  warn "Cannot find declaration for type '$ref_type' in ".$type->ownerDocument->URI;
	}
      }
    }
  }
}

# copy each given named typee into $target_pml
# after $import_element unless a type with the
# same name already exists in the $existing hash
sub import_types {
  my ($target_pml,$import_element, $types, $existing) = @_;
  my $target = $target_pml->ownerDocument;
  my $after_element;

  my $comment;
  if (!$opts{'no-comments'}) {
    my $schema = $import_element->hasAttribute('schema') ? $import_element->getAttributeNode("schema")->toString : EMPTY;
    my $type   = $import_element->hasAttribute('type') ? $import_element->getAttributeNode("type")->toString : EMPTY;
    $comment = $target->createComment("  ".("=" x 12)." import$schema $type".("=" x 12)."  ");
    $import_element->replaceNode($comment);
    $after_element = $comment;
  } else {
    $after_element = $import_element;
  }

  foreach my $ref_type (@$types) {
    my $name = $ref_type->getAttribute('name');
    if (exists $existing->{ $name }) {
      _DEBUG("skipping type $name - already defined...");
      next;
    } 
    _DEBUG("copying type $name...");

    my $copy = $target->importNode($ref_type);
    $existing->{ $name } = $copy;

    $target_pml->insertAfter($copy, $after_element);
    $after_element = $copy;
  }
  if (!$opts{'no-comments'}) {
    $comment = $target->createComment("  ".("=" x (length($comment->nodeValue)-4))."  ");
    $target_pml->insertAfter($comment, $after_element);
  } else {
    $import_element->unbindNode;
  }
}

# process derive instructions
# $named_type is a hash of type declarations indexed by name
sub derive_types {
  my ($pml,$named_type) = @_;
  my $URL = $pml->ownerDocument->URI;
  _DEBUG("Deriving types in $URL ...");
  foreach my $derive ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'derive')) {
    my $name = $derive->getAttribute('name');
    my $source = $derive->getAttribute('type');
    my $type;
    if (!defined $source or $source eq EMPTY) {
      die "<derive> instruction must specify source type in the attribute 'type' in $URL, line ".$derive->line_number."\n";
    }
    if (!exists $named_type->{ $source }) {
      die "Can't find source type '$source' for <derive> in $URL, line ".$derive->line_number."\n";
    }
    if (defined $name and $name ne EMPTY) {
      if (exists ($named_type->{ $name })) {
	die "Refusing to derive existing type '$name' from '$source' in $URL, line ".$derive->line_number."\n";
      }
      $type = $named_type->{ $name } = $named_type->{ $source }->cloneNode(1);
      $pml->appendChild( $type );
      $type->setAttribute('name', $name);
    } else {
      $name = $source;
      $type = $named_type->{ $source };
    }
    _DEBUG("applying derive instructions to type $source, target type name $name...");
    unless ($opts{'no-comments'}) {
      $pml->insertBefore($pml->ownerDocument->createComment("  ".("=" x 12)." derived from $source ".("=" x 12)."  "), $type);
    }
    # deriving possible for structures, sequences and choices
    my ($inst) = $derive->findnodes(q(*[ contains(" structure sequence container choice ",
						  concat(" ",local-name()," "))
					]));
    my ($target) = $type->findnodes(q(*[ contains(" structure sequence container choice ",
						  concat(" ",local-name()," "))
					]));
    my %type_map = (
      structure => ['member', 'name'],
      sequence => ['element', 'name'],
      container => ['attribute', 'name'],
      choice => ['value', undef ], # index by content
     );
    
    if (!$inst) {
      die("<derive> must contain a <structure>, <sequence>, <container>, or <choice> in $URL, line ".$derive->line_number."\n");
    } elsif (!$target or $target->localname ne $inst->localname) {
      die("<derive> and its source '$source' differ in type in $URL, line ".$derive->line_number."\n");
    } else {
      my $tag = $inst->localname;
      my ( $item_tag, $index_by ) = @{ $type_map{ $tag } };
      _DEBUG("will derive a $tag ($item_tag)");

      foreach my $attr (grep { $_->nodeType == ATTRIBUTE_NODE } $inst->attributes) {
	if ($attr->getValue eq EMPTY) {
	  _DEBUG("removing attribute".$attr->toString. " from <$tag>");
	  $target->removeAttribute($attr->nodeName) if $target->hasAttribute($attr->nodeName);
	} else {
	  _DEBUG("copying attribute".$attr->toString. " to <$tag>");
	  $target->setAttribute($attr->nodeName,$attr->getValue);
	}
      }
      my $items = _hash_by( $target, $item_tag, $index_by );
      my ($before) = $target->findnodes('*[@role="#CHILDNODES"]');
      foreach my $item ($inst->getChildrenByTagNameNS(PML_SCHEMA_NS,$item_tag)) {
	my $iname = defined($index_by) ? $item->getAttribute($index_by) : $item->textContent;
	$item->unbindNode;
	if (exists( $items->{$iname} )) {
	  _DEBUG("replacing $item_tag name=$iname");
	  $items->{$iname}->replaceNode( $item );
	} else {
	  _DEBUG("adding $item_tag '$iname'");
	  # <attribute> should precede content declaration in a <container>
	  if ($item_tag eq 'attribute' and $target->firstChild) {
	    $target->insertBefore($item,$target->firstChild);
	  } elsif (ref $before) {
	    $target->insertBefore($item,$before);
	  } else {
	    $target->appendChild($item);
	  }
	}
      }
      foreach my $delete ($inst->getChildrenByTagNameNS(PML_SCHEMA_NS,'delete')) {
	my $dname = $delete->textContent;
	if (exists $items->{ $dname }) {
	  _DEBUG("deleting $item_tag '$dname'");
	  $items->{ $dname }->unbindNode;
	} else {
	  warn("No such $item_tag '$dname' in <$tag> '$source' in $URL, line ".$delete->line_number."\n");
	}
      }
    }
    $derive->unbindNode;
  }
  _DEBUG("Deriving finished.");
}

# move <root> declaration before any <type>
sub fix_root {
  my ($pml) = @_;
  my ($root) = $pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root');
  if (ref($root) and 
      (my @before = $root->findnodes('preceding-sibling::*[local-name()="type"]'))) {
    $root->unbindNode;
    $pml->insertBefore($root,$before[0]);
  }

}

{
  my %cached;
  my %processing;

  # 1. resolve path $in relative to base_url
  # 2. try to locate an already processed schema in cache,
  # 3. otherwise, read schema from file and process it
  sub read_schema {
    my ($in, $param)=@_;
    my $base_url = $param->{base_url} || EMPTY;
    
    _DEBUG("read_schema: $in (base url: $base_url)");
    if ($base_url ne EMPTY) {
      $in = ResolvePath($base_url,$in,$opts{path});
    } elsif ( ! -f $in ) {
      $in = FindIn($in,$opts{path});
    }
    _DEBUG("resolved to: $in");
    
    if ($processing{$in}) {
      die "Fatal error: Cyclic imports detected when processing <import> instructions in schema $in\n";
    }
    
    my $dom;
    unless (exists($cached{ $in })) {
      _DEBUG("parsing...");
      $dom = $cached{ $in } = $PARSER->parse_file($in);
      _DEBUG("done.");
      $processing{ $in } = 1;
      process_schema( $dom, $param );
      delete $processing{ $in };
    } else {
      _DEBUG("cached (no need to parse and process).");
      $dom = $cached{ $in };
    }
    return $dom;
  }
}

# process all <import> and <derive> instructions
sub process_schema {
  my ($dom, $param)=@_;

  my $URL = $dom->URI;
  my $pml = $dom->documentElement;
  my $named_type = _hash_by($pml,'type', 'name');

  _DEBUG("Processing $URL");
  unless ($opts{'no-import'}) {
    _DEBUG("Processing imports in $URL...");
    foreach my $import ( $pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'import') ) {
      my $ref = $import->getAttribute('type');
      my $schema_url = $import->getAttribute('schema');
      if ($ref) {
	_DEBUG("Import type $ref from $schema_url");
      } else {
	_DEBUG("Import all types from $schema_url");
      }

      my $ref_schema = read_schema( $schema_url, 
				    { %$param,
				      base_url => $URL
				     });
      check_revision($ref_schema,{
	revision         => $import->getAttribute('revision'),
	minimal_revision => $import->getAttribute('minimal_revision'),
	maximal_revision => $import->getAttribute('maximal_revision'),
	revision_error => "Error importing from schema $schema_url to $URL - revision mismatch: ",
      });
      my @types_to_import=();
      if ( defined $ref and $ref ne EMPTY ) {
	my $schema_types = _hash_by( $ref_schema->documentElement, 'type', 'name' );
	my $ref_type = $schema_types->{ $ref };
	if ($ref_type) {
	  my %referred_type = ($ref => $ref_type);
	  _get_referred_types($ref_type, $schema_types, \%referred_type);
	  @types_to_import = values %referred_type;
	} else {
	  warn "Cannot import type '$ref' from ".$ref_schema->URI." to $URL: type declaration not found.";
	}
      } else {
	my $ref_pml = $ref_schema->documentElement;
	@types_to_import = $ref_pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'type');
	unless ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root')) {
	  _DEBUG("will try to import <root> to $URL");
	  unshift @types_to_import, $ref_pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root');
	}
      }
      _DEBUG("ready to copy types to $URL");
      import_types( $pml, $import, \@types_to_import, $named_type );
    }
    fix_root( $pml );
    _DEBUG("Importing to $URL finished.");
  }

  derive_types($pml, $named_type) unless $opts{'no-derive'};
}

############################################################

__END__

=head1 pml_simplify

pml_simplify - convert a modular PML schema to a simplified PML schema

=head1 SYNOPSIS


pml_simplify [options] < input_schema.xml > output_schema.xml   

pml_simplify [options] input_schema.xml output_schema.xml   

Get help:

  pml_simplify  -u|--usage          for usage (synopsis)
  pml_simplify  -h|--help           for help
  pml_simplify  --man               for the manual page

=head1 DESCRIPTION

Read a (modular) PML schema file and all its dependencies and output a
semantically equivalent simplified PML schema which does not contain
C<E<lt>includeE<gt>> or C<E<lt>deriveE<gt>> instructions.

=head1 OPTIONS

=over 4

=item B<--path|-p> dir

Try to locate missing schemas and schemas with network URLs in a given
directory. This option may be repeated.

=item B<--no-derive|-d>

Do not process C<E<lt>deriveE<gt>> instructions.

=item B<--no-import|-i>

Do not process C<E<lt>importE<gt>> instructions.

=item B<--no-comments|-c>

Do not insert informative comment nodes.

=item B<--no-format|-f>

Do not reformat the output (the result may look ugly and some elements
may contain redundant namespace declarations).

=item B<--debug|-D>

Print lots of debugging messages on the standard error output.

=head1 AUTHOR

Petr Pajas, E<lt>pajas@ufal.ms.mff.cuni.czE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Petr Pajas

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.

=head1 BUGS

None reported... yet.

=cut