#!/usr/bin/perl # $Id: pml_simplify,v 1.5 2006/05/05 13:55:54 pajas Exp $ # pml_simplify pajas@ufal.ms.mff.cuni.cz use strict; $|=1; # flush on write use XML::LibXML; use XML::LibXML::Common qw(:w3c :encoding); use File::Spec; use Pod::Usage; use constant { PML_SCHEMA_NS => "http://ufal.mff.cuni.cz/pdt/pml/schema/", EMPTY => q(), }; my $old_args = "@ARGV"; use Getopt::Long; Getopt::Long::Configure ("bundling"); my %opts; GetOptions(\%opts, 'path|p=s@', 'no-format|f', 'no-comments|c', 'no-import|i', 'no-derive|d', 'debug|D', 'help|h', 'usage|u', 'man', ) or $opts{usage}=1; my $DEBUG = $opts{debug}; sub _DEBUG { print STDERR @_,"\n" if $DEBUG } if ($opts{usage}) { pod2usage(-msg => 'pml_simplify'); } if ($opts{help}) { pod2usage(-exitstatus => 0, -verbose => 1); } if ($opts{man}) { pod2usage(-exitstatus => 0, -verbose => 2); } my $pml_schema = shift || '-'; my $output = shift || '-'; my $PARSER = XML::LibXML->new(); $PARSER->keep_blanks($opts{'no-format'} ? 1 : 0); $PARSER->line_numbers(1); $PARSER->clean_namespaces(1); my $dom = read_schema($pml_schema,{}); unless ($opts{'no-comments'}) { my $from = $pml_schema eq '-' ? "" : $pml_schema; my $comment1 = $dom->createComment("\n Created by pml_simplify on ".localtime(). "\n Command-line: pml_simlify $old_args\n"); $dom->insertBefore($comment1,$dom->documentElement); } # reparse (for formatting and namespace cleanup) and dump unless ($opts{'no-format'}) { _DEBUG("Reformatting result"); $dom = $PARSER->parse_string( $dom->toString(1) ); } _DEBUG("Saving to $output"); $dom->toFile($output, $opts{'no-format'} ? 0 : 1); exit 0; ############################################################ # return 1 if a given argument looks like an URL (something://) sub _is_url { return ($_[0] =~ m(^\s*[[:alnum:]]+://)) ? 1 : 0; } # distinguish relative filename from absolute paths and URLs sub _is_absolute { my ($path) = @_; return (_is_url($path) or File::Spec->file_name_is_absolute($path)); } # lookup a file in a given list of paths sub FindIn { my ($filename,$paths)=@_; if (ref $paths and !_is_absolute($filename)) { for my $dir (@$paths) { my $f = File::Spec->catfile($dir,$filename); return $f if -f $f; } } return $filename; } # resolve relative file-names and URLs sub ResolvePath { my ($orig, $href,$paths)=@_; return $href if _is_absolute($href); if (_is_url($orig)) { my $res = FindIn($href,$paths); if ($res ne $href) { return $res; } else { $orig =~ s{/[^/]*$}{}; return $orig.'/'.$href; } } else { my ($vol,$dir) = File::Spec->splitpath(File::Spec->rel2abs($orig)); my $rel = File::Spec->rel2abs($href,File::Spec->catfile($vol,$dir)); if ( -f $rel ) { return $rel; } else { return FindIn($href,$paths); } } } # check if schema revision matches revision flags sub check_revision { my ($schema,$param)=@_; my $error = $param->{revision_error} || 'Error: wrong schema revision of %f: %e'; my ($revision) = $schema->documentElement->getChildrenByTagNameNS(PML_SCHEMA_NS,'revision'); my $revision_no = $revision ? $revision->textContent : 0; if (defined($param->{revision}) and $param->{revision} ne EMPTY and _cmp_revisions($revision_no,$param->{revision})!=0) { die $error."required $param->{revision}, got $revision_no"; } else { if (defined($param->{minimal_revision}) and $param->{minimal_revision} ne EMPTY and _cmp_revisions($revision_no,$param->{minimal_revision})<0) { die $error."required at least $param->{minimal_revision}, got $revision_no"; } if (defined($param->{maximal_revision}) and $param->{maximal_revision} ne EMPTY and _cmp_revisions($revision_no,$param->{maximal_revision})>0) { die $error."required at most $param->{maximal_revision}, got $revision_no"; } } } # compare schema revision numbers sub _cmp_revisions { my ($revision1,$revision2)=@_; my @revision1 = split(/\./,$revision1); my @revision2 = split(/\./,$revision2); my $cmp=0; while ($cmp==0 and (@revision1 or @revision2)) { $cmp = (shift(@revision1) <=> shift(@revision2)); } return $cmp; } # hash subelements by a given attribute or by content sub _hash_by { my ($pml,$what,$by)=@_; my %hash; foreach my $type ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,$what)) { my $name = defined($by) ? $type->getAttribute($by) : $type->textContent; $hash{$name} = $type; } return \%hash; } # traverse a type declaration and collect types referred to # type="type-name" declarations sub _get_referred_types { my ($type,$named_types,$referred_types) = @_; if (ref($type)) { foreach my $ref ($type->findnodes(q(./descendant-or-self::* [ contains(" list alt structure sequence element attribute member ", concat(" ",local-name()," ")) ]/@type))) { my $ref_type = $ref->getValue; if ($ref_type ne EMPTY and !exists( $referred_types->{$ref} )) { if ( $named_types->{ $ref_type } ) { $referred_types->{ $ref_type } = $named_types->{ $ref_type }; _get_referred_types( $ref_type, $named_types, $referred_types ); } else { warn "Cannot find declaration for type '$ref_type' in ".$type->ownerDocument->URI; } } } } } # copy each given named typee into $target_pml # after $import_element unless a type with the # same name already exists in the $existing hash sub import_types { my ($target_pml,$import_element, $types, $existing) = @_; my $target = $target_pml->ownerDocument; my $after_element; my $comment; if (!$opts{'no-comments'}) { my $schema = $import_element->hasAttribute('schema') ? $import_element->getAttributeNode("schema")->toString : EMPTY; my $type = $import_element->hasAttribute('type') ? $import_element->getAttributeNode("type")->toString : EMPTY; $comment = $target->createComment(" ".("=" x 12)." import$schema $type".("=" x 12)." "); $import_element->replaceNode($comment); $after_element = $comment; } else { $after_element = $import_element; } foreach my $ref_type (@$types) { my $name = $ref_type->getAttribute('name'); if (exists $existing->{ $name }) { _DEBUG("skipping type $name - already defined..."); next; } _DEBUG("copying type $name..."); my $copy = $target->importNode($ref_type); $existing->{ $name } = $copy; $target_pml->insertAfter($copy, $after_element); $after_element = $copy; } if (!$opts{'no-comments'}) { $comment = $target->createComment(" ".("=" x (length($comment->nodeValue)-4))." "); $target_pml->insertAfter($comment, $after_element); } else { $import_element->unbindNode; } } # process derive instructions # $named_type is a hash of type declarations indexed by name sub derive_types { my ($pml,$named_type) = @_; my $URL = $pml->ownerDocument->URI; _DEBUG("Deriving types in $URL ..."); foreach my $derive ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'derive')) { my $name = $derive->getAttribute('name'); my $source = $derive->getAttribute('type'); my $type; if (!defined $source or $source eq EMPTY) { die " instruction must specify source type in the attribute 'type' in $URL, line ".$derive->line_number."\n"; } if (!exists $named_type->{ $source }) { die "Can't find source type '$source' for in $URL, line ".$derive->line_number."\n"; } if (defined $name and $name ne EMPTY) { if (exists ($named_type->{ $name })) { die "Refusing to derive existing type '$name' from '$source' in $URL, line ".$derive->line_number."\n"; } $type = $named_type->{ $name } = $named_type->{ $source }->cloneNode(1); $pml->appendChild( $type ); $type->setAttribute('name', $name); } else { $name = $source; $type = $named_type->{ $source }; } _DEBUG("applying derive instructions to type $source, target type name $name..."); unless ($opts{'no-comments'}) { $pml->insertBefore($pml->ownerDocument->createComment(" ".("=" x 12)." derived from $source ".("=" x 12)." "), $type); } # deriving possible for structures, sequences and choices my ($inst) = $derive->findnodes(q(*[ contains(" structure sequence container choice ", concat(" ",local-name()," ")) ])); my ($target) = $type->findnodes(q(*[ contains(" structure sequence container choice ", concat(" ",local-name()," ")) ])); my %type_map = ( structure => ['member', 'name'], sequence => ['element', 'name'], container => ['attribute', 'name'], choice => ['value', undef ], # index by content ); if (!$inst) { die(" must contain a , , , or in $URL, line ".$derive->line_number."\n"); } elsif (!$target or $target->localname ne $inst->localname) { die(" and its source '$source' differ in type in $URL, line ".$derive->line_number."\n"); } else { my $tag = $inst->localname; my ( $item_tag, $index_by ) = @{ $type_map{ $tag } }; _DEBUG("will derive a $tag ($item_tag)"); foreach my $attr (grep { $_->nodeType == ATTRIBUTE_NODE } $inst->attributes) { if ($attr->getValue eq EMPTY) { _DEBUG("removing attribute".$attr->toString. " from <$tag>"); $target->removeAttribute($attr->nodeName) if $target->hasAttribute($attr->nodeName); } else { _DEBUG("copying attribute".$attr->toString. " to <$tag>"); $target->setAttribute($attr->nodeName,$attr->getValue); } } my $items = _hash_by( $target, $item_tag, $index_by ); my ($before) = $target->findnodes('*[@role="#CHILDNODES"]'); foreach my $item ($inst->getChildrenByTagNameNS(PML_SCHEMA_NS,$item_tag)) { my $iname = defined($index_by) ? $item->getAttribute($index_by) : $item->textContent; $item->unbindNode; if (exists( $items->{$iname} )) { _DEBUG("replacing $item_tag name=$iname"); $items->{$iname}->replaceNode( $item ); } else { _DEBUG("adding $item_tag '$iname'"); # should precede content declaration in a if ($item_tag eq 'attribute' and $target->firstChild) { $target->insertBefore($item,$target->firstChild); } elsif (ref $before) { $target->insertBefore($item,$before); } else { $target->appendChild($item); } } } foreach my $delete ($inst->getChildrenByTagNameNS(PML_SCHEMA_NS,'delete')) { my $dname = $delete->textContent; if (exists $items->{ $dname }) { _DEBUG("deleting $item_tag '$dname'"); $items->{ $dname }->unbindNode; } else { warn("No such $item_tag '$dname' in <$tag> '$source' in $URL, line ".$delete->line_number."\n"); } } } $derive->unbindNode; } _DEBUG("Deriving finished."); } # move declaration before any sub fix_root { my ($pml) = @_; my ($root) = $pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root'); if (ref($root) and (my @before = $root->findnodes('preceding-sibling::*[local-name()="type"]'))) { $root->unbindNode; $pml->insertBefore($root,$before[0]); } } { my %cached; my %processing; # 1. resolve path $in relative to base_url # 2. try to locate an already processed schema in cache, # 3. otherwise, read schema from file and process it sub read_schema { my ($in, $param)=@_; my $base_url = $param->{base_url} || EMPTY; _DEBUG("read_schema: $in (base url: $base_url)"); if ($base_url ne EMPTY) { $in = ResolvePath($base_url,$in,$opts{path}); } elsif ( ! -f $in ) { $in = FindIn($in,$opts{path}); } _DEBUG("resolved to: $in"); if ($processing{$in}) { die "Fatal error: Cyclic imports detected when processing instructions in schema $in\n"; } my $dom; unless (exists($cached{ $in })) { _DEBUG("parsing..."); $dom = $cached{ $in } = $PARSER->parse_file($in); _DEBUG("done."); $processing{ $in } = 1; process_schema( $dom, $param ); delete $processing{ $in }; } else { _DEBUG("cached (no need to parse and process)."); $dom = $cached{ $in }; } return $dom; } } # process all and instructions sub process_schema { my ($dom, $param)=@_; my $URL = $dom->URI; my $pml = $dom->documentElement; my $named_type = _hash_by($pml,'type', 'name'); _DEBUG("Processing $URL"); unless ($opts{'no-import'}) { _DEBUG("Processing imports in $URL..."); foreach my $import ( $pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'import') ) { my $ref = $import->getAttribute('type'); my $schema_url = $import->getAttribute('schema'); if ($ref) { _DEBUG("Import type $ref from $schema_url"); } else { _DEBUG("Import all types from $schema_url"); } my $ref_schema = read_schema( $schema_url, { %$param, base_url => $URL }); check_revision($ref_schema,{ revision => $import->getAttribute('revision'), minimal_revision => $import->getAttribute('minimal_revision'), maximal_revision => $import->getAttribute('maximal_revision'), revision_error => "Error importing from schema $schema_url to $URL - revision mismatch: ", }); my @types_to_import=(); if ( defined $ref and $ref ne EMPTY ) { my $schema_types = _hash_by( $ref_schema->documentElement, 'type', 'name' ); my $ref_type = $schema_types->{ $ref }; if ($ref_type) { my %referred_type = ($ref => $ref_type); _get_referred_types($ref_type, $schema_types, \%referred_type); @types_to_import = values %referred_type; } else { warn "Cannot import type '$ref' from ".$ref_schema->URI." to $URL: type declaration not found."; } } else { my $ref_pml = $ref_schema->documentElement; @types_to_import = $ref_pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'type'); unless ($pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root')) { _DEBUG("will try to import to $URL"); unshift @types_to_import, $ref_pml->getChildrenByTagNameNS(PML_SCHEMA_NS,'root'); } } _DEBUG("ready to copy types to $URL"); import_types( $pml, $import, \@types_to_import, $named_type ); } fix_root( $pml ); _DEBUG("Importing to $URL finished."); } derive_types($pml, $named_type) unless $opts{'no-derive'}; } ############################################################ __END__ =head1 pml_simplify pml_simplify - convert a modular PML schema to a simplified PML schema =head1 SYNOPSIS pml_simplify [options] < input_schema.xml > output_schema.xml pml_simplify [options] input_schema.xml output_schema.xml Get help: pml_simplify -u|--usage for usage (synopsis) pml_simplify -h|--help for help pml_simplify --man for the manual page =head1 DESCRIPTION Read a (modular) PML schema file and all its dependencies and output a semantically equivalent simplified PML schema which does not contain CincludeE> or CderiveE> instructions. =head1 OPTIONS =over 4 =item B<--path|-p> dir Try to locate missing schemas and schemas with network URLs in a given directory. This option may be repeated. =item B<--no-derive|-d> Do not process CderiveE> instructions. =item B<--no-import|-i> Do not process CimportE> instructions. =item B<--no-comments|-c> Do not insert informative comment nodes. =item B<--no-format|-f> Do not reformat the output (the result may look ugly and some elements may contain redundant namespace declarations). =item B<--debug|-D> Print lots of debugging messages on the standard error output. =head1 AUTHOR Petr Pajas, Epajas@ufal.ms.mff.cuni.czE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006 by Petr Pajas This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.2 or, at your option, any later version of Perl 5 you may have available. =head1 BUGS None reported... yet. =cut