#!/usr/bin/perl -w use strict; use warnings; use utf8; use open qw(:std :utf8); use integer; use Unicode::Normalize; use Getopt::Long; use Pod::Usage; $| = 1; require $ENV{'ECHO_SCRIPTS_DIR'} . "/share/common_functions.pl"; consoleStatus('Making the tags wellformed.'); # Filter_3_06_make_tags_wellformed.pl # 2010-08-15 # Wolfgang Schmidle my $name = "Filter_3_06_make_tags_wellformed"; my $textfilterParameters = "Filter_parameters/$name.txt"; # get command line options my $despecs = "chinese-2.1"; # default für chinesische Texte; für westliche Texte wäre der default "2.0" my $man = 0; my $help = 0; my $parameterformat = 'despecs=s,$'."\n"; sub readParameters { GetOptions("despecs=s" => \$despecs, "parameterformat" => sub { print $parameterformat; exit; }, 'help|?' => \$help, man => \$man) or pod2usage(2); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; } sub isTextfilter { return (exists $ENV{'BB_DOC_NAME'}) } readParameters(); # ohne Dateiname: brich mit USAGE ab. unless ($#ARGV > -1) { pod2usage(1); } # text input my @text; while(<>) { push @text, $_; } # read my %unknown = (); my $inParameters = 0; foreach (@text) { last if m!!; if (m!!) { $inParameters = 1; } if ($inParameters) { if (m![\t ,>]despecs *= *([^ ,\n<]+)!) { $despecs = $1; } } } # read the textfilter parameters if (isTextfilter()) { open (PARA, $textfilterParameters) or die "3.06, used as textfilter: can't find the parameter list!\n"; @ARGV = ; close(PARA); chomp @ARGV; readParameters(); } # go through the text my @chineseCharacterBlocks = ( "CJK Unified Ideographs", "CJK Compatibility Ideographs", "CJK Compatibility Ideographs Supplement" ); my $chineseCharacter = ""; foreach (@chineseCharacterBlocks) { s! !!g; $chineseCharacter .= '\p{' . $_ . 
'}'; } foreach (@text) { # reserved characters in XML s!&!&!g; s!&!&!g; # correct syntax for attributes s!<([a-z]+) it>!<$1 style="it">!g; s!<([a-z]+) fr>!<$1 style="fr">!g; s!]+)>!!g; # [0037] --> [0037] s!<([a-z]+) ita>!<$1 xml:lang="it">!g; # (note the change from ita to it) s!<([a-z]+) fra>!<$1 xml:lang="fr">!g; # (note the change from fra to fr) s!]+)>!!g; s!]+)>!!g; # was passiert bei ? s!]+)>!!g; # was passiert bei ? s!]+)>!!g; # was passiert bei ? # add / in empty elements s!!!g; # [0037] --> [0037] s!/]+)>!!g; s!!!g; s!!!g if $despecs eq "1.1.2"; s!!!g; # change the names of some elements # part 1: element names that have to be changed in oder to make the XML wellformed s!<\^>!!g; # will become later on s!!!g; s!<_>!_{!g; # _{will become later on
s!!}!g;
s!<(\d\d\d)>!!g;
s!\@!!g; # new!
s!<\?>!!g;
s!<\!>!!g;

# part 2: elements whose names differ in the DESpecs and the ECHO schema
s!!!g;

s!!!g;
s!]+)>!!g;
s!!!g;

s!])!!!g;
s!])!!!g;

# s!])!!!g;

s!!!;
s!!!;
s!!!;
s!])!!!;
s!])!!!;
s!])!!!; # (assuming it's inside and not a variable)

# chinese text

if ($despecs =~ m!^chinese!) {

s!<([$chineseCharacter])(R|V|RV)>!$1!g;

s!!
!;
s!
!
!;
s!
!
!;
s!
!
!;

s!!!; # kein !

s!!!;
s!!!;

s!!!;
}
}

# text output

# print @text;
printInOutputTextFile(@text);

consoleStatus('Finished. The XML should now be wellformed.');

# TO DO:
# Attribute: # in
# , Zahl in ,
# irgendwas ohne Leerzeichen in , , ,, ,
# irgendwas in

# verschachtelte

# z.B: --> -->
# wirklich in mehreren getrennten Schritten?
# stattdessen nach Modulen geordnet?

# in späteren Texten muss man und unterscheiden

# überlegen: Elemente umbenennen eleganter als xsl-Skript?

# Attribute in Listen statt alle Fälle explizit durchzugehen?
# dann: unterscheide zwischen it (steht in den DESpecs) und ita (steht nicht in den DESpecs)

# Mehrere Attribute: ,

# Wenn ein tag in den raw text eingefügt wurde, wie zum Beispiel CIↃIↃCCLVIII,
# soll das Skript das nicht ändern. Erkennungszeichen ist wohl, dass in ein "=" ist.
# (Allerdings würde dieses Beispiel sowieso nicht verändert werden.)

__END__

=head1 NAME

Filter_3_06_make_tags_wellformed - makes the tags of the input text wellformed XML

=head1 SYNOPSIS

XXX noch nicht angepasst

    perl Filter_3_06_make_tags_wellformed.pl [options] [file]

    Options:
      -help             brief help message
      -man              full documentation
      -despecs          DESpecs version of the input text (default: chinese-2.1)
      -parameterformat  print the parameter format and exit

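=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-despecs>

Sets the DESpecs version that the input text follows. The default is
C<chinese-2.1>. Values starting with C<chinese> switch on the additional
rules for Chinese text.

=item B<-parameterformat>

Prints the parameter format of this script and exits.

=back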

=head1 DESCRIPTION

B<This program> will make the tags of the input file wellformed XML.

For the parameter format in the helper files see ...

=head1 TO DO

...

=cut