#!/usr/bin/perl -w

use strict;
use warnings;
use utf8;
use open qw(:std :utf8);
use integer;
use Unicode::Normalize;
use Getopt::Long;
use Pod::Usage;

$| = 1;

# NOTE(review): this copy of the script appears to have lost every piece of
# text that looked like an XML tag ("<...>"), and all of its line breaks.
# Patterns, replacements and comment fragments are therefore missing.
#   * Statements marked "DAMAGED" are reproduced exactly as they were found,
#     including stray line breaks. Several of them do not compile, and an
#     empty pattern (m!! or s!!...!) does not mean what was intended. They
#     must be restored from a pristine copy before the script is used.
#   * In comments, "[...]" marks a place where text is missing.

# The shared helper library is located through the environment.
defined $ENV{'ECHO_SCRIPTS_DIR'}
    or die "3.06: the environment variable ECHO_SCRIPTS_DIR is not set!\n";
require $ENV{'ECHO_SCRIPTS_DIR'} . "/share/common_functions.pl";

consoleStatus('Making the tags wellformed.');

# Filter_3_06_make_tags_wellformed.pl
# 2010-08-15
# Wolfgang Schmidle

my $name = "Filter_3_06_make_tags_wellformed";
my $textfilterParameters = "Filter_parameters/$name.txt";

# get command line options

# default for Chinese texts; for western texts the default would be "2.0"
my $despecs = "chinese-2.1";
my $man = 0;
my $help = 0;

my $parameterformat = 'despecs=s,$'."\n";

# Parse the options currently held in @ARGV.
# Sets $despecs, $help and $man; "-parameterformat" prints $parameterformat
# and exits. Unknown options end the script with a usage message.
sub readParameters {
    GetOptions(
        "despecs=s"       => \$despecs,
        "parameterformat" => sub { print $parameterformat; exit; },
        'help|?'          => \$help,
        man               => \$man,
    ) or pod2usage(2);
    pod2usage(1) if $help;
    pod2usage(-exitstatus => 0, -verbose => 2) if $man;
}

# True when the environment variable BB_DOC_NAME exists; the script then
# also reads its options from $textfilterParameters (see below).
sub isTextfilter {
    return (exists $ENV{'BB_DOC_NAME'});
}

readParameters();

# without a file name: abort with the usage message
pod2usage(1) unless @ARGV;

# text input
my @text = <>;

# read [...]
# A "despecs = value" entry found in the text replaces the value chosen so far.
my $inParameters = 0;
foreach (@text) {
    last if m!!;                          # DAMAGED: pattern lost
    if (m!!) { $inParameters = 1; }       # DAMAGED: pattern lost
    if ($inParameters) {
        if (m![\t ,>]despecs *= *([^ ,\n<]+)!) { $despecs = $1; }
    }
}

# read the textfilter parameters
if (isTextfilter()) {
    # One option per line; the lines take the place of the command line.
    open(my $parameterFile, '<', $textfilterParameters)
        or die "3.06, used as textfilter: can't find the parameter list!\n";
    @ARGV = <$parameterFile>;
    close($parameterFile);
    chomp @ARGV;
    readParameters();
}

# go through the text

my @chineseCharacterBlocks = (
    "CJK Unified Ideographs",
    "CJK Compatibility Ideographs",
    "CJK Compatibility Ideographs Supplement",
);

# Build the inside of a character class: one \p{...} per block, with the
# spaces removed from the block name.
my $chineseCharacter = "";
foreach my $block (@chineseCharacterBlocks) {
    (my $property = $block) =~ s! !!g;
    $chineseCharacter .= '\p{' . $property . '}';
}

foreach (@text) {
    # reserved characters in XML: escape every "&", then undo the double
    # escaping of ampersands that had already been written as "&amp;".
    # NOTE(review): the damaged copy read  s!&!&!g; s!&amp;!&!g;  which looks
    # like an entity-decoded form of these two lines - confirm against a
    # pristine copy.
    s!&!&amp;!g;
    s!&amp;amp;!&amp;!g;

    # correct syntax for attributes
    s!<([a-z]+) it>!<$1 style="it">!g;
    s!<([a-z]+) fr>!<$1 style="fr">!g;
    s!]+)>!!g;                                # DAMAGED  [0037] --> [0037]
    s!<([a-z]+) ita>!<$1 xml:lang="it">!g;    # (note the change from ita to it)
    s!<([a-z]+) fra>!<$1 xml:lang="fr">!g;    # (note the change from fra to fr)
    s!]+)>!!g;                                # DAMAGED
    s!]+)>!!g;                                # DAMAGED  what happens with [...]?
    s!]+)>!!g;                                # DAMAGED  what happens with [...]?
    s!]+)>!!g;                                # DAMAGED  what happens with [...]?

    # add / in empty elements
    s!!!g;                                    # DAMAGED  [0037] --> [0037]
    s!/]+)>!!g;                               # DAMAGED
    s!!!g;                                    # DAMAGED
    s!!!g if $despecs eq "1.1.2";             # DAMAGED
    s!!!g;                                    # DAMAGED

    # change the names of some elements

    # part 1: element names that have to be changed in order to make the XML wellformed
    s!<\^>!!g;                                # DAMAGED  will become [...] later on
    s!!!g;                                    # DAMAGED
    s!<_>!!g;                                 # DAMAGED  will become [...] later on
    s!!!g;                                    # DAMAGED
    s!<(\d\d\d)>!!g;                          # DAMAGED
    s!\@!!g;                                  # DAMAGED  new!
    s!<\?>!!g;                                # DAMAGED
    s!<\!>!!g;                                # DAMAGED

    # part 2: elements whose names differ in the DESpecs and the ECHO schema
    s!!!g;                                    # DAMAGED
    s!!!g;                                    # DAMAGED
    s!]+)>!!g;                                # DAMAGED
    s!!!g;                                    # DAMAGED
    # DAMAGED (the next two statements contain stray line breaks):
    s!])!
!
!g;
    s!])!
!
!g;
    # A third statement of the same shape was already commented out
    # (how far that comment originally reached cannot be told from this copy):
    # s!])!
    # !
    # !g;
    # DAMAGED:
    s!!
!;
    s!!
!;
    s!!
!;
    # DAMAGED; (assuming it's inside [...] and not a variable)
    s!])!!!;
    s!])!!!;
    s!])!!!;

    # chinese text
    if ($despecs =~ m!^chinese!) {
        s!<([$chineseCharacter])(R|V|RV)>!$1!g;    # DAMAGED? replacement may have lost tags
        # DAMAGED (four statements, patterns and replacements lost):
        s!

!

!;
        s!

!

!;
        s!

!

!;
        s!

!

!;
        s!!!;                                 # DAMAGED  no [...]!
        s!!!;                                 # DAMAGED
        s!!!;                                 # DAMAGED
        s!!!;                                 # DAMAGED
    }
}

# text output
# print @text;
printInOutputTextFile(@text);

consoleStatus('Finished. The XML should now be wellformed.');

# TO DO:
# Attributes:
#   in [...], number in [...],
#   anything without spaces in [...], [...], [...], [...], [...],
#   anything in [...]
# nested [...]
# e.g. [...] --> [...] --> [...] really in several separate steps?
# ordered by modules instead?
# in later texts one has to distinguish between [...] and [...]
# consider: would renaming the elements be more elegant as an XSL script?
# attributes in lists instead of going through all cases explicitly?
# then: distinguish between it (is in the DESpecs) and ita (is not in the DESpecs)
# several attributes: [...], [...]
# If a tag has been inserted into the raw text, for example [...]CIↃIↃCCLVIII[...],
# the script should not change it. The telltale sign is probably that there is a "=" in [...].
# (However, this example would not be changed anyway.)

__END__

=head1 NAME

Filter_3_06_make_tags_wellformed

=head1 SYNOPSIS

perl Filter_3_06_make_tags_wellformed.pl [options] file

 Options:
   -help              brief help message
   -man               full documentation
   -despecs           version of the DESpecs the text follows
   -parameterformat   print the parameter format and exit

=head1 OPTIONS

=over 8

=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-despecs>

The version of the DESpecs the text follows. The default is C<chinese-2.1>.
Values starting with C<chinese> switch on the additional rules for Chinese
text. The value can also be set in the text itself and, when the script
runs as a textfilter, in
F<Filter_parameters/Filter_3_06_make_tags_wellformed.txt>; both are read
after the command line.

=item B<-parameterformat>

Prints the parameter format of this script and exits.

=back

=head1 DESCRIPTION

B<Filter_3_06_make_tags_wellformed> rewrites the tags in the input file so
that the text becomes wellformed XML.

=head1 TO DO

...

=cut