#!/usr/bin/perl -w use strict; use warnings; use utf8; use open qw(:std :utf8); use integer; use Unicode::Normalize; use shared::Options; use shared::Language; # Filter_3_06_make_tags_wellformed.pl # 2010-08-22 # Wolfgang Schmidle # Overview: reads all input lines into @text, applies the substitutions below to every line in order, and prints the result. # NOTE(review): several substitutions below have an empty pattern and replacement or an unbalanced ")" as written; presumably the markup inside them was lost in this copy -- confirm against a pristine version of the script. # get options my $language = ""; my $despecs = ""; shared::Options::readParameters( "language=s" => \$language, "despecs=s" => \$despecs ); # interpret options: when no despecs value was given, default it from the language unless ($despecs) { if (shared::Language::standardizedLanguage($language) eq "chinese") { $despecs = "chinese-2.1"; } else { $despecs = "2.0"; } } # text input: read all lines up front my @text; while(<>) { push @text, $_; } # go through the text my $chineseCharacter = shared::Language::chineseCharacter(); foreach (@text) { # reserved characters in XML s!&!&!g; s!&amp;!&!g; # correct syntax for attributes s!<([a-z]+) it>!<$1 style="it">!g; s!<([a-z]+) fr>!<$1 style="fr">!g; s!]+)>!!g; # [0037] --> [0037] s!<([a-z]+) ita>!<$1 xml:lang="it">!g; # (note the change from ita to it) s!<([a-z]+) fra>!<$1 xml:lang="fr">!g; # (note the change from fra to fr) s!]+)>!!g; s!]+)>!!g; # what happens with ? s!]+)>!!g; # what happens with ? s!]+)>!!g; # what happens with ? # add / in empty elements s!!!g; # [0037] --> [0037] s!/]+)>!!g; s!!!g; s!!!g if $despecs eq "1.1.2"; s!!!g; # change the names of some elements # part 1: element names that have to be changed in order to make the XML wellformed s!<\^>!!g; # will become later on s!!!g; s!<_>!!g; # will become later on s!!!g; s!<(\d\d\d)>!!g; s!<\?>!!g; s!<\!>!!g; # part 2: elements whose names differ in the DESpecs and the ECHO schema s!!!g; s!!!g; s!]+)>!!g; s!!!g; s!])!
!
!g; s!])!
!
!g; # s!])!
!
!g; s!!
!; s!!
!; s!!
!; s!])!!!; s!])!!!; s!])!!!; # (assuming it's inside
and not a variable) # chinese text if ($despecs =~ m!^chinese!) { s!<($chineseCharacter)(R|V|RV)>!$1!g; s!

!

!; s!

!

!; s!

!

!; s!

!

!; s!!!; # no ! s!!!; s!!!; s!!!; } } # text output print @text; # TO DO: # Attributes: # in

, number in , # anything without spaces in , , ,, , # anything in # nested # e.g.: --> -->

really in several separate steps? # ordered by modules instead? # in later texts one has to distinguish between and # consider: would renaming elements be more elegant as an XSL script? # attributes in lists instead of going through all cases explicitly? # then: distinguish between it (is in the DESpecs) and ita (is not in the DESpecs) # Multiple attributes: , # If a tag was inserted into the raw text, such as CIↃIↃCCLVIII, # the script should not change it. The telltale sign is probably that there is a "=" in it. # (However, this example would not be changed anyway.) __END__ =head1 NAME Filter_2_03_find_forbidden_characters muh =head1 SYNOPSIS XXX not yet adapted perl Filter_2_03_find_forbidden_characters.pl [options] [file] Options: -help brief help message -man full documentation -dir path to the additional files -forbidden adds a list of forbidden characters =head1 OPTIONS =over 8 =item B<-help> Prints a brief help message and exits. =item B<-man> Prints the manual page and exits. =item B<-forbidden> Adds a list of forbidden characters. The order of lists is important. =back =head1 DESCRIPTION B will check the input file for forbidden characters. For the parameter format in the helper files see ... =head1 TO DO ... =cut