#!/usr/bin/perl -w use strict; use warnings; use utf8; use open qw(:std :utf8); use integer; use Unicode::Normalize; use shared::Options; use shared::Language; # Filter_3_06_make_tags_wellformed.pl # 2010-08-22 # Wolfgang Schmidle # get options my $language = ""; my $despecs = ""; shared::Options::readParameters( "language=s" => \$language, "despecs=s" => \$despecs ); # interpret options unless ($despecs) { if (shared::Language::standardizedLanguage($language) eq "chinese") { $despecs = "chinese-2.1"; } else { $despecs = "2.0"; } } # text input my @text; while(<>) { push @text, $_; } # go through the text my $chineseCharacter = shared::Language::chineseCharacter(); foreach (@text) { # reserved characters in XML s!&!&!g; s!&!&!g; # correct syntax for attributes s!<([a-z]+) it>!<$1 style="it">!g; s!<([a-z]+) fr>!<$1 style="fr">!g; s!]+)>!!g; # [0037] --> [0037] s!<([a-z]+) ita>!<$1 xml:lang="it">!g; # (note the change from ita to it) s!<([a-z]+) fra>!<$1 xml:lang="fr">!g; # (note the change from fra to fr) s!]+)>!!g; s!]+)>!!g; # was passiert bei ? s!]+)>!!g; # was passiert bei ? s!]+)>!!g; # was passiert bei ? # add / in empty elements s!!!g; # [0037] --> [0037] s!/]+)>!!g; s!!!g; s!!!g if $despecs eq "1.1.2"; s!!!g; # change the names of some elements # part 1: element names that have to be changed in oder to make the XML wellformed s!<\^>!!g; # will become later on s!!!g; s!<_>!_{!g; # _{will become later on
# Purpose (taken from the script name and the section comments on line 1):
# read the input text, rewrite shorthand tags in every element of @text so
# that the markup becomes well-formed XML, then print the text.
#
# NOTE(review): line 1 begins with "#!", so Perl treats that whole physical
# line as a comment. As shown, the pragmas, the option handling, the input
# loop and the opening "foreach (@text) {" on it are not executed, and the
# closing brace of that loop further down has no live opener. Presumably the
# line breaks were lost in this copy -- TODO restore them from the original.
#
# NOTE(review): many substitutions below have an empty pattern and/or an
# empty replacement (for example "s!!!g"), and some patterns contain an
# unbalanced ")". It looks like the angle-bracket tag text was stripped from
# this copy. The statements are described as shown, not as presumably meant.
# An empty pattern makes s/// reuse the last successfully matched pattern, so
# these lines must not be tidied up without consulting the original text.

# Empty pattern, replacement "}". Presumably the closing counterpart of the
# "_{" substitution at the end of line 1 -- TODO confirm.
s!!}!g;
# Matches a tag consisting of exactly three digits, such as "<123>", and
# captures the digits. The replacement is empty as shown.
s!<(\d\d\d)>!!g;
# Match the literal tags "<?>" and "<!>" (the backslash escapes the "!"
# delimiter). The replacements are empty as shown.
s!<\?>!!g;
s!<\!>!!g;

# part 2: elements whose names differ in the DESpecs and the ECHO schema
s!!!g;

s!!!g;
s!]+)>!!g;
s!!!g;

# NOTE(review): as shown, the next two lines read as a substitution with
# pattern "])" and an empty replacement, followed by a stray "!g". That is
# not one well-formed s/// statement; the original pattern seems to be lost.
s!])!!!g;
s!])!!!g;

# Disabled substitution, kept as found.
# s!])!!!g;

# The following six substitutions carry no /g modifier, so each of them
# changes at most one occurrence per element of @text.
s!!!;
s!!!;
s!!!;
s!])!!!;
s!])!!!;
s!])!!!; # (assuming it's inside and not a variable)

# Chinese text

# This section only runs when the DESpecs version string starts with
# "chinese".
if ($despecs =~ m!^chinese!) {

# Matches "<", one Chinese character, one of R, V or RV, and ">", and
# replaces the whole match by the captured character (as shown).
# $chineseCharacter is interpolated as a regex fragment; it is assigned on
# line 1 from shared::Language::chineseCharacter().
s!<($chineseCharacter)(R|V|RV)>!$1!g;

# The next four substitutions span several physical lines. The line breaks
# are part of the pattern and/or replacement text, so nothing may be inserted
# between those lines. None of them has a /g modifier.
s!!
!;
s!
!
!;
s!
!
!;
s!
!
!;

s!!!; # no ! (the element name seems to be missing from this comment)

s!!!;
s!!!;

s!!!;
# End of the Chinese-only section.
}
# Closes the "foreach (@text) {" loop whose opening brace sits on line 1.
}

# text output

# Prints every element of @text, in input order, to the default output
# handle.
print @text;

# TO DO:
# Attribute: # in
# , Zahl in ,
# irgendwas ohne Leerzeichen in , , ,, ,
# irgendwas in

# verschachtelte

# z.B: --> -->
# wirklich in mehreren getrennten Schritten?
# stattdessen nach Modulen geordnet?

# in späteren Texten muss man und unterscheiden

# überlegen: Elemente umbenennen eleganter als xsl-Skript?

# Attribute in Listen statt alle Fälle explizit durchzugehen?
# dann: unterscheide zwischen it (steht in den DESpecs) und ita (steht nicht in den DESpecs)

# Mehrere Attribute: ,

# Wenn ein tag in den raw text eingefügt wurde, wie zum Beispiel CIↃIↃCCLVIII,
# soll das Skript das nicht ändern. Erkennungszeichen ist wohl, dass in ein "=" ist.
# (Allerdings würde dieses Beispiel sowieso nicht verändert werden.)

__END__

=head1 NAME

Filter_3_06_make_tags_wellformed - rewrites tag shorthand so that the text becomes well-formed XML

=head1 SYNOPSIS

XXX noch nicht angepasst

perl Filter_2_03_find_forbidden_characters.pl [options] [file]

Options:
-help brief help message
-man full documentation
-dir path to the additional files
-forbidden adds a list of forbidden characters

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-forbidden>

Adds a list of forbidden characters. The order of lists is important.

=back

=head1 DESCRIPTION

B<This program> will check the input file for forbidden characters.

For the parameter format in the helper files see ...

=head1 TO DO

...

=cut