#!/usr/bin/perl -w use strict; use warnings; use utf8; use open qw(:std :utf8); use integer; use Unicode::Normalize; # Filter_2_07_check_tags.pl # 2010-04-21 # Wolfgang Schmidle # This script checks whether the raw text has been marked up according to the DESpecs. # It performs no semantical check. Rather, it makes sure some formal criteria are fulfilled, # so that other scripts can rely on these criteria being fulfilled. # For example, scripts can rely on being on a separate line. # input: raw text, normally as a whole # output: a (possibly empty) list of warnings # text input my @text; while(<>) { push @text, $_; } # go through the text # overview of the possible tag properties: # tag types: Ic Iw Ss Sp Sa Se (a tag may have more than one type; in practice, only "fig" has two types) # fixed attributes: it fr "#" ("#" only in

) # attribute types: free number any (free: anything except spaces) # fixed attributes that may be added to the raw text: ita fra ... my @inlineTagsWithContent = qw(bf sc _ ^ ul ol red fr sp rom r); my @inlineTagsWithoutContent = qw(gap ? ! n); my @structureTagsSemiseparate = qw(h p q mgl mgr fn ac cap desc var); my @structureTagsSeparatePaired = qw(col tb ind toc fig); my @structureTagsSeparateAlone = qw(hd fig/); # (for pb and rh see below) # Chinese: push @inlineTagsWithContent, qw(sm sl dl cl wl); push @structureTagsSemiseparate, "ti"; push @structureTagsSeparatePaired, "list"; # < V>, < R>, < RV> sind hardcoded, siehe unten my %type = (); foreach (@inlineTagsWithContent) { $type{$_} = "Ic"; } # ... word ... foreach (@inlineTagsWithoutContent) { $type{$_} = "Iw"; } # ... ... foreach (@structureTagsSemiseparate) { $type{$_} = "Ss"; } #

... / ...

foreach (@structureTagsSeparatePaired) { $type{$_} = "Sp"; } # / foreach (@structureTagsSeparateAlone) { $type{$_} = "Sa"; } # $type{"pb"} = ""; $type{"rh"} = "Ss"; # a hack? my $std = " it fr ita fra "; my %attributes = (); my @attributeStandard = qw(pb rh h p q col tb ind toc mgl mgr fn ac cap desc var); my @attributeFree = qw(pb mgl mgr fn ac n r); foreach (@attributeStandard) { $attributes{$_} .= " $std "; } foreach (@attributeFree) { $attributes{$_} .= " free "; } $attributes{"p"} .= " # i ii iii x "; $attributes{"col"} .= " number "; # Chinese: $attributes{"h"} .= " number "; my $i = 0; my $inMetadata = 0; LINE: foreach (@text) { $i++; # remove comments s///g; if (m!^metadata:!) { $inMetadata = 1; next; } if ($inMetadata) { if (m!^$!) { $inMetadata = 0; } next; } # check for incorrect tag syntax if (m!<[^>]*$! || m!<[^>]*! || m!>[^<]*>!) { warn "Line $i: The < and > don't match: \n$_\n"; next LINE; } if (m!]! || m! /?>!) { warn "Line $i: Incorrectly formed tag: \n$_\n"; next LINE; } # check for unknown tags and incorrect attributes TAG: while (m!<([^>]+)>!g) { # accept <001> etc. next if ($1 =~ m!^\d\d\d$!); # Chinese: accept < V>, < R>, < RV> next if ($1 =~ m!^.V$!); next if ($1 =~ m!^.R$!); next if ($1 =~ m!^.RV$!); my ($tag, @attributes) = split " ", $1; # case 1: if ($tag =~ s!^/!!) { unless (exists $type{$tag}) { warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n"; next LINE; } unless ($type{$tag} =~ m!^(Ic|Ss|Sp)$!) { warn "Line $i: <$tag> shouldn't have a closing tag: \n$_\n"; next LINE; } unless ($#attributes == -1) { warn "Line $i: shouldn't have attributes: \n$_\n"; next LINE; } next TAG; } # case 2: if ($tag =~ m!/$!) { unless (exists $type{$tag}) { warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n"; next LINE; } unless ($#attributes == -1) { warn "Line $i: <$tag/> shouldn't have attributes: \n$_\n"; next LINE; } next TAG; } # case 3: unless (exists $type{$tag}) { warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n"; next LINE; } my $freeUsed = 0; my $numberUsed = 0; my $attribute = ""; ATTRIBUTE: while ($attribute = shift @attributes) { next if ($attribute =~ m/^[a-z#]+$/ && $attributes{$tag} =~ m! $attribute !); if ($attribute =~ m!\d+! and (not $numberUsed) and $attributes{$tag} =~ m! number !) { $numberUsed = 1; next ATTRIBUTE; } if ((not $freeUsed) and $attributes{$tag} =~ m! free !) { $freeUsed = 1; next ATTRIBUTE; } warn "Line $i: <$tag> shouldn't have attribute \"$attribute\": \n$_\n"; next LINE; } } # only if all tags are syntactically correct, check some additional things: # special case , with or without if (m!]*>(\[[0-9a-z_.]+\])?(]*>[^<]+)?$!) { unless (m!^]*>(\[[^\]]+\])?(]*>.+)?$!) { warn "Line $i: should be on a separate line: \n$_\n"; } next LINE; } if (m!]!) { warn "Line $i: without : \n$_\n"; next LINE; } TAG: while (m!<([^ >]+)!g) { my $tag = $1; # special case <001> etc. next if ($tag =~ m!^\d\d\d$!); # Chinese: accept < V>, < R>, < RV> next if ($tag =~ m!^.V$!); next if ($tag =~ m!^.R$!); next if ($tag =~ m!^.RV$!); # make sure tags of type "Ss", "Sp" and "Sa" are at the beginning or end of a line # case 1: : Ss at the end, Sp alone, Sa (does not exist) if ($tag =~ s!^/!!) { if ($type{$tag} eq "Ss") { if (m!.!) { warn "Line $i: should be at the end of the line: \n$_\n"; next LINE; } } if ($type{$tag} eq "Sp") { unless (m!^$!) { warn "Line $i: should be on a separate line: \n$_\n"; next LINE; } } next TAG; } # case 2: : Ss at the beginning, Sp alone, Sa alone if ($type{$tag} eq "Ss") { if (m!.<$tag[ >]!) { warn "Line $i: <$tag> should be at the beginning of the line: \n$_\n"; next LINE; } } if ($type{$tag} eq "Sp" || $type{$tag} eq "Sa") { unless (m!^<$tag( [^>]+)?>$!) { warn "Line $i: <$tag> should be on a separate line: \n$_\n"; next LINE; } } } } my $inP = 0; $i = 0; foreach (@text) { $i++; # check for incorrectly nested tags if (m!]!) { unless ($inP == 0) { warn "Line $i : double

: \n$_\n"; } $inP = 1; } if (m!

!) { unless ($inP == 1) { warn "Line $i :

without preceding

: \n$_\n"; } $inP = 0; } } # check the inline model # ... die "That's all.\n"; # TO DO: # in Chinese text need not be at the beginning of a line (separate script for Chinese text?) # # , wenn es "whole book in Fraktur" meint # inline tags nur inline # Gruppen von tags, die nicht ineinander verschachtelt sein dürfen: # p h q (und eher esoterisch: cap desc var) # fn ac mgl mgr: dürfen p (und q?) enthalten, aber auch dort nicht verschachtelt # tag ohne Gegenstück:

ohne

und andersrum # --> schon durch verbotene Verschachtelung gefunden? # Einschränkungen bei # col, tb, ind, toc, fig, hd # Element überkreuz:

text

# wird überhaupt geprüft, dass es z.B. zu jedem auch ein gibt (und umgekehrt)? # zwei Phasen: # 1. wirklich falsche Dinge, # 2. Dinge, die leicht automatisch korrigiert werden können, z.B. auf eigene Zeile # beachte, dass fig nach DESpecs 1.1.2 "Sa" ist und nach DESpecs 2.0 entweder "Sp" oder "Se" # jetzt: # my @structureTagsSeparatePaired = qw(col tb ind toc); # my @structureTagsSeparateAlone = qw(hd fig); # später: # my @structureTagsSeparatePaired = qw(col tb ind toc fig); # my @structureTagsSeparateAlone = qw(hd fig/); # alternativ: eine Korrekturrunde in den frühen Texten, die in korrigiert? # q (blockquotes) analog zu p # a<_>n<_> # Sollen wir eine Liste von Standard-tags akzeptieren, wie: # für <_> # für # für _ _ # oder sogar in unseren Specs ändern? # Und wo wird q3 zu  ? Und hui<_>9 zu huiꝰ ?