#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open qw(:std :utf8);
use integer;
use Unicode::Normalize;


# Filter_2_07_check_tags.pl
# 2010-04-21
# Wolfgang Schmidle

# This script checks whether the raw text has been marked up according to the DESpecs.
# It performs no semantical check. Rather, it makes sure some formal criteria are fulfilled,
# so that other scripts can rely on these criteria being fulfilled.
# For example, scripts can rely on <pb> being on a separate line.

# input: raw text, normally as a whole
# output: a (possibly empty) list of warnings


# text input

my @text;
while(<>) { push @text, $_; }


# go through the text

# overview of the possible tag properties:
# tag types: Ic Iw Ss Sp Sa Se (a tag may have more than one type; in practice, only "fig" has two types)
# fixed attributes: it fr "#" ("#" only in <p>)
# attribute types: free number any (free: anything except spaces)
# fixed attributes that may be added to the raw text: ita fra ...

my @inlineTagsWithContent = qw(bf sc _ ^ ul ol red   fr sp rom   r);
my @inlineTagsWithoutContent = qw(gap ? !    n);
my @structureTagsSemiseparate = qw(h p q   mgl mgr   fn ac   cap desc var);
my @structureTagsSeparatePaired = qw(col   tb   ind toc   fig);
my @structureTagsSeparateAlone = qw(hd fig/);
# (for pb and rh see below)

# Chinese:
push @inlineTagsWithContent, qw(sm  sl dl cl wl);
push @structureTagsSemiseparate, "ti";
push @structureTagsSeparatePaired, "list";
# < V>, < R>, < RV> sind hardcoded, siehe unten

my %type = ();
foreach (@inlineTagsWithContent) { $type{$_} = "Ic"; }  # ... <bf>word</bf> ...
foreach (@inlineTagsWithoutContent) { $type{$_} = "Iw"; } # ... <n *> ...
foreach (@structureTagsSemiseparate) { $type{$_} = "Ss"; } # <p> ...  /  ... </p>
foreach (@structureTagsSeparatePaired) { $type{$_} = "Sp"; } # <col 1>  /  </col>
foreach (@structureTagsSeparateAlone) { $type{$_} = "Sa"; } # <fig>
$type{"pb"} = "";
$type{"rh"} = "Ss"; # a hack?

my $std = " it fr ita fra ";

my %attributes = ();
my @attributeStandard = qw(pb rh h p q col tb ind toc mgl mgr fn ac cap desc var);
my @attributeFree = qw(pb mgl mgr fn ac n r);
foreach (@attributeStandard) { $attributes{$_} .= " $std "; }
foreach (@attributeFree) { $attributes{$_} .= " free "; }
$attributes{"p"} .= " # i ii iii x ";
$attributes{"col"} .= " number ";

# Chinese:
$attributes{"h"} .= " number ";


my $i = 0;
my $inMetadata = 0;

LINE: foreach (@text) {

	$i++;
	
	# remove comments
	s/<!--(.+?)-->//g;
	
	if (m!^metadata:!) { $inMetadata = 1; next; }
	if ($inMetadata) {
		if (m!^$!) { $inMetadata = 0; }
		next;
	}
	
	# check for incorrect tag syntax
	
	if (m!<[^>]*$! || m!<[^>]*<! 
		|| m!^[^<]*>! || m!>[^<]*>!) {
			warn "Line $i: The < and > don't match: \n$_\n";
			next LINE;	
	}
	if (m!</?[ >]! || m! /?>!) {
			warn "Line $i: Incorrectly formed tag: \n$_\n";
			next LINE;	
	}
	
	# check for unknown tags and incorrect attributes
	
	TAG: while (m!<([^>]+)>!g) {
	
		# accept <001> etc.
		
		next if ($1 =~ m!^\d\d\d$!);
		
		# Chinese: accept < V>, < R>, < RV>
		
		next if ($1 =~ m!^.V$!);
		next if ($1 =~ m!^.R$!);
		next if ($1 =~ m!^.RV$!);
		
		my ($tag, @attributes) = split " ", $1;
		
		# case 1: </tag>
		
		if ($tag =~ s!^/!!) {
			unless (exists $type{$tag}) {
				warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n";
				next LINE;
			}
			unless ($type{$tag} =~ m!^(Ic|Ss|Sp)$!) {
				warn "Line $i: <$tag> shouldn't have a closing tag: \n$_\n";
				next LINE;
			}
			unless ($#attributes == -1) {
				warn "Line $i: </$tag> shouldn't have attributes: \n$_\n";
				next LINE;
			}
			next TAG;
			
		} 
		
		# case 2: <tag/>

		if ($tag =~ m!/$!) {
				unless (exists $type{$tag}) {
				warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n";
				next LINE;
			}
			unless ($#attributes == -1) {
				warn "Line $i: <$tag/> shouldn't have attributes: \n$_\n";
				next LINE;
			}
			next TAG;
		}

		# case 3: <tag>

		unless (exists $type{$tag}) {
			warn "Line $i: Unknown tag or incorrect attribute <$tag>: \n$_\n";
			next LINE;
		}
		my $freeUsed = 0;
		my $numberUsed = 0;
		my $attribute = "";
		ATTRIBUTE: while ($attribute = shift @attributes) {
			next if ($attribute =~ m/^[a-z#]+$/ && $attributes{$tag} =~ m! $attribute !);
			if ($attribute =~ m!\d+! and (not $numberUsed) and $attributes{$tag} =~ m! number !) {
				$numberUsed = 1;
				next ATTRIBUTE;
			}
			if ((not $freeUsed) and $attributes{$tag} =~ m! free !) {
				$freeUsed = 1;
				next ATTRIBUTE;
			}
			warn "Line $i: <$tag> shouldn't have attribute \"$attribute\": \n$_\n";
			next LINE;
		}
	}

	# only if all tags are syntactically correct, check some additional things:

	# special case <pb>, with or without <rh>
	
	if (m!<pb!) {
		#unless (m!^<pb[^>]*>(\[[0-9a-z_.]+\])?(<rh[^>]*>[^<]+</rh>)?$!) {
		unless (m!^<pb[^>]*>(\[[^\]]+\])?(<rh[^>]*>.+</rh>)?$!) {
			warn "Line $i: <pb> should be on a separate line: \n$_\n";
		}
		next LINE;
	}
	if (m!<rh[ >]!) {
		warn "Line $i: <rh> without <pb>: \n$_\n";
		next LINE;		
	}

	TAG: while (m!<([^ >]+)!g) {

		my $tag = $1;
		
		# special case <001> etc.

		next if ($tag =~ m!^\d\d\d$!);

		# Chinese: accept < V>, < R>, < RV>
		
		next if ($tag =~ m!^.V$!);
		next if ($tag =~ m!^.R$!);
		next if ($tag =~ m!^.RV$!);

		# make sure tags of type "Ss", "Sp" and "Sa" are at the beginning or end of a line
	
		# case 1: </tag>: Ss at the end, Sp alone, Sa (does not exist)

		if ($tag =~ s!^/!!) {
		
			if ($type{$tag} eq "Ss") {
				if (m!</$tag>.!) {
					warn "Line $i: </$tag> should be at the end of the line: \n$_\n";
					next LINE;
				}
			}
			if ($type{$tag} eq "Sp") {
				unless (m!^</$tag>$!) {
					warn "Line $i: </$tag> should be on a separate line: \n$_\n";
					next LINE;				
				}
			}
			next TAG;
		}

		# case 2: <tag>: Ss at the beginning, Sp alone, Sa alone

		if ($type{$tag} eq "Ss") {
			if (m!.<$tag[ >]!) {
				warn "Line $i: <$tag> should be at the beginning of the line: \n$_\n";
				next LINE;
			}
		}
		if ($type{$tag} eq "Sp" || $type{$tag} eq "Sa") {
			unless (m!^<$tag( [^>]+)?>$!) {
				warn "Line $i: <$tag> should be on a separate line: \n$_\n";
				next LINE;				
			}
		}
	}
}


my $inP = 0;

$i = 0;

foreach (@text) {

	$i++;

	# check for incorrectly nested tags
	
	if (m!<p[ >]!) { 
		unless ($inP == 0) {
			warn "Line $i : double <p>: \n$_\n";		
		}
		$inP = 1; 
	}
	if (m!</p>!) { 
		unless ($inP == 1) {
			warn "Line $i : </p> without preceding <p>: \n$_\n";		
		}
		$inP = 0; 
	}
}


# check the inline model
# ... 


die "That's all.\n";


# TO DO:
# <h> in Chinese text need not be at the beginning of a line (separate script for Chinese text?)
# <hd>
# <fr>, wenn es "whole book in Fraktur" meint
# inline tags nur inline
# Gruppen von tags, die nicht ineinander verschachtelt sein dürfen: 
#   p h q  (und eher esoterisch: cap desc var)
#   fn ac mgl mgr: dürfen p (und q?) enthalten, aber auch dort nicht verschachtelt
# tag ohne Gegenstück: <p> ohne </p> und andersrum
#   --> schon durch verbotene Verschachtelung gefunden?
# Einschränkungen bei 	
#   col, tb, ind, toc, fig, hd
# Element überkreuz: <mgl><p> text </mgl></p>

# wird überhaupt geprüft, dass es z.B. zu jedem <sc> auch ein </sc> gibt (und umgekehrt)?

# zwei Phasen: 
# 1. wirklich falsche Dinge, 
# 2. Dinge, die leicht automatisch korrigiert werden können, z.B. <pb> auf eigene Zeile

# beachte, dass fig nach DESpecs 1.1.2 "Sa" ist und nach DESpecs 2.0 entweder "Sp" oder "Se"
# jetzt:
# my @structureTagsSeparatePaired = qw(col   tb   ind toc);
# my @structureTagsSeparateAlone = qw(hd fig);
# später:
# my @structureTagsSeparatePaired = qw(col   tb   ind toc   fig);
# my @structureTagsSeparateAlone = qw(hd fig/);
# alternativ: eine Korrekturrunde in den frühen Texten, die <fig> in <fig/> korrigiert?

# q (blockquotes) analog zu p

# a<_>n<_>

# Sollen wir eine Liste von Standard-tags akzeptieren, wie:
# <x> für <_>
# <sqrt> für <r>
# <i> für _ _
# oder sogar in unseren Specs ändern?
# Und wo wird q<x>3</x> zu  ? Und hui<_>9</_> zu huiꝰ ?