#!/usr/bin/perl -w

use strict;
use warnings;
use utf8;
use open qw(:std :utf8);
use integer;
use Unicode::Normalize;

# Filter_3_03_replace_escape_sequences.pl
# 2010-04-04
# Wolfgang Schmidle

# This script replaces all escape sequences defined in the DESpecs by their Unicode equivalent.
# It does not perform any semantically motivated replacements. For example,
# it does not change <^>9 ("superscript 9") into the medievalist character U+A770.

# input: raw text, normally as a whole
# output: the text with all escape sequences replaced

# Backslash escapes: "\<key><letter>" becomes "<letter><combining mark>".
# Keys are stored as the literal characters; they are regex-quoted when the
# pattern is built below, so no key needs hand-escaping here.
my %combining = (
    "'" => "\x{0301}",    # acute accent
    "`" => "\x{0300}",    # grave accent
    "^" => "\x{0302}",    # circumflex accent
    '"' => "\x{0308}",    # diaeresis/umlaut
    "~" => "\x{0303}",    # tilde
    "," => "\x{0327}",    # cedilla
    "." => "\x{0307}",    # dot above
    "=" => "\x{0304}",    # macron
    "-" => "\x{0306}",    # breve
    # (some fancy versions, due to a LaTeX conversion problem in the DESpecs)
    "’" => "\x{0301}",    # fancy acute accent
    "‘" => "\x{0300}",    # fancy grave accent
    "”" => "\x{0308}",    # fancy diaeresis
);

# Brace escapes: "{<key>}" becomes the mapped string.
# Invisible, private-use and combining characters are written as \x{...}
# escapes so that they cannot be lost when the file is edited or copied.
my %bracket = (
    # note that the ligature ij will be silently resolved later on in the workflow
    # NOTE(review): that remark suggests the ligature U+0133 may have been
    # intended as the value; the plain letters are kept here - confirm.
    "ij"      => "ij",
    "is"      => "is",                  # (instantly resolved as there is no is ligature in Unicode)
    "ſs"      => "ß",
    "ae"      => "ę",
    "con"     => "ↄ",                   # U+2184: see MUFI 3.0 part 2, p.69
    "que"     => "\x{E8BF}",            # PUA U+E8BF
    "quam"    => "\x{E8BF}\x{0308}",    # PUA U+E8BF plus U+0308 (or U+0303 ?)
    "quis"    => "ꝙ",                   # U+A759
    "quo"     => "q\x{030A}",           # q plus U+030A
    "pro"     => "ꝓ",                   # U+A753
    "secundu" => "ẜ",
);

# One precompiled alternation per table. Keys are sorted and regex-quoted, so
# the patterns (and therefore the output) do not depend on hash ordering.
my $combining_keys = join '|', map { quotemeta } sort keys %combining;
my $combining_re   = qr/\\($combining_keys)([a-z])/;

my $bracket_keys = join '|', map { quotemeta } sort keys %bracket;
my $bracket_re   = qr/\{($bracket_keys)\}/;

# Run as a script; do nothing when the file is loaded from other code.
main() unless caller;

# Replace all escape sequences in one chunk of text.
#   parameter: the raw text (a line or the whole text)
#   returns:   the text with all escape sequences replaced, in NFC
sub replace_escape_sequences {
    my ($text) = @_;

    $text = NFC($text);    # normalise before

    # resolve \ sequences (only lower-case a-z are handled as base letters)
    $text =~ s/$combining_re/$2$combining{$1}/g;

    # resolve $ to long s (must be before resolving { }, because of {ſs})
    $text =~ s/\$/ſ/g;

    # resolve { }
    $text =~ s/$bracket_re/$bracket{$1}/g;

    return NFC($text);     # normalise after
}

# Read the text from the files named on the command line (or from STDIN),
# replace the escape sequences and print the result to STDOUT.
sub main {
    # text input
    my @text = <>;

    # go through the text; text output
    print map { replace_escape_sequences($_) } @text;

    return;
}

# TO DO:
# fractions: { / }
# Move the definition of %bracket into a helper file as well? But these are the
# escape sequences from the specs, i.e. a closed list.
# On the other hand: new specs, new list.