1 | #!/usr/bin/perl -w |
---|
2 | use strict; |
---|
3 | use warnings; |
---|
4 | use utf8; |
---|
5 | binmode STDOUT, ':utf8'; |
---|
6 | use integer; |
---|
7 | |
---|
8 | |
---|
9 | # Euclid_1607.pl |
---|
10 | # |
---|
11 | # author: Wolfgang Schmidle |
---|
12 | # (c) Max Planck Institute for the History of Science, Berlin, Germany |
---|
13 | # version 1.5, 2009-04-07 |
---|
14 | # |
---|
15 | # This script processes the raw text of Euclid (1607), i.e. it turns (a local copy of) |
---|
16 | # http://pythia.mpiwg-berlin.mpg.de/department1/mpdl/raw-texts/WO1_Euclid_1607.txt/WO1_Euclid_1607_V1.txt |
---|
17 | # into a valid XML file. |
---|
18 | # |
---|
19 | # For Euclid (1607) see also |
---|
20 | # https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/mpdl-project-content/wiki/WO1_Euclid_1607 |
---|
21 | # |
---|
22 | # Usage: perl Euclid_1607.pl Euclid_1607.txt > Euclid_1607.xml |
---|
23 | |
---|
24 | |
---|
# Path of the raw text file: first (and only) command-line argument.
my $rawtext = shift;


# text-specific preparations


# global state shared by all processing passes

# the lines of the text, one element per line
my @text;

# counters ($line is deliberately left undefined; every pass sets it before use)
my $line;
my ($doNotCheckBeforeLine, $pb) = (0) x 2;
my ($def, $post, $comm, $propos) = (0) x 4;

# per-type counters for the div's of level 4
my %divTypeCount;

# booleans: which kind of element are we currently in?
my ($inH, $inP, $inMath, $inMg, $inTb) = (0) x 5;

# booleans: italics flags
my ($italicsH, $italicsP, $italicsTb, $italicsMg, $mgInItalics) = (0) x 5;

my $lineStartsWithS = 0;

# booleans: position in the div structure
my ($inDiv3, $inDef, $defOnHold, $inPost, $inComm, $inPropos) = (0) x 6;

my $checkForDiv4 = 0;

# pointers (line numbers)
my $inMgSince = 0;

# others
my ($pbTag, $div4) = ("", "");
---|
74 | |
---|
75 | |
---|
76 | # read in the raw text |
---|
77 | # and immediate emendations (things that break the processing, and changes of the line structure) |
---|
78 | |
---|
# Read the raw text file line by line into @text and apply the immediate
# emendations (things that break the processing, and changes of the line
# structure).  $line counts the physical lines of the input file; because
# extra "<pb>" lines are pushed below, it is not an index into @text.

die "Usage: perl Euclid_1607.pl Euclid_1607.txt > Euclid_1607.xml\n"
    unless defined $rawtext;

$line = 0;

open(my $rawFh, "<:utf8", $rawtext)
    or die "Can't open the raw text file \"$rawtext\": $!\n";
while (<$rawFh>) {

    $line++;

    s!\r?\n!!;    # remove line ends (Windows, and plain "\n" as well)

    next if $line == 73677;    # the last line contains the forbidden character SUBSTITUTE (U+001A)

    # <pb> and <h> on separate lines (line 7, 303)
    # (note that this emendation changes the line numbering!)
    if (m!<pb><h!) {
        push @text, "<pb>";
        s!<pb>!!;
    }

    # resolve ten occurrences of </mg[lr]></p>
    # (this code would fail if <mgr>...</mgr><p> is in one line)
    if (m!</mg[lr]></p>!) {
        s!</p>!!;

        # Walk back from the most recently stored line to the line on which the
        # note opens.  $#text is that line's index however many extra "<pb>"
        # lines have been inserted so far; the scan never leaves the array.
        my $beginningOfNote = $#text;
        while ($beginningOfNote > 0 && $text[$beginningOfNote] !~ m!<mg[lr]!) {
            $beginningOfNote--;
        }

        # the paragraph is closed on the line before the note
        if ($beginningOfNote > 0) {
            $text[$beginningOfNote-1] .= '</p>';
        }
        else {
            warn "line $line: no line before the marginal note found for the displaced </p>\n";
        }
    }

    # remove library stamp
    if (($line >= 26) && ($line <= 31)) { $_ = ""; }

    push @text, $_;
}
close($rawFh);
---|
112 | |
---|
113 | |
---|
114 | # emendations of the raw text before XMLifying |
---|
115 | |
---|
# Emendation pass: walks over @text once and corrects each line in place ($_ is
# an alias of the array element).  $line is the 1-based position in @text.
# The order of the substitutions matters; see the individual remarks.

$line = 0;

for (@text) {

    $line++;

    # $ --> ſ (long s)
    s!\$!ſ!g;

    # & is a reserved character in XML: replace it by the predefined entity
    # NOTE(review): from here on, any pattern that is meant to match a literal
    # ampersand in the text has to match "&amp;" instead.
    s!&!&amp;!g;

    # provisional treatment of a table which fills a whole page
    # The problem is that one sentence starts on p.501 and continues on p.503, and p.502 is a table with caption.
    # The first "solution" (<cap> --> <h>) was problematic because it meant that <head> could be inside <s>.
    # Now we provisionally solve the problem by inserting a period, i.e. by artificially splitting up one sentence into two.
    # An alternative would be to accept <cap> and <tb> within <s>, but there is only this single table within any <s>.
    # Another alternative would be to move the last two lines of the paragraph from p.503 to p.501 (all other tables are between two <p>).
    s!Al<007>as proportiones eiuſdem!Al<007>as proportiones eiuſdem.!; # line 23791
    s!<cap>TABVLA PYTHA GORICA.</cap>!<cap>TABVLA PYTHAGORICA.</cap>!; # line 23792: remove a space

    # mid dot after fractions should be normal period
    s!·!.!g;

    # centered formulas that look like tables or headings, and similar cases
    s!<tb>16. # 24. # 36. # 54. # 81.</tb>!<math><seq>16, 24, 36, 54, 81</seq></math>!; # line 25489
    s!<tb>3. # 4. # 12. # 16. # 48. # 64. # 192. # 256.</tb>!<math><seq>3, 4, 12, 16, 48, 64, 192, 256</seq></math>!; # line 25510
    # (the ampersand of a trailing "&c." has already been escaped above)
    if (s!<h>([\d\@{/}]+\.( [\d\@{/}]+\.)*( &amp;c\.)?)</h>!<math><seq>$1</seq></math>!) {
        s!\.!,!g;
        s!,</seq>!</seq>!g;
    }
    s!<tb it>7 # R. q. 63. # 9.</tb>!<math>7 # _R. q._ 63. # 9.</math>!; # line 25749

    # typos that are a problem for the script
    s!<scG</sc>!<sc>G</sc>!; # line 57290; must be before "obviously missing spaces between words"
    s!R Q\. Q T,!R Q, Q T,!; # line 61676: would produce an artifact sentence

    # old-style numerals typed as letters
    # (must be before "obviously missing spaces between words")
    s!PROPOS !PROPOS. !; # normalize the period
    s!PROPOS\. I!PROPOS. 1!;
    # NOTE(review): the first "PROPOS. II" of a line has just been turned into
    # "PROPOS. 1I" by the previous rule, so the next rule cannot match it any more;
    # that case is handled by the "PROPOS\. 1I\." rule among the heading emendations below.
    s!PROPOS\. II!PROPOS. 11!;
    s!<mgl aI>5\.!<mgl a>15.!; # line 38201
    s!<mgl bI>I\.!<mgl b>11.!; # line 38206
    s!<mgl dI>5\.!<mgl d>15.!; # line 38226
    # s!(<mg[lr].*?)>II\.!$1>11.!; # this may be wrong!

    # obviously missing spaces between words
    # (part 1: remove the exceptions)
    s!peZ!pez!g; # z in italics has often been typed as capital Z (trapeZ...)
    s!pe Z!pez!; # (by the way: remove some superfluous spaces)
    s!ClaZomenius!Clazomenius!; # line 4741
    s!θaμaIoIIoιηπnὴ!θαυματοποιητικὴ!; # line 4673: a single Greek word going very wrong (corrected version without ligature braces)
    s!oſtend<007>mW!oſtendimus!; # line 20009: badly printed
    s!lxIx!lxix!; # line 54383: dotless i typed as capital I in roman number
    s!lxIv!lxiv!; # line 54416: dotless i typed as capital I in roman number
    s!X CIV!xciv!; # (line 55865: superfluous space in roman number)
    s!lXXXIX!lxxxix!; # line 55866: small and capital letters in roman number
    s!paraHelæ!parallelæ!; # line 61171: ll --> H
    s!kL M!K L M!; # line 63775: typesetter's fault
    s!eCùm!<n e> Cùm!; # line 64564: the anchor is difficult to identify
    s!AbCD!ABCD!; # line 68929: typesetter's fault
    s!CBkHAE!CBKHAE!; # line 71905: typesetter's fault
    # (part 2: apply the rule)
    s!([a-zæ])([A-Z])!$1 $2!; # insert space between small letter and capital letter (first occurrence in the line only)

    s! exdefin\.! ex defin.!g;

    # long s --> f
    s! deſin\.! defin.!g;
    s! fint! ſint!g; # I didn't check whether the typesetter or the typists got it wrong; likely the latter

    # running heads
    # (all 1451 running heads have the form <rh>[A-Z. Æ´:_]+</rh>)
    s!LIBER I\.´!LIBER I.!;
    s!LIBER Y!LIBER I!;
    s!LIBER XII:!LIBER XII.!;
    s!<rh>_(.+?)_(\.?</rh>)!<rh>$1$2!; # remove italics in the running head (rather pointless information in the running head)

    # italics
    s!<h>_(.+?)_</h>!<h it>$1</h>!; # replace _ _ by it in one-line headings
    s!_K_!K!g; # 264 occurrences of K in italics: typesetter's fault

    # notes in italics: _ _ --> it (problem of the Specs 1.1.2)
    # $inMgSince holds the (1-based) line on which the current note started, 0 outside of notes;
    # $mgInItalics stays 1 only as long as every line of the note looks like italics.
    if (m!<mg[lr]!) {
        $inMgSince = $line;
        $mgInItalics = 1;
    }
    if ($inMgSince) {
        # examples of lines that count as italics: <mgl>_text_<mgl>; <mgl>_text_.<mgl>; <mgl>12.; <mgl>12. _pri_.</mgl>
        # does not catch: 2 2. and a few other cases
        unless (m!(<mg[lr].*?>)?(\d+\. )?_.+?_[.-]?(</mg[lr])?! || m!<mg[lr].*?>[\d\@]+\.?!) { $mgInItalics = 0; }
        if (m!</mg[lr]>!) {
            if ($mgInItalics) {
                # mark the opening tag of the note and strip the markers from all of its lines
                $text[$inMgSince-1] =~ s!<(mg[lr])(.*?)>!<$1 it$2>!;
                for my $i ($inMgSince-1 .. $line-1) { $text[$i] =~ s!_!!g; } # remove ALL italics markers in these notes
            }
            $inMgSince = 0;
        }
    }

    # incorrect periods, which would produce artifact sentence tags
    # (a regex such as "\. [a-z]" would not work well)
    # line ends
    s!3 DVABVS datis rectis lineis inæqualibus\.!3 DVABVS datis rectis lineis inæqualibus,!; # badly printed comma
    s!reliquis trianguli lateribus deſcribuntur\.!reliquis trianguli lateribus deſcribuntur,!; # line 516: badly printed comma
    s!ex denominatori\.!ex denominatori-!; # line 24587
    s! Et\.! Et!; # line 2179
    s! Non\.! Non!; # line 3671
    s!pun\.!pun-!; # line 16033
    # others
    s!4c\.!40.!; # line 33222
    s!duo\. us!duobus!; # line 33949
    s!commenſura\.!commenſura-!; # line 54153

    # resolve punctuation clusters (which would cause artifact sentences)
    # (be careful with & !)
    s!\.\.!.!;

    # emendations of headings
    s!<h>SCHOL!<h it>SCHOL!; # lines 6820, 12246, 59058, 66953: I haven't checked the originals; call it normalization
    s!<h>_SCHOLIVM_!<h it>SCHOLIVM!; # 5 times; not checked
    s!EXCAMPANO!EX CAMPANO!;
    s!<h>EX CAMPANO!<h it>EX CAMPANO!; # not checked
    s!EXPROCLO!EX PROCLO!;
    s!<h>EX PROCLO!<h it>EX PROCLO!; # not checked
    s!EXORONTIO!EX ORONTIO.!;
    s!<h>EX CARDANO!<h it>EX CARDANO!; # not checked
    s!EX PELET ARIO!EX PELETARIO!;
    s!<h>EX PELETARIO!<h it>EX PELETARIO!; # not checked
    s!<h>PRAXIS!<h it>PRAXIS!; # not checked
    s!<h>EX FEDERICO COMMANDINO!<h it>EX FEDERICO COMMANDINO!; # not checked

    s!PROBLEMA I\.!PROBLEMA 1.!;
    s!PROPOSITIO I\.!PROPOSITIO 1.!;
    s!PROPOS\. 1I\.!PROPOS. 11.!;
    s!PROPOS\. I\.!PROPOS. 1.!;

    s!CONVERSV M!CONVERSVM!;
    s!COROLLARIVMI!COROLLARIVM I!;

    s!<h>THEOR\. 13\. PROPOS\. 19</h>!<h>THEOR. 13. PROPOS. 19.</h>!; # line 34061

    # multi-line book headings: split them up into separate one-line headings
    if ($line == 13631 + 2) { s!SECVNDVM\.!SECVNDVM.</h>!; }
    if ($line == 13632 + 2) { s!_DEFINITIONES_\.!<h it>DEFINITIONES.</h>!; }
    if ($line == 13633 + 2) { s!I\.</h>!<h>I.</h>!; }

    if ($line == 16232 + 2) { s!TERTIVM\.!TERTIVM.</h>!; }
    if ($line == 16233 + 2) { s!_DEFINITIONES\._</h>!<h it>DEFINITIONES.</h>!; }

    if ($line == 21217 + 2) { s!QVARTVM\.!QVARTVM.</h>!; }
    if ($line == 21218 + 2) { s!_DEFINITIONES_\.!<h it>DEFINITIONES.</h>!; }
    if ($line == 21219 + 2) { s!I\.</h>!<h>I.</h>!; }

    if ($line == 38501 + 2) { s!SEPTIMVM\.!SEPTIMVM.</h>!; }
    if ($line == 38502 + 2) { s!_DEFINITIONES\._</h>!<h it>DEFINITIONES.</h>!; }
    # these changes are tedious in a script; better emend it directly in the txt file!

    s!<h it>_DEFINITIONES_\.</h>!<h it>DEFINITIONES.</h>!;
    s!<h it>_DEFINITIONES\._</h>!<h it>DEFINITIONES.</h>!;

    if ($line == 63713) { s!<h>THEOR. 16. PROPOS. 16.</h>!<h>THEOR. 16. PROPOS. 18.</h>!; } # silent correction of obvious typo!
    if ($line == 66659) { s!PROOPS!PROPOS!; } # silent correction of obvious typo in the book

    s!<h>INVENIRE quintam Apotomen\.</h>!<p>INVENIRE quintam Apotomen.</p>!; # line 55545: would create artifact div4

    # superscript marked as subscript (there is no <^> in Clavius)
    s!<_>e</_>x E\.!ex E!; # line 11142 (p.0250): typesetter's mistake: remove superscript and dot
    s!2<_>a</_>!<reg orig="2a" type="abbrev">secunda</reg>!; # line 20594 (p.0435)

    # misc. typos I stumbled upon
    s!I hilo!Philo!; # badly printed P
    s!Mænum!Mœnum!; # æ --> œ
    s!mθnſtratione!monſtratione!; # θ --> o
    s!IV M!IVM!; # superfluous space as in SCHOLIV M, COROLLARIV M
    s!SCHOLI M!SCHOLIVM!; # missing V
    s!ZVART!QVART!; # ZV --> QV
    s!7. quind. Septima libri quindecimi.!7. quind. # Septima libri quindecimi.!; # line 6769: missing # in table

}
---|
296 | |
---|
297 | # raw text --> XML |
---|
298 | |
---|
# metadata: 30 additional lines
# (XML declaration, the <echo> root element with its namespace declarations,
# the <metadata> record and the opening <text> tag; the here-document is
# single-quoted, so nothing in it is interpolated)
my $metadataHeader = <<'METADATA';
<?xml version="1.0" encoding="UTF-8"?>
<echo xmlns="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcq="http://purl.org/dc/qualifiers/1.0/"
xmlns:dct="http://purl.org/dc/terms/1.0/"
xmlns:echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xhtml="http://www.w3.org/1999/xhtml"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:xml="http://www.w3.org/XML/1998/namespace">
<metadata>
<dc:creator>Clavius</dc:creator>
<dc:title xml:lang="la">Elementorum Libri XV</dc:title>
<dc:date>
<rdf:Description>
<dcq:dateScheme>ISO 8601</dcq:dateScheme>
<rdf:value>1607</rdf:value>
</rdf:Description>
</dc:date>
<dc:language>
<rdf:Description>
<dcq:languageScheme>ISO 639-2</dcq:languageScheme>
<rdf:value>lat</rdf:value>
</rdf:Description>
</dc:language>
<dc:identifier>test.xml</dc:identifier>
<dc:rights>open access</dc:rights>
<dct:license>http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration</dct:license>
</metadata>
<text xml:lang="la">
METADATA
print $metadataHeader;
---|
332 | |
---|
333 | $line = 0; |
---|
334 | |
---|
335 | for (@text) { |
---|
336 | |
---|
337 | $line++; |
---|
338 | |
---|
339 | |
---|
340 | # ad hoc tagging of the higher-level div structure |
---|
341 | # including some additional lowest-level closing div's |
---|
342 | # note that single lines have been inserted before lines 7 and 303 ! |
---|
343 | |
---|
344 | # frontmatter |
---|
345 | if ($line == 1) { print '<front><div level="1" type="title" n="1">'; } |
---|
346 | if ($line == 33) { print '</div><div level="1" type="dedication" n="1">'; } # from here on: offset 1 |
---|
347 | if ($line == 223) { print '</div><div level="1" type="preface" n="1">'; } |
---|
348 | if ($line == 304) { print '</div><div level="1" type="index" n="1">'; } |
---|
349 | if ($line == 386) { print '</div><div level="1" type="index" n="2">'; } # from here on: offset 2 |
---|
350 | if ($line == 2087) { print '</div><div level="1" type="index" n="3">'; } |
---|
351 | if ($line == 4377) { print '</div><div level="1" type="preface" n="2">'; } |
---|
352 | if ($line == 4505) { print '</div><div level="1" type="dedication" n="2">'; } |
---|
353 | if ($line == 4563) { print '</div><div level="1" type="introduction" n="1">'; } |
---|
354 | |
---|
355 | # book 1 |
---|
356 | if ($line == 5225) { print '</div></front><body><div level="1" type="book" n="1"><div level="2" type="booktitle" n="1">'; } |
---|
357 | if ($line == 5230) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
358 | if ($line == 6226) { print '</div></div>'; $inDiv3 = 0; $inDef = 0; $inPost = 1; $post = 0; } |
---|
359 | if ($line == 6283) { print '</div></div>'; $inDiv3 = 0; $inPost = 0; $inComm = 1; $comm = 0; } |
---|
360 | if ($line == 6777) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inComm = 0; $inPropos = 1; $propos = 0; } |
---|
361 | |
---|
362 | # book 2 |
---|
363 | if ($line == 13629) { print '</div></div></div></div><div level="1" type="book" n="2"><div level="2" type="booktitle" n="1">'; } |
---|
364 | if ($line == 13634) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
365 | if ($line == 13806) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
366 | |
---|
367 | # book 3 |
---|
368 | if ($line == 16230) { print '</div></div></div></div><div level="1" type="book" n="3"><div level="2" type="booktitle" n="1">'; } |
---|
369 | if ($line == 16235) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
370 | if ($line == 16481) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
371 | |
---|
372 | # book 4 |
---|
373 | if ($line == 21215) { print '</div></div></div></div><div level="1" type="book" n="4"><div level="2" type="booktitle" n="1">'; } |
---|
374 | if ($line == 21220) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
375 | if ($line == 21301) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
376 | |
---|
377 | # book 5 |
---|
378 | if ($line == 23058) { print '</div></div></div></div><div level="1" type="book" n="5"><div level="2" type="booktitle" n="1">'; } |
---|
379 | if ($line == 23063) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
380 | if ($line == 23194) { print '</div><div level="3" type="addendum" n="1">'; $defOnHold = 1; } |
---|
381 | if ($line == 27240) { $defOnHold = 0; } |
---|
382 | if ($line == 27611) { print '</div><div level="3" type="addendum" n="2">'; $defOnHold = 1; } |
---|
383 | if ($line == 28186) { $defOnHold = 0; } |
---|
384 | if ($line == 28707) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
385 | |
---|
386 | # book 6 |
---|
387 | if ($line == 31048) { print '</div></div></div></div><div level="1" type="book" n="6"><div level="2" type="booktitle" n="1">'; } |
---|
388 | if ($line == 31053) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
389 | if ($line == 31640) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
390 | |
---|
391 | # second book title |
---|
392 | if ($line == 38391) { print '</div></div></div></div><div level="1" type="index" n="1">'; $inPropos = 0; $inDiv3 = 0; } # quick'n'dirty !!! |
---|
393 | if ($line == 38480) { print '</div><div level="1" type="title" n="1">'; } |
---|
394 | |
---|
395 | # book 7 |
---|
396 | if ($line == 38499) { print '</div><div level="1" type="book" n="7"><div level="2" type="booktitle" n="1">'; } |
---|
397 | if ($line == 38504) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
398 | if ($line == 39239) { print '</div></div>'; $inDiv3 = 0; $inDef = 0; $inPost = 1; $post = 0; } |
---|
399 | if ($line == 39250) { print '</div></div>'; $inDiv3 = 0; $inPost = 0; $inComm = 1; $comm = 0; } |
---|
400 | if ($line == 39355) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inComm = 0; $inPropos = 1; $propos = 0; } |
---|
401 | |
---|
402 | # book 8 |
---|
403 | if ($line == 42003) { print '</div></div></div></div><div level="1" type="book" n="8"><div level="2" type="booktitle" n="1">'; } |
---|
404 | if ($line == 42008) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
405 | |
---|
406 | # book 9 |
---|
407 | if ($line == 44322) { print '</div></div></div></div><div level="1" type="book" n="9"><div level="2" type="booktitle" n="1">'; } |
---|
408 | if ($line == 44327) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
409 | |
---|
410 | # book 10 |
---|
411 | if ($line == 48379) { print '</div></div></div></div><div level="1" type="book" n="10"><div level="2" type="booktitle" n="1">'; } |
---|
412 | if ($line == 48384) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
413 | if ($line == 48638) { print '</div>'; $inDef = 0; } |
---|
414 | if ($line == 48639) { print '<div type="postulatum" level="3" n="1">'; } # a single postulate, without number "I." |
---|
415 | if ($line == 48650) { print '</div></div>'; $inDiv3 = 0; $inComm = 1; $comm = 0; } |
---|
416 | if ($line == 48677) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inComm = 0; $inPropos = 1; $propos = 0; } |
---|
417 | |
---|
418 | if ($line == 52978) { print '</div></div></div><div level="2" type="bookhead" n="2">'; $inDiv3 = 0; $inPropos = 0; $inDef = 1; $def = 0; } |
---|
419 | if ($line == 53045) { print '</div></div></div><div level="2" type="bookmain" n="2">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
420 | if ($line == 55353) { print '</div></div></div><div level="2" type="bookhead" n="3">'; $inDiv3 = 0; $inPropos = 0; $inDef = 1; $def = 0; } |
---|
421 | if ($line == 55386) { print '</div></div></div><div level="2" type="bookmain" n="3">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
422 | |
---|
423 | # book 11 |
---|
424 | if ($line == 57419) { print '</div></div></div></div><div level="1" type="book" n="11"><div level="2" type="booktitle" n="1">'; } |
---|
425 | if ($line == 57425) { print '</div><div level="2" type="bookhead" n="1">'; $inDiv3 = 0; $inDef = 1; $def = 0; } |
---|
426 | if ($line == 58209) { print '</div></div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
427 | |
---|
428 | # book 12 |
---|
429 | if ($line == 61352) { print '</div></div></div></div><div level="1" type="book" n="12"><div level="2" type="booktitle" n="1">'; } |
---|
430 | if ($line == 61358) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
431 | |
---|
432 | # book 13 |
---|
433 | if ($line == 63790) { print '</div></div></div></div><div level="1" type="book" n="13"><div level="2" type="booktitle" n="1">'; } |
---|
434 | if ($line == 63796) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
435 | |
---|
436 | # book 14 |
---|
437 | if ($line == 66389) { print '</div></div></div></div><div level="1" type="book" n="14"><div level="2" type="booktitle" n="1">'; } |
---|
438 | if ($line == 66398) { print '<div level="2" type="prooemium" n="1">'; } |
---|
439 | if ($line == 66461) { print '</div></div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inDef = 0; $inPropos = 1; $propos = 0; } |
---|
440 | |
---|
441 | # book 15 |
---|
442 | if ($line == 68896) { print '</div></div></div></div><div level="1" type="book" n="15"><div level="2" type="booktitle" n="1">'; $inDiv3 = 0; } # $inDiv3 = 0: quick'n'dirty solution !!! |
---|
443 | if ($line == 68905) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
444 | |
---|
445 | # book 16 |
---|
446 | if ($line == 70684) { print '</div></div></div></div><div level="1" type="book" n="16"><div level="2" type="booktitle" n="1">'; } |
---|
447 | if ($line == 70705) { print '</div><div level="2" type="bookmain" n="1">'; $inDiv3 = 0; $inPropos = 1; $propos = 0; } |
---|
448 | |
---|
449 | if ($line == 72170) { print '</div></div></div><div level="2" type="addendum" n="1">'; $inPropos = 0; $inDiv3 = 0; } # $inDiv3 = 0: ignore the three addenda |
---|
450 | if ($line == 73209) { print '</div><div level="2" type="addendum" n="2">'; } |
---|
451 | if ($line == 73378) { print '</div><div level="2" type="addendum" n="3">'; } |
---|
452 | |
---|
453 | |
---|
454 | # resolve common abbreviations |
---|
455 | # (problem of &c.: The period may or may not mark the end of a sentence. Heuristics will be: etc. plus capital letter --> <s>) |
---|
456 | |
---|
457 | s!([a-z])propoſ\.!$1 propoſ.!g; # 24 occurrences of propo$. without preceding space, all checked (all other: space, _, beginning of line) |
---|
458 | s!([Aa]d) (propoſ)\.!$1 <reg orig="MURP" type="abbrev">$2itionem</reg>!g; # ad propo$itionem |
---|
459 | s!([Pp]ropoſ)\.!<reg orig="$1." type="abbrev">$1itio</reg>!g; # propo$itio (may be the wrong casus in some cases, e.g. ex propo$.) |
---|
460 | s!MURP!propoſ.!g; |
---|
461 | # does not catch e.g. pro-_ // poſ. |
---|
462 | |
---|
463 | s!([a-z])coroll\.!$1 coroll.!g; # 15 occurrences of coroll. without preceding space, all checked |
---|
464 | s!([Cc]oroll)\.!<reg orig="$1." type="abbrev">$1arium</reg>!g; # (may sometimes be the wrong casus, e.g. ex coroll.) |
---|
465 | |
---|
466 | s!lib\.!<reg orig="lib." type="abbrev">liber</reg>!g; # liber (does that make sense? It may be e.g. "libro".) |
---|
467 | # missing: defin. def. cap. and others (see below) |
---|
468 | |
---|
469 | # resolve ligatures |
---|
470 | s!\\’q;!{que}!g; # -que ligature with preceding \’ |
---|
471 | s!q;!{que}!g; # -que ligature without preceding \’ |
---|
472 | s!{(.+?)}!$1<\!-- {$1} -->!g; |
---|
473 | |
---|
474 | # unknown characters |
---|
475 | s!<007>!i<\!-- 007 -->!g; # <007> (dotless i) is resolved as i<!-- 007 --> |
---|
476 | s!<(\d\d\d)>!<unknown code=\"$1\"/>!g; |
---|
477 | s!<\?>!<unsure/>!g; |
---|
478 | s!<gap>!<gap/>!g; |
---|
479 | |
---|
480 | # figures |
---|
481 | # (must be before the tagging of div's to avoid </div></fig>) |
---|
482 | # (the internal ordering is important for successive figures) |
---|
483 | s!<fig>!<figure/>!g; |
---|
484 | |
---|
485 | # # more elaborate version for texts which have been sent with DESpecs 2 or higher |
---|
486 | # if ($inFig) { |
---|
487 | # if (m!<cap>! || m!<desc>! || m!<var>!) { |
---|
488 | # print "$_\n"; |
---|
489 | # next; |
---|
490 | # } |
---|
491 | # print "</figure>"; |
---|
492 | # $inFig = 0; |
---|
493 | # } |
---|
494 | # if (s!<fig>!<figure>!) { |
---|
495 | # $inFig = 1; |
---|
496 | # print "$_\n"; |
---|
497 | # next; |
---|
498 | # } |
---|
499 | |
---|
500 | |
---|
501 | # handwritten material |
---|
502 | s!<hd>!<hd/>!; |
---|
503 | |
---|
504 | |
---|
505 | # tables |
---|
506 | # (must be before paragraphs) |
---|
507 | # many tables are preceded by <cap> </cap>, which can be one or two lines long; we ignore <cap> for the moment |
---|
508 | |
---|
509 | if (m!</?cap>!) { # all captions are one or two lines long; this code would break with a longer caption! |
---|
510 | print "$_\n"; |
---|
511 | next; |
---|
512 | } |
---|
513 | |
---|
# Tables: <tb> ... </tb> becomes <xhtml:table> ... </xhtml:table>; every line
# in between becomes one row, with "#" as the cell separator.  $inTb is set
# while we are inside a table, $italicsTb while that table was marked "it".
if (s!<tb it>!<tb>!) { $italicsTb = 1; }
if (m!<tb>!) { $inTb = 1; }

if ($inTb) {
    if (m!<tb>(.*)</tb>!) {
        # table that opens and closes on the same line: its content is the only row
        # (without this case the closing tag was never converted and $inTb stayed set,
        # so that all following lines were emitted as table rows)
        my $cells = $1;
        $cells =~ s!#!</xhtml:td><xhtml:td>!g;
        s!<tb>.*</tb>!<xhtml:table><xhtml:tr><xhtml:td>$cells</xhtml:td></xhtml:tr></xhtml:table>!;
    }
    elsif (!s!<(/?)tb>!<$1xhtml:table>!) {
        # a line without table tag is a table row
        $_ = "<xhtml:tr><xhtml:td>$_</xhtml:td></xhtml:tr>";
        s!#!</xhtml:td><xhtml:td>!g;
    }
    if ($italicsTb) { s!<xhtml:table>!<xhtml:table style="italics">!; }
    print "$_\n";
    if (m!</xhtml:table>!) { $inTb = 0; $italicsTb = 0; }
    next;
}
---|
527 | |
---|
528 | |
---|
529 | # mathematical formulas |
---|
530 | |
---|
531 | if (m!<math>!) { $inMath = 1; } |
---|
532 | |
---|
533 | if ($inMath) { |
---|
534 | print "$_\n"; |
---|
535 | if (m!</math>!) { $inMath = 0; } |
---|
536 | next; |
---|
537 | } |
---|
538 | |
---|
539 | |
---|
540 | # page breaks |
---|
541 | |
---|
# Page breaks: replace the raw <pb ...> line by an empty <pb/> element that
# carries the original page number (o), the running head (rhead), the running
# count (n) and a link to the page image.
if (m!<pb!) {
    $pb++;

    my $pagenumber = (m!<pb (.+?)>! ? " o=\"$1\"" : "");
    my $rhead = (m!<rh>(.+?)</rh>! ? " rhead=\"$1\"" : "");
    # the link ends up in an XML attribute value, so its ampersands must be written as &amp;
    my $link = 'http://echo.mpiwg-berlin.mpg.de/ECHOdocuView/ECHOzogiLib?'.
        'url=/mpiwg/online/permanent/library/2QTVUHDT/pageimg&amp;mode=imagepath&amp;pn='.$pb;

    $_ = "<pb$pagenumber$rhead n=\"$pb\" xlink:href='$link'/>";

    # Look ahead one line ($text[$line] is the line after the current one, since
    # $line is 1-based and @text 0-based; there is none after the last line).
    my $nextLine = defined $text[$line] ? $text[$line] : "";
    my $inNumberedItems = ($inDef && !$defOnHold) || $inPost || $inComm;

    # If the next line is the heading of a new definition, postulate, axiom or
    # proposition, the tag is kept in $pbTag and not printed here
    # (presumably it is emitted together with that heading later on).
    $pbTag = "";
    if ($inNumberedItems && $nextLine =~ m!<h>[XVI]+\.</h>!) { $pbTag = $_; }
    if ($inPropos && $nextLine =~ m!<h>.+? PROPOS\. \d+\.</h>!) { $pbTag = $_; }

    print "$_" unless $pbTag ne "";
    print "\n";
    next;
}
---|
562 | |
---|
563 | |
---|
564 | # anchors |
---|
565 | s!<n (.+?)>!<anchor symbol="$1" anchorLocation="given"/>!g; # <n> never occours without anchor |
---|
566 | |
---|
567 | |
---|
568 | # marginal notes |
---|
569 | # (must be before paragraphs) |
---|
570 | if (s!(<mg[lr]) it!$1!) { $italicsMg = 1; } |
---|
571 | if (s!<mgl>!<note place="left">!) { $inMg = 1; } |
---|
572 | if (s!<mgr>!<note place="right">!) { $inMg = 1; } |
---|
573 | if (s!<mgl (.+?)>!<note place="left" anchor="$1">!) { $inMg = 1; } |
---|
574 | if (s!<mgr (.+?)>!<note place="right" anchor="$1">!) { $inMg = 1; } |
---|
575 | if ($italicsMg) { |
---|
576 | s!(<note.*?)>!$1 style="italics">!; |
---|
577 | $italicsMg = 0; |
---|
578 | } |
---|
579 | if ($inMg) { |
---|
580 | if (s!</mg[lr]>!</note>!) { $inMg = 0; } else { $_ .= "<lb/>"; } |
---|
581 | print "$_\n"; |
---|
582 | next; |
---|
583 | } |
---|
584 | |
---|
585 | |
---|
sub resetDiv4 {
    # Forget the type of the current level-4 div and set the running counter
    # of every known level-4 div type back to 0.
    $div4 = "";
    @divTypeCount{qw(
        comment
        euclid
        euclidComment
        pseudoEuclid
        pseudoEuclidComment
        corollarium
        corollariumComment
    )} = (0) x 7;
}
---|
596 | |
---|
sub addDiv4 {
    # Open a new level-4 div of the given type: close the div4 that is still
    # open (if any), remember the new type in $div4, increase the counter of
    # that type and print the opening tag with the new count as n.
    my ($type) = @_;

    print '</div>' if $div4;

    $div4 = $type;
    my $count = ++$divTypeCount{$type};
    print qq{<div type="$type" level="4" n="$count">};
}
---|
603 | |
---|
sub checkForNewDiv4 {
    # Decide whether the current line ($_) starts a new level-4 div and, if so,
    # open it with addDiv4().  Nothing happens outside of a level-3 div, before
    # line $doNotCheckBeforeLine, or on lines without <head> or <p>.
    # Reads $inDiv3, $line, $italicsP, $div4 and @text; may set $doNotCheckBeforeLine.

    return unless $inDiv3;
    return unless $line >= $doNotCheckBeforeLine;
    return unless (m!<head>! || m!<p>!);

    # corollarium
    if (m!<head>COR!) { # <head>COR always triggers a new div4
        addDiv4("corollarium");
        return;
    }

    # headings: Always start a new div4, but ignore upright/italics in the heading and take upright/italics from the next paragraph instead
    if (m!<head>!) {
        # $line is 1-based and @text 0-based, so the scan starts on the line after the heading
        my $nextP = $line;
        while (1) {
            return if $nextP > $#text; # no paragraph left in the text: don't start a new div4
            last if $text[$nextP] =~ m!<p[ >]!;
            # do not look further than the next page break: don't start a new div4 in this (hopefully rare) case
            # (the test has to look at the scanned line; the heading line in $_ never contains a page break)
            return if $text[$nextP] =~ m!<pb!;
            $nextP++;
        }
        $doNotCheckBeforeLine = $nextP;

        # upright: always pseudo-Euclid
        if ($text[$nextP] =~ m!<p>!) {
            addDiv4("pseudoEuclid");
            return;
        }
        # italics: either a comment div on the preceding upright div, or the same comment div type as before
        if ($div4 =~ m![cC]omment!) {
            addDiv4($div4);
            return;
        }
        addDiv4($div4."Comment"); # we assume that there is no "comment", i.e. a comment without preceding upright p
        return;
    }

    # paragraph in italics
    if ($italicsP) {
        if ($div4 eq "") {
            addDiv4("comment"); # this does not exist in Clavius
            return;
        }
        return if $div4 =~ m![cC]omment!; # return if we are already in a comment div

        addDiv4($div4."Comment");
        return;
    }

    # euclid and pseudo-euclid
    return if $div4 =~ m![eE]uclid$!; # return if we are already in a euclid or pseudoEuclid div
    return if $div4 =~ m!corollarium$!; # return if we are already in a corollarium div

    # the first euclid part
    if ($div4 eq "") {
        addDiv4("euclid");
        return;
    }

    # every euclid part apart from the first is called pseudo-euclid
    addDiv4("pseudoEuclid");
    return;
}
---|
665 | |
---|
666 | |
---|
# headings
#
# Raw <h>/<h it> tags become <head> (with style="italics" for "it").
# A heading that matches the pattern of the section we are currently in
# opens a new level-3 div; afterwards checkForNewDiv4 may open a level-4
# div. The line is printed here and the rest of the loop body is skipped.

$italicsH = 1 if s!<h it>!<h>!;
$inH      = 1 if s!<h>!<head>!;

if ($inH) {

    s!</h>!</head>!;

    # headings at level 3
    my $isRomanNumeral = m!<head>[XVI]+\.</head>!;
    my $isFirstProblem = m!<head>PROBLEMA 1\.</head>!;

    # Collect the div3 types this heading opens, in the fixed order
    # definitio, postulatum, axioma, propositio. Each entry pairs the
    # div type with a reference to the counter that numbers it.
    my @div3ToOpen;
    push @div3ToOpen, [ 'definitio',  \$def ]
        if $inDef && !$defOnHold && $isRomanNumeral;
    push @div3ToOpen, [ 'postulatum', \$post ]
        if $inPost && $isRomanNumeral;
    push @div3ToOpen, [ 'axioma',     \$comm ]
        if $inComm && $isRomanNumeral;
    push @div3ToOpen, [ 'propositio', \$propos ]
        if $inPropos
        && ($isFirstProblem || m!<head>.+? PROPOS\. \d+\.</head>!);

    for my $div3 (@div3ToOpen) {
        my ($type, $counter) = @$div3;

        $$counter++;
        print '</div></div>' if $inDiv3;    # close the previous div4 and div3
        print '<div type="' . $type . '" level="3" n="' . $$counter . '">';
        if ($pbTag) {
            # emit a pending page-break tag inside the new div
            print $pbTag;
            $pbTag = "";
        }
        $inDiv3 = 1;
        resetDiv4();
        $doNotCheckBeforeLine = $line + 1;

        # allow for the following PROPOSITIO 1. line
        $doNotCheckBeforeLine++ if $type eq 'propositio' && $isFirstProblem;
    }

    checkForNewDiv4();

    # end each line with <lb/>, except directly after the closing tag
    $_ .= "<lb/>";
    s!</head><lb/>!</head>!;

    # italics
    s!<head>!<head style="italics">! if $italicsH;

    # print all headings
    print "$_\n";

    # the heading (and its italics state) ends with the closing tag
    if (m!</head>!) {
        $inH      = 0;
        $italicsH = 0;
    }
    next;
}
---|
738 | |
---|
739 | |
---|
# paragraphs
#
# Raw <p>/<p it> paragraphs are split into sentences (<s>...</s>), get
# style="italics" where flagged, and every line is closed with <lb/>.
# Punctuation that must NOT end a sentence is temporarily replaced by the
# placeholders MURP (period) and HUBA (semicolon) and restored afterwards.
# The line is printed here and the rest of the loop body is skipped.

if (s!<p it>!<p>!) { $italicsP = 1; }
if (m!<p>!) { $inP = 1; }

if ($inP) {

    checkForNewDiv4();

    # normalize the periods: there should always be a period before </p>
    s!([^.])</p>!$1\.</p>!g;


    # tagging sentences
    # (A period . may or may not indicate a new sentence; see "handle exceptions".)
    # (A colon : always indicates a new sentence.)
    # (A semicolon ; indicates a new sentence, probably unless there seems to be a variable before it.)

    # - handle exceptions
    # period
    s!\."!MURP"!g;                        # avoid artifact sentences in xml attributes, especially resolved abbreviations
    s!(\d+)\.!$1MURP!g;                   # assumption: no new sentence after an arabic number
    s!([a-zA-Z])\. (\d+)!$1MURP $2!g;     # assumption: no new sentence before an arabic number
    s!}\.!}MURP!g;                        # no new sentence after fractions
    s!<p>([IVXLC]+)\.!<p>$1MURP!;         # roman number at the beginning of the paragraph

    s! defin\.! definMURP!g;              # some ad hoc MURPing; better properly resolve these abbreviations?
    s! def\.! defMURP!g;
    s! cap\.! capMURP!g;
    s! Rep\.! RepMURP!g;
    s! quib\.! quibMURP!g;
    s! duob\.! duobMURP!g;
    # The period must be escaped in the next two rules: an unescaped "."
    # matches any character and would turn e.g. " proportionibus" into
    # " proportionib.s".
    s! proportionib\.! proportionibMURP!g;
    s! verticib\.! verticibMURP!g;
    s! ſchol\.! ſcholMURP!g;
    s! Regiom\.! RegiomMURP!g;            # Regiomontanus

    s!li<\!-- 007 -->b\.!li<\!-- 007 -->bMURP!g;
    s! ([A-Z])\. ([^A-Z])! $1MURP $2!g;   # catches, among others, " R. q."
    if (m! [A-Z]\.$!) {
        # same rule at line end: look at the beginning of the following line
        if ($text[$line] =~ m!^[^A-Z]!) { s!( [A-Z])\.$!$1MURP!; }
    }

    # &c.
    s!& c\.!&c.!g;                        # normalize &c.
    s!&c\. ([^A-Z])!&cMURP $1!g;          # problem of &c., see above.
    if (m!&c\.$!) {                       # &c. at line end
        if ($text[$line] =~ m!^[^A-Z]!) { s!&c\.$!&cMURP!; }
    }
    # (This rule will inevitably produce some incorrect tags, e.g. "&c. _Ad" in lines 2802, 3414.)

    # semicolon
    s!q;!qHUBA!g;                         # -que ligature
    # s!( [a-zA-Z]);!$1HUBA!g;            # variables before the ;
    s!( [a-zA-Z]); ([a-zA-Z] )!$1HUBA $2!g;   # semicolon between two apparent variables
    if (m! [a-zA-Z];$!) {
        # same rule at line end. The substitution is anchored with $ so that
        # it protects the semicolon at the line end (the one that was just
        # tested) and not an earlier " x;" in the same line.
        if ($text[$line] =~ m!^[a-zA-Z] !) { s!( [a-zA-Z]);$!$1HUBA!; }
    }
    # NOTE(review): as written, this appends the semicolon placeholder to
    # every "&", i.e. "&" ends up as "&;" in the output. Possibly the rule
    # was meant to protect the semicolon of "&amp;" -- confirm against the
    # raw text before changing it.
    s!&!&HUBA!g;                          # &


    # - other preparations
    # move sentence-ending punctuation out of small caps
    s!([.:;])</sc>!</sc>$1!g;


    # - main
    s!<p>!<p><s>!;
    # the previous line ended with a sentence boundary: open the sentence here
    if ($lineStartsWithS) { $_ = "<s>$_"; $lineStartsWithS = 0; }
    s![.:;]!$&</s><s>!g;
    # a dangling <s> at the line end is moved to the start of the next line
    if (s!<s>$!!) { $lineStartsWithS = 1; }

    # - corrections
    s!([.:;])</s><s>_!$1_</s><s>!g;       # (the underscore _ should be made into italics!)

    # - change back non-sentence-ending punctuation marks
    s!MURP!.!g;                           # period
    s!HUBA!;!g;                           # semicolon

    # - final corrections
    s!([.:;])</p>!$1</s></p>!;
    s!<s></p>!</p>!;
    # s!<s> !<s>!g;

    # mark italics
    # (must be after the sentence tagging)
    if ($italicsP) { s!<p([ >])!<p style="italics"$1!; }   # however, in Clavius there are no attributes other than it

    # end each line with <lb/>, unless the following line is a <tb> or <pb
    $_ .= "<lb/>" unless (($line < $#text) && (($text[$line] =~ m!<tb>!) || ($text[$line] =~ m!<pb!)));
    s!</p><lb/>!</p>!;
    s!<s><lb/>!<lb/><s>!g;

    # corrections after a first discussion
    s!</s><s>\)!)</s><s>!g;
    s!</s><s> ! </s><s>!g;                # mind the line break??

    print "$_\n";

    # the paragraph (and its italics state) ends with the closing tag
    if (m!</p>!) { $inP = 0; $italicsP = 0; }
    next;
}
---|
840 | |
---|
841 | |
---|
842 | # everything else |
---|
843 | |
---|
844 | print "$_\n"; |
---|
845 | } |
---|
846 | |
---|
847 | |
---|
848 | print "</div></div></body></text></echo>\n"; |
---|