1 | #!/usr/bin/perl -w |
---|
2 | use strict; |
---|
3 | use warnings; |
---|
4 | use utf8; |
---|
5 | binmode STDOUT, ':utf8'; |
---|
6 | use integer; |
---|
7 | |
---|
8 | |
---|
# Euclid_1966.pl
#
# author: Wolfgang Schmidle
# (c) Max Planck Institute for the History of Science, Berlin, Germany
# version 1.3, 2009-04-07
#
# This script processes the raw text of Euclid (1966), i.e. it turns (a local copy of)
# http://pythia.mpiwg-berlin.mpg.de/department1/mpdl/raw-texts/WO3_Euclid_1966.txt/WO3_Euclid_1966_V1.txt
# into a valid XML file.
#
# For Euclid (1966) see also
# https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/mpdl-project-content/wiki/WO3_Euclid_1966
#
# Usage: perl Euclid_1966.pl Euclid_1966.txt > Euclid_1966.xml
23 | |
---|
24 | |
---|
# Some remaining issues:
#
# - Replacing <001> by the correct character works fine, but <002> is in a higher plane of Unicode and kills off oXygen,
#   so I have used the simpler standard version of this character.
# - Some <?> have been post-processed already, and I have removed the respective <?> tags because a <?> tag
#   has no value in itself once the line has been checked. I have compiled a list of lines which contain <?> and/or @
#   and haven't been post-processed yet.
#
# - The <desc> and <var> in figures have not been used very consistently by Formax. (They did not use <cap>
#   at all, but I only know of one figure where it would make sense to use it.)
# - Figures and notes have no place attribute.
# - No <num>, <var> (outside of figures), <ptr>, and no IDs yet.
#
# - The parts of problems with more than one part have not been encoded yet.
# - There are four books, i.e. four titles: one in the front (attribute n=1) and three in the body (n=1, 2, 3).
40 | |
---|
41 | |
---|
# Path of the raw text file, taken from the first command-line argument.
# Without an argument there is nothing to process, so stop with a usage
# message instead of trying to open an undefined file name later on.
my $rawtext = shift @ARGV;
defined $rawtext
    or die "Usage: perl Euclid_1966.pl Euclid_1966.txt > Euclid_1966.xml\n";
43 | |
---|
44 | |
---|
45 | # chinese into arabic numbers |
---|
46 | |
---|
# Character class matching one Chinese digit from 1 to 9.
my $digit = "[一二三四五六七八九]";

# Numeric value of each Chinese digit.
my %arabic;
@arabic{ qw(一 二 三 四 五 六 七 八 九) } = ( 1 .. 9 );

# chinese2arabic($chinese)
#
# Converts a traditional Chinese numeral (digits plus the character for ten)
# into an Arabic number. The patterns are tried from the most specific to the
# least specific and are not anchored, so the first pattern found anywhere in
# the argument decides the result.
#
# Returns the number, or the string "???" if the argument contains neither a
# digit nor the character for ten.
sub chinese2arabic {
    my ($chinese) = @_;

    # tens and units, e.g. 二十三
    return 10 * $arabic{$1} + $arabic{$2} if $chinese =~ m!($digit)十($digit)!;

    # a multiple of ten, e.g. 二十
    return 10 * $arabic{$1} if $chinese =~ m!($digit)十!;

    # eleven to nineteen, e.g. 十三
    return 10 + $arabic{$1} if $chinese =~ m!十($digit)!;

    # ten itself
    return 10 if $chinese =~ m!十!;

    # a single digit
    return $arabic{$1} if $chinese =~ m!($digit)!;

    return "???";
}
59 | |
---|
# Character class matching one digit of a modern (positional) Chinese number,
# including the ideographic zero.
my $moderndigit = "[一二三四五六七八九〇]";

# Numeric value of each modern Chinese digit.
my %modernarabic;
@modernarabic{ qw(一 二 三 四 五 六 七 八 九 〇) } = ( 1 .. 9, 0 );

# modernchinese2arabic($numeral)
#
# Converts a positional Chinese number such as 一〇二 into an Arabic number by
# reading it digit by digit from left to right.
#
# Returns the number; the empty string yields 0.
sub modernchinese2arabic {
    my ($numeral) = @_;

    my $value = 0;
    for my $char ( split //, $numeral ) {
        $value = 10 * $value + $modernarabic{$char};
    }
    return $value;
}
69 | |
---|
70 | |
---|
71 | # text-specific preparations |
---|
72 | |
---|
# Regex fragment for the numbers from 1 to 99 (there is no higher number in
# any heading). It contains a top-level alternation, so it must always be
# used inside a group, e.g. ($number).
my $number = "$digit|$digit?十$digit?";

# Ordinal prefix that introduces a numbered heading.
my $nth = "第";

# Character class matching the type character of a lowest-level heading.
# Inside a character class "|" is a literal member, not an alternation, so
# the characters are listed without separators; otherwise a "|" in the text
# would be accepted as a heading type.
my $headingType = "[界求論題]";

# English name of each heading type, used for the typeE attribute.
my %englishHeadingType = (
    界 => "definition",
    求 => "construction",
    論 => "axiom",
    題 => "problem",
);
78 | |
---|
79 | |
---|
80 | # global variables |
---|
81 | |
---|
# The raw text, one element per line.
my @text;

# counters
my $line;                                     # number of the current raw-text line (set to 0 before each pass)
my ( $pb, $note ) = ( 0, 0 );                 # $pb counts the page breaks seen so far
my %div = map { $_ => 0 } qw(界 求 論 題);    # lowest-level divs seen so far, per heading type

# booleans (all initially false)
my ( $inParagraph, $lineStartsWithS, $inDiv, $inFig, $inTb ) = (0) x 5;
96 | |
---|
97 | |
---|
98 | # read in the raw text |
---|
99 | |
---|
# Read the raw text into @text, one element per line, with the line end
# removed. Both Windows (CRLF) and Unix (LF) line ends are stripped: all
# later processing prints "$_\n" and appends markup to the end of $_, so a
# newline left inside a line would end up in the middle of the output.
open( my $rawtext_fh, "<:utf8", $rawtext )
    or die "Can't open the raw text file \"$rawtext\": $!\n";
while ( my $rawline = <$rawtext_fh> ) {
    $rawline =~ s!\r?\n\z!!;
    push @text, $rawline;
}
close($rawtext_fh);
107 | |
---|
108 | |
---|
109 | # emendations of the raw text |
---|
110 | # (see also the reduplicated ECHO pages in the processing of page breaks) |
---|
111 | |
---|
# First pass: emend the raw text in place, line by line. $_ is an alias for
# the current element of @text, i.e. for $text[$line-1], so $text[$line] is
# the following line. The "line NNN" remarks refer to lines of the raw text.
$line = 0;

for (@text) {

    $line++;

    # normalize the zero in modern page numbers
    s!○!〇!g;    # white circle U+25CB --> ideographic number zero U+3007

    # ignore outdentation in the preface
    if ( $line < 153 ) { s!<p x>!<p>!; }

    # pre-process notes that continue on the next line: a note that ends this
    # line and a note that starts the next line (skipping one page break) are
    # joined by removing the closing and the opening tag. The look-ahead is
    # guarded so that it never reads past the end of @text.
    if (m!</sm>$!) {
        my $counterpart = $line;
        if ( $counterpart <= $#text && $text[$counterpart] =~ m!^<pb! ) { $counterpart++; }
        if ( $counterpart <= $#text && $text[$counterpart] =~ m!^<sm>! ) {
            s!</sm>$!!;
            $text[$counterpart] =~ s!^<sm>!!;
        }
    }

    # fill in the unknown characters (MSi)
    s!<001>!轂!g;    # s!<001>!<unknown code="001" unicode="8F42">轂</unknown>!g;
    s!<002>!<unknown code="002" unicode="2F88D">庶</unknown>!g;    # the actual character U+2F88D breaks oXygen

    # clarify <?> (the list is not complete!)
    # The fullwidth question mark is written as an escape: an unescaped ASCII
    # "?" in the pattern would be a quantifier on "<" and the substitution
    # would rewrite the first ">" of every line.
    s!<\x{FF1F}>!<?>!;             # line 811: fullwidth question mark U+FF1F --> ASCII question mark U+003F
    s!餘<\?>二倣此。!餘二倣此。!;    # line 811
    s!愈<\?>!愈!g;                  # MSi: the reading is correct
    s!。<\?>!。!;                   # lines 1041, 1087, 2104, 3855: all unclear periods are plausible
    # (e.g. line 1041: MSi: It is in the middle of a sentence, but a period at this position is quite common nonetheless.)
    s!有兩種幾何<\?>。!有兩種幾何。!;    # line 2208
    s!其元<\?>大者!其元大者!;            # line 2208 again

    # replace the @
    s!\@增至於無窮。!遞增至於無窮。!;    # line 2208 again: U+905E
    s!\@減至於無窮。!遞減至於無窮。!;    # line 2208 again: U+905E
    s!<p>\@!<p><gap/>!;    # lines 3642, 3667, 3815: missing lines of text (last lines on pages 278, 280 and 288)

    # missing line breaks (the list is not complete!)
    s!小於兩直角。則此二橫直線。!小於兩直角。則此二橫<lb/>直線。!;    # line 403; may have to do with the neighboring figure
    s!俱小於直角。或幷之小於兩直角。!俱小於直角。或幷之小<lb/>於兩直角。!;    # line 404

    # normalize the hash in the table
    # (written as an escape so that the pattern cannot be confused with the ASCII hash)
    s!\x{FF03}!#!g;    # fullwidth number sign U+FF03 --> ASCII hash, i.e. number sign U+0023

    # move the only table in the text (ECHO p.327) out of its surrounding sentence
    s!却云十六與十二之比例。若!却云十六與十二之比例。!;        # line 4562
    s!八與三、及二與四之比例。!若<lb/>八與三、及二與四之比例。!;    # line 4573

    # misc. emendations
    s!N12<114608657010!N12x114608657010!;    # line 5: replace "<" in library stamp junk
    s!<pb 六><h>幾何原本 卷一之首</h>!<pb 六><rh>幾何原本 卷一之首</rh>!;    # line 245 (obvious mistake)
    s!<h>後支前己正論</h>!<p>後支前己正論</p>!;    # line 2175 (Tian Miao: wrong tag)
    if ( $line == 2992 ) { s!<h>第三十四題</h>!<h>第十四題</h>!; }    # line 2992 (obvious mistake)

}
170 | |
---|
171 | |
---|
172 | # process the raw text |
---|
173 | |
---|
174 | # The metadata add 31 lines to the text. Apart from these prefix lines, the line structure of the original is preserved. |
---|
175 | |
---|
# Print the XML declaration, the root element with its namespace
# declarations, the metadata section and the opening <text> tag.
# These are 31 output lines, each terminated by a newline.
{
    my @metadata_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<echo xmlns="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"',
        'xmlns:dc="http://purl.org/dc/elements/1.1/"',
        'xmlns:dcq="http://purl.org/dc/qualifiers/1.0/"',
        'xmlns:dct="http://purl.org/dc/terms/1.0/"',
        'xmlns:echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"',
        'xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"',
        'xmlns:xhtml="http://www.w3.org/1999/xhtml"',
        'xmlns:xlink="http://www.w3.org/1999/xlink"',
        'xmlns:xml="http://www.w3.org/XML/1998/namespace">',
        '<metadata>',
        '<dc:creator>Euclid</dc:creator>',
        '<dc:title xml:lang="zh">Jihe yuanben</dc:title>',
        '<dc:title xml:lang="zh">幾何原本</dc:title>',
        '<dc:date>',
        '<rdf:Description>',
        '<dcq:dateScheme>ISO 8601</dcq:dateScheme>',
        '<rdf:value>1966</rdf:value>',
        '</rdf:Description>',
        '</dc:date>',
        '<dc:language>',
        '<rdf:Description>',
        '<dcq:languageScheme>ISO 639-2</dcq:languageScheme>',
        '<rdf:value>zho-Hant</rdf:value>',
        '</rdf:Description>',
        '</dc:language>',
        '<dc:identifier>test.xml</dc:identifier>',
        '<dc:rights>open access</dc:rights>',
        '<dct:license>http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration</dct:license>',
        '</metadata>',
        '<text xml:lang="zh">',
    );
    print map { "$_\n" } @metadata_lines;
}
209 | |
---|
210 | |
---|
# beginDiv()
#
# Resets the state of the lowest-level div's at the start of a chapter head
# or chapter main: no lowest-level div is open, and the counter of every
# heading type starts again at 0.
sub beginDiv {
    $inDiv = 0;
    $div{$_} = 0 for keys %div;
    return;
}

# Second pass: turn every line of the emended raw text into XML and print it.
# Each section below either handles the line completely (and moves on with
# "next") or leaves it to the following sections, so their order matters.
$line = 0;

for (@text) {

    $line++;

    # unknown characters
    # (must be before figures so that a <?> in a <var> will be processed)
    s!<\?>!<unsure/>!g;


    # figures
    # (must be before the tagging of div's to avoid </div></fig>)
    # (the internal ordering is important for successive figures)

    if ($inFig) {
        # <cap>, <desc> and <var> lines still belong to the open figure
        if ( m!<cap>! || m!<desc>! || m!<var>! ) {
            print "$_\n";
            next;
        }
        # any other line ends the figure and is then processed normally
        print "</figure>";
        $inFig = 0;
    }
    if (s!<fig>!<figure>!) {    # !<figure place="text">!
        $inFig = 1;
        print "$_\n";
        next;
    }


    # the table on ECHO p.327
    # (must be before paragraphs)

    if (m!<tb>!) { $inTb = 1; }

    if ($inTb) {
        # the <tb> and </tb> lines become the table tags; every line in
        # between becomes one row whose cells are separated by "#"
        unless (s!<(/?)tb>!<${1}xhtml:table>!) {
            $_ = "<xhtml:tr><xhtml:td>$_</xhtml:td></xhtml:tr>";
            s!#!</xhtml:td><xhtml:td>!g;
        }
        print "$_\n";
        if (m!</xhtml:table>!) { $inTb = 0; }
        next;
    }


    # ad hoc tagging of book covers, preface, chapters, chapter heads, chapter mains, backmatter
    # including some additional lowest-level closing div's
    # (the tags are printed in front of the line with the given number)

    if ( $line == 1 )    { print '<front><div level="1" type="title" n="1">'; }    # 1 - 32
    if ( $line == 33 )   { print '</div>'; }

    if ( $line == 33 )   { print '<div level="1" type="preface" n="1">'; }    # 33 - 152
    if ( $line == 153 )  { print '</div></front>'; }

    if ( $line == 153 )  { print '<body><div level="1" type="chapter" n="1"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 452 )  { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 1364 ) { print '</div></div></div>'; }

    if ( $line == 1364 ) { print '<div type="title" n="1">'; }    # 1364 - 1385
    if ( $line == 1386 ) { print '</div>'; }

    if ( $line == 1386 ) { print '<div level="1" type="chapter" n="2"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 1416 ) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 1753 ) { print '</div></div></div>'; }

    if ( $line == 1753 ) { print '<div level="1" type="chapter" n="3"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 1848 ) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 2722 ) { print '</div></div></div>'; }

    if ( $line == 2722 ) { print '<div type="title" n="2">'; }    # 2722 - 2744
    if ( $line == 2745 ) { print '</div>'; }

    if ( $line == 2745 ) { print '<div level="1" type="chapter" n="4"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 2787 ) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 3109 ) { print '</div></div></div>'; }

    if ( $line == 3109 ) { print '<div level="1" type="chapter" n="5"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 3520 ) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 4387 ) { print '</div></div></div>'; }

    if ( $line == 4387 ) { print '<div type="title" n="3">'; }    # 4387 - 4392
    if ( $line == 4393 ) { print '</div>'; }

    if ( $line == 4393 ) { print '<div level="1" type="chapter" n="6"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 4594 ) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 5984 ) { print '</div></div></div></body>'; }

    if ( $line == 5984 ) { print '<back><div type="backmatter">'; }    # 5984 - end; </div></back> is printed together with </echo> after the loop.


    # page breaks

    if (m!<pb!) {
        $pb++;

        # # The ECHO pages 215 to 220 reduplicate the pages 209 to 214 and have been typed only once by Formax
        # if ($pb == 215) { $pb = 221; }

        # original page number (group 2) and running head (group 4); both
        # are optional and stay undefined if absent or if the line does not
        # have the expected form
        my ( $o, $rhead ) = ( m!^<pb( ($moderndigit+))?>(<rh>(.+)</rh>)? *$! )[ 1, 3 ];
        my $pagenumber  = ( $o     ? " o=\"$o\" oE=\"" . modernchinese2arabic($o) . "\"" : "" );
        my $runninghead = ( $rhead ? " rhead=\"$rhead\"" : "" );

        # the link ends up in an XML attribute value, so "&" is written as "&amp;"
        my $link = 'http://echo.mpiwg-berlin.mpg.de/ECHOdocuView/ECHOzogiLib?'
                 . 'url=/mpiwg/online/permanent/library/02NT95YF/pageimg&amp;mode=imagepath&amp;pn=' . $pb;
        $_ = "<pb$pagenumber$runninghead n=\"$pb\" xlink:href='$link'/>";
        print "$_\n";
        next;
    }


    # headings

    if (m!<h>!) {

        s!<h>!<head>!;
        s!</h>!</head>!;

        # notes in headings
        s! ?\\\\ ?!<lb type="halfline"/>!g;
        s!<sm>(.+)</sm>!<note>$1</note>!;

        # headings at the lowest level
        if (m!<head>$nth($number)($headingType).*?</head>$!) {
            # save both captures before any further matching is done
            my ( $o, $type ) = ( $1, $2 );
            my $oE    = chinese2arabic($o);
            my $typeE = $englishHeadingType{$type};
            s! (.+?)</head>!<note>$1</note></head>!;

            # the headings of each type must be numbered consecutively
            $div{$type}++;
            if ( $div{$type} != $oE ) { die "$line: The numbering of the div's is incorrect!\n"; }

            if ($inDiv) { print "</div>"; }
            print '<div type="' . $type . '" typeE="' . $typeE . '" o="' . $o . '" level="3" n="' . $div{$type} . '">';
            $inDiv = 1;
        }

        # all headings
        print "$_\n";
        next;
    }


    # paragraphs
    # (<p> is always at the beginning, </p> always at the end of a line)
    # inside a paragraph: <pb ...>, <rh>; <sm>; <fig>, <desc>, <var>; 1x <tb>; 3x <002>

    my $outdented = s!<p x>!<p>!;

    if (m!<p>!) { $inParagraph = 1; }

    if ($inParagraph) {

        # normalize the periods: there should always be a period before </p> and </sm>
        # (if the period is missing, insert an ASCII period)
        s!([^。])</p>!$1\.</p>!g;
        s!</sm>\.</p>!</sm></p>!;
        s!([^。])<sm>!$1\.<sm>!g;
        s!^\.<sm>!<sm>!;
        s!<p>\.<sm>!<p><sm>!;
        s!([^。])</sm>!$1\.</sm>!g;

        # turn small text into notes
        s!<sm>!<note>!g;    # <sm> and </sm> need not be in the same line
        s!</sm>!</note>!g;
        s! ?\\\\ ?!<lb type="halfline"/>!g;

        # tag sentences: every period closes a sentence and opens the next
        # one; an <s> left over at the end of the line is moved to the
        # beginning of the next line via $lineStartsWithS
        s!<p>!<p><s>!;
        if ($lineStartsWithS) { $_ = "<s>$_"; $lineStartsWithS = 0; }
        s![。.]!$&</s><s>!g;
        if (s!<s>$!!) { $lineStartsWithS = 1; }
        s!<s><note>!<note><s>!g;    # ???
        s!<s></note>!</note><s>!g;
        s!<s><lb type="halfline"/>!<lb type="halfline"/><s>!g;
        s!<s></p>!</p>!;

        # end each line with <lb/>, except directly before the table or a page break
        # ($text[$line] is the following line; it exists as long as $line <= $#text)
        my $nextline = ( $line <= $#text ? $text[$line] : "" );
        $_ .= "<lb/>" unless ( $nextline =~ m!<tb>! || $nextline =~ m!<pb! );
        s!</p><lb/>!</p>!;
        s!<s><lb/>!<lb/><s>!g;

        # outdented paragraphs
        if ($outdented) { s!<p>!<p outdented="1">!; }

        print "$_\n";

        if (m!</p>!) { $inParagraph = 0; }
        next;
    }


    # everything else

    print "$_\n";
}
411 | |
---|
412 | |
---|
# Close the backmatter div, then the back matter, the text and the root element.
print "</div>" . "</back></text></echo>\n";
415 | |
---|
416 | |
---|