Context Navigation

Back to WO3_Euclid_1966

WO3_Euclid_1966: Euclid_1966.pl

File Euclid_1966.pl, 13.4 KB (added by Wolfgang Schmidle, 15 years ago)

Line
#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
binmode STDOUT, ':utf8';
use integer;


# Euclid_1966.pl
#
# author: Wolfgang Schmidle
# (c) Max Planck Institute for the History of Science, Berlin, Germany
# version 1.3, 2009-04-07
#
# This script processes the raw text of Euclid (1966), i.e. it turns (a local copy of)
# http://pythia.mpiwg-berlin.mpg.de/department1/mpdl/raw-texts/WO3_Euclid_1966.txt/WO3_Euclid_1966_V1.txt
# into a valid XML file.
#
# For Euclid (1966) see also
# https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/mpdl-project-content/wiki/WO3_Euclid_1966
#
# Usage: perl Euclid_1966.pl WO3_Euclid_1966_V1.txt > Euclid_1966.xml
# (the only argument is the path of the raw text file; the XML is written to STDOUT)


# Some remaining issues:
#
# - Replacing <001> by the correct character works fine, but <002> is in a higher plane of Unicode and kills off oXygen,
# so I have used the simpler standard version of this character.
# - Some <?> have been post-processed already, and I have removed the respective <?> tags because a <?> tag
# has no value in itself once the line has been checked. I have compiled a list of lines which contain <?> and/or @
# and haven't been post-processed yet.
#
# - The <desc> and <var> in figures have not been used very consistently by Formax. (They did not use <cap>
# at all, but I only know of one figure where it would make sense to use it.)
# - Figures and notes have no place attribute.
# - No <num>, <var> (outside of figures), <ptr>, no IDs yet.
#
# - The parts of problems with more than one part have not been encoded yet.
# - Four books, i.e. four titles, one in the front (attribute n=1) and three in the body (n=1, 2, 3).


# Path of the raw text file, taken from the first command line argument.
# Without an argument there is nothing to process, so stop with a usage hint
# instead of failing later with an "uninitialized value" warning in open().
my $rawtext = shift @ARGV;
defined $rawtext or die "Usage: perl Euclid_1966.pl <raw text file> > Euclid_1966.xml\n";

# Chinese into Arabic numbers

# Character class of the Chinese digits 1 to 9, and the value of each digit.
my $digit = "[一二三四五六七八九]";
my %arabic = ( 一 => 1, 二 => 2, 三 => 3, 四 => 4, 五 => 5, 六 => 6, 七 => 7, 八 => 8, 九 => 9 );

# chinese2arabic($numeral)
#
# Turns a traditional Chinese numeral built from the digits 1-9 and 十 (ten),
# e.g. 二十三, 四十, 十七, 十 or 九, into its numeric value.
# The patterns are not anchored and are tried from the most specific to the
# least specific one; the first pattern found anywhere in $numeral decides.
# Returns the string "???" if $numeral contains neither a digit nor 十.
sub chinese2arabic {
    my ($numeral) = @_;

    return 10 * $arabic{$1} + $arabic{$2} if $numeral =~ m!($digit)十($digit)!;   # tens and ones
    return 10 * $arabic{$1}               if $numeral =~ m!($digit)十!;           # whole tens
    return 10 + $arabic{$1}               if $numeral =~ m!十($digit)!;           # 11 to 19
    return 10                             if $numeral =~ m!十!;                   # ten itself
    return $arabic{$1}                    if $numeral =~ m!($digit)!;             # single digit

    return "???";
}

# Character class of the digits used in positional ("modern") Chinese numbers,
# including the zero 〇, and the value of each digit.
my $moderndigit = "[一二三四五六七八九〇]";
my %modernarabic = ( 一 => 1, 二 => 2, 三 => 3, 四 => 4, 五 => 5, 六 => 6, 七 => 7, 八 => 8, 九 => 9, 〇 => 0 );

# modernchinese2arabic($numeral)
#
# Reads $numeral as a positional decimal number, one digit per character
# (e.g. 一〇三 gives 103), and returns its value. The empty string gives 0.
sub modernchinese2arabic {
    my ($numeral) = @_;

    my $value = 0;
    for my $glyph (split //, $numeral) {
        # shift the digits read so far one decimal place to the left and append this one
        $value = 10 * $value + $modernarabic{$glyph};
    }
    return $value;
}


# text-specific preparations

# Regex source for the numbers from 1 to 99 (there is no higher number in any heading).
# It is interpolated inside a capture group, so the alternation stays contained.
my $number = "$digit|$digit?十$digit?";
my $nth = "第";

# The last character of a lowest-level heading names its type.
# This is a character class, so the characters are listed without "|":
# with "|" inside the brackets a literal "|" would be accepted as a heading type.
my $headingType = "[界求論題]";
my %englishHeadingType = ( 界 => "definition", 求 => "construction", 論 => "axiom", 題 => "problem" );


# global variables

# the lines of the raw text, without line ends
my @text;

# counters
my $line;                                          # 1-based number of the line being processed
my $pb = 0;                                        # number of page breaks seen so far
my %div = ( 界 => 0, 求 => 0, 論 => 0, 題 => 0);   # running number of the lowest-level div's, per heading type
my $note = 0;

# booleans
my $inParagraph = 0;       # between <p> and </p>
my $lineStartsWithS = 0;   # the previous line ended with a sentence end, so the next line opens a new <s>
my $inDiv = 0;             # a lowest-level div is open
my $inFig = 0;             # a <figure> is open
my $inTb = 0;              # inside the table


# read in the raw text
# (every line is stored in @text without its line end)

open(my $rawtext_fh, "<:utf8", $rawtext) or die "Can't open the raw text file \"$rawtext\": $!\n";
while (my $rawline = <$rawtext_fh>) {

    # Remove the line end. The raw text has Windows line ends (CRLF), but a copy
    # with Unix line ends (LF) must be stripped as well: the output code prints
    # its own "\n" after every line and would otherwise double the newlines.
    $rawline =~ s!\r?\n\z!!;
    push @text, $rawline;
}
close($rawtext_fh);


# emendations of the raw text
# (see also the reduplicated ECHO pages in the processing of page breaks)
#
# This pass only edits @text in place; nothing is printed.
# $line is the 1-based number of the current line, so the current line is
# $text[$line-1] (aliased by $_) and the following line is $text[$line].

$line = 0;

for (@text) {

    $line++;

    # normalize the zero in modern page numbers
    s!○!〇!g; # white circle U+25CB --> ideographic number zero U+3007

    # ignore outdentation in the preface
    if ($line < 153) { s!<p x>!<p>!; }

    # pre-process notes that continue on the next line:
    # if this line ends with </sm> and the next line (skipping one page break line)
    # starts with <sm>, both tags are removed so that the note becomes a single one.
    # The index checks keep the look-ahead from reading past the end of the text.
    if (m!</sm>$!) {
        my $counterpart = $line;
        if ($counterpart <= $#text && $text[$counterpart] =~ m!^<pb!) { $counterpart++; }
        if ($counterpart <= $#text && $text[$counterpart] =~ m!^<sm>!) {
            s!</sm>$!!;
            $text[$counterpart] =~ s!^<sm>!!;
        }
    }

    # fill in the unknown characters (MSi)
    s!<001>!轂!g; # s!<001>!<unknown code="001" unicode="8F42">轂</unknown>!g;
    s!<002>!<unknown code="002" unicode="2F88D">庶</unknown>!g; # the actual Unicode character 庶 breaks oXygen

    # clarify <?> (the list is not complete!)
    s!<？>!<?>!; # line 811: fullwidth question mark U+FF1F --> ASCII question mark U+003F
    s!餘<\?>二倣此。!餘二倣此。!; # line 811
    s!愈<\?>!愈!g; # MSi: the reading is correct
    s!。<\?>!。!; # lines 1041, 1087, 2104, 3855: all unclear periods are plausible
    # (e.g. line 1041: MSi: It is in the middle of a sentence, but a period at this position is quite common nonetheless.)
    s!有兩種幾何<\?>。!有兩種幾何。!; # line 2208
    s!其元<\?>大者!其元大者!; # line 2208 again

    # replace the @
    s!\@增至於無窮。!遞增至於無窮。!; # line 2208 again: U+905E
    s!\@減至於無窮。!遞減至於無窮。!; # line 2208 again: U+905E
    s!<p>\@!<p><gap/>!; # lines 3642, 3667, 3815: missing lines of text (last lines on pages 278, 280 and 288)

    # missing line breaks (the list is not complete!)
    s!小於兩直角。則此二橫直線。!小於兩直角。則此二橫<lb/>直線。!; # line 403; may have to do with the neighboring figure
    s!俱小於直角。或幷之小於兩直角。!俱小於直角。或幷之小<lb/>於兩直角。!; # line 404

    # normalize the hash in the table
    s!＃!#!g; # fullwidth number sign U+FF03 --> ASCII hash, i.e. number sign U+0023

    # move the only table in the text (ECHO p.327) out of its surrounding sentence
    s!却云十六與十二之比例。若!却云十六與十二之比例。!; # line 4562
    s!八與三、及二與四之比例。!若<lb/>八與三、及二與四之比例。!; # line 4573

    # misc. emendations
    s!N12<114608657010!N12x114608657010!; # line 5: replace "<" in library stamp junk
    s!<pb 六><h>幾何原本　卷一之首</h>!<pb 六><rh>幾何原本　卷一之首</rh>!; # line 245 (obvious mistake)
    s!<h>後支前己正論</h>!<p>後支前己正論</p>!; # line 2175 (Tian Miao: wrong tag)
    if ($line == 2992) { s!<h>第三十四題</h>!<h>第十四題</h>!; } # line 2992 (obvious mistake)

}


# process the raw text

# The output starts with the XML declaration, the root element and the metadata.
# These prefix lines add 31 lines to the text. Apart from them, the line structure of the original is preserved.

my $metadata = <<'METADATA';
<?xml version="1.0" encoding="UTF-8"?>
<echo xmlns="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcq="http://purl.org/dc/qualifiers/1.0/"
xmlns:dct="http://purl.org/dc/terms/1.0/"
xmlns:echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xhtml="http://www.w3.org/1999/xhtml"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:xml="http://www.w3.org/XML/1998/namespace">
<metadata>
<dc:creator>Euclid</dc:creator>
<dc:title xml:lang="zh">Jihe yuanben</dc:title>
<dc:title xml:lang="zh">幾何原本</dc:title>
<dc:date>
<rdf:Description>
<dcq:dateScheme>ISO 8601</dcq:dateScheme>
<rdf:value>1966</rdf:value>
</rdf:Description>
</dc:date>
<dc:language>
<rdf:Description>
<dcq:languageScheme>ISO 639-2</dcq:languageScheme>
<rdf:value>zho-Hant</rdf:value>
</rdf:Description>
</dc:language>
<dc:identifier>test.xml</dc:identifier>
<dc:rights>open access</dc:rights>
<dct:license>http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration</dct:license>
</metadata>
<text xml:lang="zh">
METADATA
print $metadata;


# Main pass: turn every line of @text into XML and print it.
#
# Each line is handled by the first matching case below (figures, the table,
# page breaks, headings, paragraphs, everything else); most cases end with "next".
# The order of the cases matters, see the remarks at the individual cases.

# beginDiv()
# Called at the start of every chapter head and chapter main: no lowest-level div
# is open there, and the numbering of the lowest-level div's starts again at 1.
sub beginDiv {
    $inDiv = 0;
    foreach my $key (keys %div) { $div{$key} = 0; }
}

$line = 0;

for (@text) {

    $line++;

    # unknown characters
    # (must be before figures so that a <?> in a <var> will be processed)
    s!<\?>!<unsure/>!g;


    # figures
    # (must be before the tagging of div's to avoid </div></fig>)
    # (the internal ordering is important for successive figures)

    if ($inFig) {
        # <cap>, <desc> and <var> lines belong to the open figure
        if (m!<cap>! || m!<desc>! || m!<var>!) {
            print "$_\n";
            next;
        }
        # any other line ends the figure and is then processed normally
        print "</figure>";
        $inFig = 0;
    }
    if (s!<fig>!<figure>!) { # !<figure place="text">!
        $inFig = 1;
        print "$_\n";
        next;
    }


    # the table on ECHO p.327
    # (must be before paragraphs)

    if (m!<tb>!) { $inTb = 1; }

    if ($inTb) {
        # <tb> and </tb> become the table tags; every other line is one row,
        # with "#" separating the cells
        unless (s!<(/?)tb>!<${1}xhtml:table>!) {
            $_ = "<xhtml:tr><xhtml:td>$_</xhtml:td></xhtml:tr>";
            s!#!</xhtml:td><xhtml:td>!g;
        }
        print "$_\n";
        if (m!</xhtml:table>!) { $inTb = 0; }
        next;
    }


    # ad hoc tagging of book covers, preface, chapters, chapter heads, chapter mains, backmatter
    # including some additional lowest-level closing div's
    # (the line numbers are those of the raw text)

    if ( $line == 1) { print '<front><div level="1" type="title" n="1">'; } # 1 - 32
    if ( $line == 33) { print '</div>'; }

    if ( $line == 33) { print '<div level="1" type="preface" n="1">'; } # 33 - 152
    if ( $line == 153) { print '</div></front>'; }

    if ( $line == 153) { print '<body><div level="1" type="chapter" n="1"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 452) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 1364) { print '</div></div></div>'; }

    if ( $line == 1364) { print '<div type="title" n="1">'; } # 1364 - 1385
    if ( $line == 1386) { print '</div>'; }

    if ( $line == 1386) { print '<div level="1" type="chapter" n="2"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 1416) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 1753) { print '</div></div></div>'; }

    if ( $line == 1753) { print '<div level="1" type="chapter" n="3"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 1848) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 2722) { print '</div></div></div>'; }

    if ( $line == 2722) { print '<div type="title" n="2">'; } # 2722 - 2744
    if ( $line == 2745) { print '</div>'; }

    if ( $line == 2745) { print '<div level="1" type="chapter" n="4"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 2787) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 3109) { print '</div></div></div>'; }

    if ( $line == 3109) { print '<div level="1" type="chapter" n="5"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 3520) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 4387) { print '</div></div></div>'; }

    if ( $line == 4387) { print '<div type="title" n="3">'; } # 4387 - 4392
    if ( $line == 4393) { print '</div>'; }

    if ( $line == 4393) { print '<div level="1" type="chapter" n="6"><div level="2" type="chapterhead" n="1">'; beginDiv(); }
    if ( $line == 4594) { print '</div></div><div level="2" type="chaptermain" n="1">'; beginDiv(); }
    if ( $line == 5984) { print '</div></div></div></body>'; }

    if ( $line == 5984) { print '<back><div type="backmatter">'; } # 5984 - end; </div></back> is printed together with </echo>, see below.


    # page breaks

    if (m!<pb!) {
        $pb++;

        # # The ECHO pages 215 to 220 reduplicate the pages 209 to 214 and have been typed only once by Formax
        # if ($pb == 215) { $pb = 221; }

        # The printed page number and the running head are both optional.
        # The captures are taken from the match in list context, so a line that does not
        # have the expected form simply yields neither of them.
        my (undef, $o, undef, $rhead) = m!^<pb( ($moderndigit+))?>(<rh>(.+)</rh>)? *$!;
        my $pagenumber = ($o ? " o=\"$o\" oE=\"" . modernchinese2arabic($o) . "\"" : "");
        my $runninghead = ($rhead ? " rhead=\"$rhead\"" : "" );
        # The link ends up in an XML attribute, so the "&" between the query parameters
        # must be written as "&amp;"; a raw "&" makes the output not well-formed.
        my $link = 'http://echo.mpiwg-berlin.mpg.de/ECHOdocuView/ECHOzogiLib?'.
            'url=/mpiwg/online/permanent/library/02NT95YF/pageimg&amp;mode=imagepath&amp;pn='.$pb;
        $_ = "<pb$pagenumber$runninghead n=\"$pb\" xlink:href='$link'/>";
        print "$_\n";
        next;
    }


    # headings

    if (m!<h>!) {

        s!<h>!<head>!;
        s!</h>!</head>!;

        # notes in headings
        s!　?\\\\　?!<lb type="halfline"/>!g;
        s!<sm>(.+)</sm>!<note>$1</note>!;

        # headings at the lowest level
        if (m!<head>$nth($number)($headingType).*?</head>$!) {
            my ($o, $type) = ($1, $2);
            my $oE = chinese2arabic($o);
            my $typeE = $englishHeadingType{$type};
            s!　(.+?)</head>!<note>$1</note></head>!;

            # the number in the heading must agree with the running count for its type
            $div{$type}++;
            if ($div{$type} != $oE) { die "$line: The numbering of the div's is incorrect!\n"; }

            if ($inDiv) { print "</div>"; }
            print '<div type="'.$type.'" typeE="'.$typeE.'" o="'.$o.'" level="3" n="'.$div{$type}.'">';
            $inDiv = 1;
        }

        # all headings
        print "$_\n";
        next;
    }


    # paragraphs
    # (<p> is always at the beginning, </p> always at the end of a line)
    # inside a paragraph: <pb ...>, <rh>; <sm>; <fig>, <desc>, <var>; 1x <tb>; 3x <002>

    my $outdented = s!<p x>!<p>!;

    if (m!<p>!) { $inParagraph = 1; }

    if ($inParagraph) {

        # normalize the periods: there should always be a period before </p> and </sm>
        # (if the period is missing, insert an ASCII period)
        s!([^。])</p>!$1\.</p>!g;
        s!</sm>\.</p>!</sm></p>!;
        s!([^。])<sm>!$1\.<sm>!g;
        s!^\.<sm>!<sm>!;
        s!<p>\.<sm>!<p><sm>!;
        s!([^。])</sm>!$1\.</sm>!g;

        # turn small text into notes
        s!<sm>!<note>!g; # <sm> and </sm> need not be in the same line
        s!</sm>!</note>!g;
        s!　?\\\\　?!<lb type="halfline"/>!g;

        # tag sentences
        s!<p>!<p><s>!;
        if ($lineStartsWithS) { $_ = "<s>$_"; $lineStartsWithS = 0; }
        s![。.]!$&</s><s>!g;
        if (s!<s>$!!) { $lineStartsWithS = 1; }
        s!<s><note>!<note><s>!g; # ???
        s!<s></note>!</note><s>!g;
        s!<s><lb type="halfline"/>!<lb type="halfline"/><s>!g;
        s!<s></p>!</p>!;

        # end each line with <lb/>, except before the table and before a page break.
        # $text[$line] is the following line; it exists as long as $line <= $#text.
        $_ .= "<lb/>" unless (($line <= $#text) && (($text[$line] =~ m!<tb>!) || ($text[$line] =~ m!<pb!)));
        s!</p><lb/>!</p>!;
        s!<s><lb/>!<lb/><s>!g;

        # outdented paragraphs
        if ($outdented) { s!<p>!<p outdented="1">!; }

        print "$_\n";

        if (m!</p>!) { $inParagraph = 0; }
        next;
    }


    # everything else

    print "$_\n";
}


print "</div>"; # end of backmatter
print "</back></text></echo>\n";



Download in other formats:

Original Format