Mercurial > hg > extraction-interface
comparison interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b12c99b7c3f0 |
---|---|
1 <?php | |
// Remove the execution time limit (0 = no limit); presumably needed because the CSV processing below can run long.
2 set_time_limit(0); | |
// Remove the memory limit as well (-1 = unlimited).
3 ini_set('memory_limit', '-1'); | |
4 //group the duplicate books in source 1 | |
/**
 * Group consecutive duplicate books of source 1 and write the groups to a TSV file.
 *
 * Input columns (tab separated):
 * 0=>id, 1=>source, 2=>province, 3=>region, 4=>gazetteer title, 5=>number of volumes,
 * 6=>period of the compiler, 7=>person name, 8=>period, 9=>common-era year,
 * 10=>edition type, 11=>holding location, 12=>note.
 *
 * Two consecutive rows belong to the same book when their extracted year and
 * title are identical. For every group the output contains one header row
 * ("#", level1, level2, year, title, number of rows in the group) followed by
 * the rows of the group, with columns in the order level1, level2, year, title,
 * volume, timeOfTheAuthor, author, time, commonEra, edition, location, note,
 * source, id.
 *
 * Reading stops at the first row whose source is greater than 1; that row is
 * not written. A summary line with the number of groups is echoed at the end.
 *
 * @param string $inputFile  path of the TSV file to read
 * @param string $outputFile path of the TSV file to (over)write
 * @return void
 */
function mergeBooks($inputFile="./data_from_sinica/all_data.csv", $outputFile="./data_from_sinica/merged_books.csv"){
	// Open the input first so a missing input never truncates an existing output file.
	$inputFp=fopen($inputFile, "r");
	if($inputFp===false){
		trigger_error("mergeBooks: cannot open ".$inputFile." for reading", E_USER_WARNING);
		return;
	}
	$outputFp=fopen($outputFile, "w");
	if($outputFp===false){
		fclose($inputFp);
		trigger_error("mergeBooks: cannot open ".$outputFile." for writing", E_USER_WARNING);
		return;
	}

	// Input column index => output field name. The non-numeric key marks a
	// field ('year') that has no input column and is derived from the title.
	$columnNameMapping=[2=>'level1', 3=>'level2', 'year'=>'year',
		4=>'title', 5=>'volume', 6=>'timeOfTheAuthor', 7=>'author',
		8=>'time', 9=>'commonEra', 10=>'edition', 11=>'location', 12=>'note', 1=>'source', 0=>'id'];

	// Writes one group: the header row (taken from the last row of the group) and then every row.
	$flushGroup=function(array $group) use ($outputFp){
		$last=$group[count($group)-1];
		$header=["#",$last['level1'],$last['level2'],$last['year'],$last['title'],count($group)];
		fputcsv($outputFp,$header,"\t",'"',"\\");
		foreach($group as $bookInfo){
			fputcsv($outputFp,$bookInfo,"\t",'"',"\\");
		}
	};

	$bookArray=array();	// rows of the group currently being collected
	$bookCount=0;		// number of groups written so far

	// Test the fgetcsv() result instead of feof(): at end of file fgetcsv() returns false.
	while(($data=fgetcsv($inputFp,0,"\t",'"',"\\"))!==false){
		// A blank line is returned as an array holding a single null; it is not a book.
		if(count($data)===1 && $data[0]===null){
			continue;
		}

		$book=array();
		foreach($columnNameMapping as $oldIdx=>$newIdx){
			if(!is_numeric($oldIdx)){
				$book[$newIdx]='';
				continue;
			}
			// Short rows yield empty fields instead of undefined-offset notices.
			$book[$newIdx]=isset($data[$oldIdx]) ? $data[$oldIdx] : '';
		}

		// Only source 1 is merged. Stop before the row joins a group so that a
		// pending source-1 group is still written by the final flush below.
		if($book['source']>1){
			break;
		}

		$book['timeOfTheAuthor']=str_replace(["(",")"],"",$book['timeOfTheAuthor']);
		// Normalise the bracket variants so that a single pattern can split "〔year〕title".
		$book['title']=str_replace(["﹝"],"〔",$book['title']);
		$book['title']=str_replace(["﹞","}"],"〕",$book['title']);
		//0=whole,1=year,2=title
		if(preg_match("/〔(.{2,4})〕(.+)/u",$book['title'],$match)){
			$book['year']=$match[1];
			$book['title']=$match[2];
		}

		if(!empty($bookArray)){
			$lastBook=$bookArray[count($bookArray)-1];
			// Strict comparison: numeric-looking strings such as "10" and "10.0" are different texts.
			if($lastBook['year']!==$book['year'] || $lastBook['title']!==$book['title']){
				// A different book starts: write out the finished group.
				$flushGroup($bookArray);
				$bookArray=array();
				$bookCount++;
			}
		}
		$bookArray[]=$book;
	}

	// Write the group that was still being collected when the input ended.
	if(!empty($bookArray)){
		$flushGroup($bookArray);
		$bookCount++;
	}

	echo "# of books: ".$bookCount."<br>";
	fclose($inputFp);
	fclose($outputFp);
}
/**
 * Render the merged book groups as an HTML table.
 *
 * Reads a tab-separated file in which every group starts with a header row
 * whose first field is "#" and whose sixth field is the number of rows in the
 * group, followed by that many book rows. Each group is preceded by a
 * separator row. Within a group, the cells of columns 4 (volume) and 6
 * (author) get the class "different" when their text differs from the
 * previous row, and an empty column 2 (year) gets the class "empty".
 *
 * Cell values are HTML-escaped before being echoed.
 *
 * @param string $fileName path of the merged TSV file to display
 * @return void
 */
function printMergedBooks($fileName="./data_from_sinica/merged_books.csv"){
	// CSS class and caption of every column, in output order.
	$headers=[
		['level','level 1'],
		['level','level 2'],
		['time','year'],
		['title','title'],
		['volume','volume'],
		['time','time of the author'],
		['author','author'],
		['time','time'],
		['time','ce'],
		['edition','edition'],
		['location','location'],
		['note','note'],
		['source','src'],
		['source','id'],
	];
	echo "<table>";
	echo "<tr>";
	foreach($headers as $header){
		echo "<th class='".$header[0]."'>".$header[1];
	}

	$fp=fopen($fileName, "r");
	if($fp===false){
		// Close the table so the page stays well-formed, then report the problem.
		echo "</table>";
		trigger_error("printMergedBooks: cannot open ".$fileName." for reading", E_USER_WARNING);
		return;
	}

	// Test the fgetcsv() result instead of feof(): at end of file fgetcsv() returns false.
	while(($data=fgetcsv($fp,0,"\t",'"',"\\"))!==false){
		// Only group header rows start a block; anything else (e.g. blank lines) is skipped.
		if(!isset($data[0]) || $data[0]!=="#"){
			continue;
		}
		$groupSize=isset($data[5]) ? (int)$data[5] : 0;
		$lastBook=array();
		for($i=0; $i<$groupSize; $i++){
			$book=fgetcsv($fp,0,"\t",'"',"\\");
			if($book===false){
				// The file ended before the announced number of rows was read.
				break;
			}

			if($i==0){
				// One empty cell per column forms the separator bar above the group.
				echo "<tr class='separator'>";
				foreach($book as $item){
					echo "<td>";
				}
			}
			echo "<tr>";
			foreach($book as $idx=>$item){
				$class='';
				if($i!=0 && ($idx==4 || $idx==6)){
					// Highlight volume/author cells whose text changed compared with the previous row.
					$previous=isset($lastBook[$idx]) ? $lastBook[$idx] : null;
					if($previous!==$item){
						$class="different";
					}
				}
				if($idx==2 && (string)$item===""){
					$class="empty";
				}
				// Escape the value: it comes from a data file and must not be interpreted as markup.
				echo "<td class='".$class."'>".htmlspecialchars((string)$item, ENT_QUOTES|ENT_SUBSTITUTE, 'UTF-8');
			}
			$lastBook=$book;
		}
	}
	fclose($fp);
	echo "</table>";
}
130 | |
131 ?> | |
132 | |
133 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |
134 <html> | |
135 <head> | |
136 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> | |
<style type="text/css">
/* Layout of the merged-books table. */
table{
	border-collapse:collapse;
}
td,th{
	border:1px solid #eee;
	padding:5px 5px;
}
td{vertical-align:top;}
/* Row states: black bar between groups, highlighted empty year, highlighted differing cell. */
.separator{background:#000;}
.book{background:#ddd;}
.empty{background:#F6CECE;}
.different{background:#F3E2A9;}
/* Column widths, selected through the class of each header cell. */
.level{width:80px;}
.time{width:40px;}
.title{width:100px;}
.volume{width:120px;}
.author{width:200px;}
.edition,.location{width:200px;}
.note{width:200px;}
/* The original declaration had no property name ("20px;") and was ignored by browsers. */
.source{width:20px;}
</style>
159 <!--<script src="js/check_sections.js" charset="utf-8"></script>!--> | |
160 </head> | |
161 <body> | |
162 <?php | |
163 | |
// mergeBooks() writes ./data_from_sinica/merged_books.csv; the call is commented out here, so this page does not regenerate the file.
164 //mergeBooks(); | |
// Read merged_books.csv and echo its groups as an HTML table.
165 printMergedBooks(); | |
166 | |
167 ?> | |
168 </body> | |
169 </html> |