comparison interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b12c99b7c3f0
1 <?php
2 set_time_limit(0); // 0 removes PHP's maximum execution time for this request
3 ini_set('memory_limit', '-1'); // -1 removes PHP's per-script memory limit
4 //group the duplicate books in source 1
/**
 * Group consecutive duplicate books and write the groups to merged_books.csv.
 *
 * Reads the tab-separated file ./data_from_sinica/all_data.csv row by row.
 * Input columns:
 * 0=>id, 1=>source, 2=>province, 3=>region, 4=>gazetteer title, 5=>volumes,
 * 6=>period of the compiler, 7=>person name, 8=>period, 9=>common era,
 * 10=>edition type, 11=>holding location, 12=>note.
 *
 * A leading "〔year〕" (or "﹝year﹞" / "...}") prefix is split off the title
 * into the 'year' column. Consecutive rows with identical year and title form
 * one group. Each group is written as a header row
 * ["#", level1, level2, year, title, groupSize] followed by its member rows.
 *
 * Reading stops at the first row (other than the very first row of the file)
 * whose source column is greater than 1; that row is not written.
 *
 * Echoes "# of books: N<br>" where N is the number of groups written.
 */
function mergeBooks(){
    $inputName="./data_from_sinica/all_data.csv";
    $outputName="./data_from_sinica/merged_books.csv";

    // Without these checks a missing file made the read loop spin forever
    // (PHP 7) or die with a TypeError (PHP 8).
    $inputFp=fopen($inputName, "r");
    if($inputFp===false){
        echo "cannot open ".$inputName."<br>";
        return;
    }
    $outputFp=fopen($outputName, "w");
    if($outputFp===false){
        fclose($inputFp);
        echo "cannot open ".$outputName."<br>";
        return;
    }

    // input column index => output column name; the insertion order is the
    // column order of the output rows. 'year' has no input column: it is
    // derived from the title below.
    $columnNameMapping=[2=>'level1', 3=>'level2', 'year'=>'year',
        4=>'title', 5=>'volume', 6=>'timeOfTheAuthor', 7=>'author',
        8=>'time', 9=>'commonEra', 10=>'edition', 11=>'location', 12=>'note', 1=>'source',0=>'id'];

    $bookCount=0;
    // Writes one buffered group (header row + member rows) and counts it.
    $writeGroup=function(array $group) use ($outputFp, &$bookCount){
        if(count($group)===0){
            return;
        }
        $lastBook=end($group);
        $header=["#",$lastBook['level1'],$lastBook['level2'],$lastBook['year'],$lastBook['title'],count($group)];
        fputcsv($outputFp,$header,"\t");
        foreach($group as $bookInfo){
            fputcsv($outputFp,$bookInfo,"\t");
        }
        $bookCount++;
    };

    $bookArray=array();   // buffer holding the rows of the current group
    $isFirstRow=true;
    // fgetcsv() returns false at EOF; testing its result (instead of feof())
    // avoids processing a bogus "row" after the last line.
    while(($data=fgetcsv($inputFp,1000000,"\t"))!==false){
        if(!is_array($data) || $data===array(null)){
            continue;   // blank line
        }

        $book=array();
        foreach($columnNameMapping as $oldIdx=>$newIdx){
            // non-numeric key = derived column; missing cell in a short row = ''
            $book[$newIdx]=(is_int($oldIdx) && isset($data[$oldIdx])) ? $data[$oldIdx] : '';
        }

        $book['timeOfTheAuthor']=str_replace(["(",")"],"",$book['timeOfTheAuthor']);

        // Normalise the bracket variants, then split "〔year〕title".
        // (A second match on "﹝…﹞" is unnecessary: those brackets no longer
        // occur in the title after this normalisation.)
        $book['title']=str_replace(["﹝"],"〔",$book['title']);
        $book['title']=str_replace(["﹞","}"],"〕",$book['title']);
        //0=whole,1=year,2=title
        if(preg_match("/〔(.{2,4})〕(.+)/u",$book['title'],$match)){
            $book['year']=$match[1];
            $book['title']=$match[2];
        }

        // Stop at the first row from another source *before* buffering it, so
        // it can neither join nor replace the last source-1 group. The very
        // first row is never tested, as before (NOTE(review): presumably it
        // may be a header line - confirm against the data file).
        if(!$isFirstRow && $book['source']>1){
            break;
        }
        $isFirstRow=false;

        if(count($bookArray)>0){
            $lastBook=end($bookArray);
            $sameBook=($lastBook['year']===$book['year'] && $lastBook['title']===$book['title']);
            if(!$sameBook){//different books
                //flush out the buffer
                $writeGroup($bookArray);
                $bookArray=array();
            }
        }
        //push into the buffer
        $bookArray[]=$book;
    }
    // The group still buffered when the input ends must be written too.
    $writeGroup($bookArray);

    echo "# of books: ".$bookCount."<br>";
    fclose($inputFp);
    fclose($outputFp);
}
/**
 * Print ./data_from_sinica/merged_books.csv as an HTML table.
 *
 * The file is tab-separated and consists of groups: a header row whose first
 * cell is "#" and whose sixth cell is the number of member rows, followed by
 * that many member rows. Each group is preceded by a "separator" table row.
 *
 * Cell highlighting:
 *  - column 4 (volume) or column 6 (author) differing from the previous row
 *    of the same group gets class "different";
 *  - an empty column 2 (year) gets class "empty".
 *
 * Cell text is HTML-escaped before output.
 */
function printMergedBooks(){
    echo "<table>";
    echo "<tr>";
    $headerCells=[
        ['level','level 1'], ['level','level 2'], ['time','year'], ['title','title'],
        ['volume','volume'], ['time','time of the author'], ['author','author'],
        ['time','time'], ['time','ce'], ['edition','edition'], ['location','location'],
        ['note','note'], ['source','src'], ['source','id'],
    ];
    foreach($headerCells as $cell){
        echo "<th class='".$cell[0]."'>".$cell[1];
    }

    $fileName="./data_from_sinica/merged_books.csv";
    $fp=fopen($fileName, "r");
    if($fp===false){
        // fopen() has already raised a warning; keep the markup well-formed.
        echo "</table>";
        return;
    }

    // fgetcsv() returns false at EOF; testing its result (instead of feof())
    // avoids indexing into false after the last line.
    while(($data=fgetcsv($fp,100000,"\t"))!==false){
        if(!isset($data[0]) || $data[0]!=="#"){
            continue;   // not a group header (e.g. blank line)
        }
        $groupSize=isset($data[5]) ? (int)$data[5] : 0;
        $lastBook=array();
        for($i=0; $i<$groupSize; $i++){
            $book=fgetcsv($fp,100000,"\t");
            if(!is_array($book)){
                break 2;   // file ends before the announced group size
            }

            if($i===0){
                // black separator row, as wide as the first row of the group
                echo "<tr class='separator'>";
                echo str_repeat("<td>", count($book));
            }
            echo "<tr>";
            foreach($book as $idx=>$item){
                $text=(string)$item;
                $class='';
                if($i!==0 && ($idx===4 || $idx===6)){
                    $previous=isset($lastBook[$idx]) ? (string)$lastBook[$idx] : '';
                    if($previous!==$text){
                        $class="different";
                    }
                }
                if($idx===2 && $text===""){
                    $class="empty";
                }
                echo "<td class='".$class."'>".htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            }
            $lastBook=$book;
        }
    }
    fclose($fp);
    echo "</table>";
}
130
131 ?>
132
133 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
134 <html>
135 <head>
136 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
137 <style type="text/css">
138 table{
139 border-collapse:collapse;
140 }
141 td,th{
142 border:1px solid #eee;
143 padding:5px 5px;
144 }
145 td{vertical-align:top;}
146 .separator{background:#000;}
147 .book{background:#ddd;}
148 .empty{background:#F6CECE;}
149 .different{background:#F3E2A9;}
150 .level{width:80px;}
151 .time{width:40px;}
152 .title{width:100px;}
153 .volume{width:120px;}
154 .author{width:200px;}
155 .edition,.location{width:200px;}
156 .note{width:200px;}
157 .source{width:20px;}
158 </style>
159 <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
160 </head>
161 <body>
162 <?php
163
164 //mergeBooks(); // disabled; when enabled it rewrites ./data_from_sinica/merged_books.csv
165 printMergedBooks(); // renders ./data_from_sinica/merged_books.csv as an HTML table
166
167 ?>
168 </body>
169 </html>