Mercurial > hg > extraction-interface
comparison interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b12c99b7c3f0 |
---|---|
1 <?php | |
// Batch-style page: remove the execution time limit (0) and the memory cap (-1)
// before any of the file/network helpers below are defined or run.
2 set_time_limit(0); | |
3 ini_set('memory_limit', '-1'); | |
/**
 * Obsolete helper: scans a downloaded page CSV for lines containing "td" and
 * prints the corrected record fetched from the remote site for each hit.
 *
 * For every page in the (currently hard-coded, 17..17) range it:
 *  - opens ./data_from_sinica/NN.csv for reading,
 *  - creates/truncates ./data_from_sinica/_NN.csv (nothing is written to it),
 *  - echoes, for each line matching /td/, the columns returned by
 *    getCorrectInfo() converted from BIG5 to UTF-8,
 *  - echoes a per-page line summary.
 *
 * Output is written directly with echo as HTML fragments; nothing is returned.
 */
function findBrokenCharacter(){
    for($i=17; $i<=17; $i++){
        $pageName=sprintf("%02d",$i);
        $fileName="./data_from_sinica/".$pageName.".csv";
        $fp=fopen($fileName,"r");
        if($fp===false){
            // Without this guard feof() would be handed false and never terminate cleanly.
            echo "page ".$i." could not be opened<br><br>";
            continue;
        }
        // Kept for backward compatibility: the original created this (empty) file too.
        $cfp=fopen("./data_from_sinica/_".$pageName.".csv","w");
        $j=0;
        echo "page ".$i."<br>";
        while(!feof($fp)){
            $line=fgets($fp);

            if(preg_match("/td/",$line,$match)){
                echo "find the broken character at line ".($j+1)."<br>";
                $correctInfo=getCorrectInfo($i,$j+1);
                foreach($correctInfo as $column){
                    // Replace byte pairs in the 0x81-0xA0 lead-byte range with the
                    // fixed pair 0xA1 0xBC before decoding the column as BIG5.
                    $replaced=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$column);
                    $column=mb_convert_encoding($replaced,"UTF-8","BIG5");
                    echo "".$column."\t";
                }
                echo "<br>";
                // Consume the following physical line without counting it
                // (presumably the continuation of the broken record - verify).
                $line=fgets($fp);
            }
            $j++;
        }
        echo "page ".$i." has ".($j-1)." lines<br><br>";

        // Release both handles for this page before moving to the next one.
        if($cfp!==false){
            fclose($cfp);
        }
        fclose($fp);
    }
}
/**
 * Obsolete helper: fetches the detail record behind one entry of a remote
 * result page.
 *
 * Downloads the result list for $page, picks the anchor at index
 * ($line - 1 + 5) among all <a> elements, and - if its href looks like
 * "detail.asp?ID=<n>&Source=<n>" - downloads that detail page and extracts the
 * value cells with a regular expression.
 *
 * @param int $page Result page number sent as the "Page" query parameter.
 * @param int $line 1-based line number within that page.
 * @return array Index 0 holds the "Source" number from the link; indexes 1..n
 *               hold the captured value cells in document order. An empty
 *               array is returned when the page, the anchor, its href, or the
 *               expected link format cannot be found.
 */
function getCorrectInfo($page,$line){
    $correctInfo=array();

    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $data=getData($url);
    $data=mb_convert_encoding($data,"HTML-ENTITIES","BIG5");
    if(!is_string($data) || $data===""){
        // Nothing to parse (failed download); loadHTML() rejects empty input.
        return $correctInfo;
    }

    // Collect libxml parse errors internally instead of silencing them with "@",
    // then restore whatever error mode the caller had.
    $bookListDoc=new DOMDocument();
    $previousErrorMode=libxml_use_internal_errors(true);
    $bookListDoc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $bookList=$bookListDoc->getElementsByTagName("a");
    // The +5 offset skips the first five anchors on the page
    // (presumably navigation links - verify against the live markup).
    $entry=$bookList->item($line-1+5);
    if($entry===null){
        return $correctInfo;
    }
    $href=$entry->attributes->getNamedItem("href");
    if($href===null){
        return $correctInfo;
    }
    $link=$href->value;

    $pattern="/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u";
    if(preg_match($pattern,$link,$match)){
        $source=$match[1];
        $correctInfo[0]=$source;
        $link="http://webgis.sinica.edu.tw/place/".$link;
        // getData() in this file declares only $url, so the second argument is ignored.
        $data=getData($link,true);
        // The header cell is matched up to a closing </td>, exactly as in the original pattern.
        $pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
        if(preg_match_all($pattern,$data,$match)){
            foreach($match[1] as $idx=>$th){
                $info=$match[2][$idx];
                $correctInfo[$idx+1]=$info;
            }
        }
    }
    return $correctInfo;
}
/**
 * Concatenates the per-page CSV files 01.csv .. 71.csv from
 * ./data_from_sinica/ into ./data_from_sinica/all_data.csv, removing every
 * carriage return ("\r") on the way. The target file is overwritten.
 */
function writeAllDataToOneFile(){
    $merged=fopen("./data_from_sinica/all_data.csv","w");
    foreach(range(1,71) as $pageNumber){
        $pagePath="./data_from_sinica/".sprintf("%02d",$pageNumber).".csv";
        $contents=file_get_contents($pagePath);
        // Strip carriage returns so the merged file has "\n"-only line endings.
        $contents=str_replace("\r","",$contents);
        fwrite($merged,$contents);
    }
    fclose($merged);
}
/**
 * Downloads $url with cURL and returns the response body with every
 * occurrence of the stripped character removed.
 *
 * @param string $url Address to fetch.
 * @return string Response body after the replacement.
 */
function getData($url){
    $handle=curl_init();
    $options=[
        CURLOPT_URL=>$url,
        CURLOPT_RETURNTRANSFER=>1,
        CURLOPT_CONNECTTIMEOUT=>5000
    ];
    foreach($options as $option=>$value){
        curl_setopt($handle,$option,$value);
    }
    $response=curl_exec($handle);
    curl_close($handle);
    // NOTE(review): as written this strips every space character from the
    // response; confirm whether a non-breaking space was intended.
    return str_replace(" ","",$response);
}
/**
 * Counts the rows of ./data_from_sinica/all_data.csv per source.
 *
 * The file is read as tab-separated values; the second column (index 1) is
 * used as the source key. Rows with a missing or empty second column
 * (including blank lines) are ignored.
 *
 * @return array Map of source key => number of rows. Keys 1..7 are always
 *               present (possibly 0); any other key found in the file is
 *               added and counted as well. If the file cannot be opened the
 *               zero-initialised map is returned.
 */
function countBooks(){
    $fileName="./data_from_sinica/all_data.csv";
    $countArray=[1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];

    $fp=fopen($fileName, "r");
    if($fp===false){
        return $countArray;
    }

    // Loop on fgetcsv()'s own result: it returns false at end of file, which a
    // feof()-driven loop would otherwise try to index into.
    while(($book=fgetcsv($fp,1000000,"\t"))!==false){
        if(!isset($book[1]) || $book[1]==""){
            continue;
        }
        $source=$book[1];
        if(!isset($countArray[$source])){
            // Source outside the predefined 1..7 range: start its counter explicitly.
            $countArray[$source]=0;
        }
        $countArray[$source]++;
    }
    fclose($fp);

    return $countArray;
}
98 //writeAllDataToOneFile(); | |
99 ?> | |
100 | |
101 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> | |
102 <html> | |
103 <head> | |
104 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> | |
105 <style type="text/css"> | |
106 .row{ | |
107 display:block; | |
108 } | |
109 .column{ | |
110 display:inline-block; | |
111 width:90px; | |
112 } | |
113 table{ | |
114 border-collapse:collapse; | |
115 } | |
116 td,th{ | |
117 padding:3px 10px; | |
118 } | |
119 th{ | |
120 text-align:left; | |
121 } | |
122 </style> | |
123 <!--<script src="js/check_sections.js" charset="utf-8"></script>!--> | |
124 </head> | |
125 <body> | |
126 <table> | |
127 <tr><th>source<th># of books<th>note | |
<?php

// Maintenance steps, enabled by hand when the data has to be rebuilt:
//findBrokenCharacter();
//writeAllDataToOneFile();

// Per-source row counts, keyed by source number.
$countArray=countBooks();

// Human-readable note shown next to each source.
$descrArray=[
    1=>"大部份有年號資訊 年代不知為出版或是編纂年代",
    2=>"大部份沒有時間資訊",
    3=>"只有朝代 沒有年號或年份",
    4=>"基本上沒有時間資訊",
    5=>"基本上沒有時間資訊",
    6=>"基本上沒有時間資訊",
    7=>"基本上沒有時間資訊"
];

// Emit one table row per source: source key, count, note.
foreach($countArray as $source=>$count){
    // countBooks() may return keys that have no note; show an empty cell for those.
    $note=isset($descrArray[$source]) ? $descrArray[$source] : "";
    // The source key comes from the CSV file, so escape it before writing it into HTML.
    $sourceCell=htmlspecialchars((string)$source, ENT_QUOTES, "UTF-8");
    echo "<tr><td>".$sourceCell."<td>".$count."<td>".$note;
}

?>
148 </table> | |
149 </body> | |
150 </html> |