comparison interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b12c99b7c3f0
1 <?php
// Remove PHP's memory cap and execution-time limit for this script.
ini_set('memory_limit', '-1');
set_time_limit(0);
/**
 * Obsolete helper (kept for reference, not called by the page).
 *
 * Scans the per-page CSV dump(s) in ./data_from_sinica/ for lines containing
 * the text "td". For every such line it prints the line number, re-fetches
 * the row through getCorrectInfo() and echoes each returned column after
 * converting it from BIG5 to UTF-8.
 *
 * The page range is hard-coded to page 17 only.
 *
 * Output is written with echo as HTML fragments; nothing is returned.
 */
function findBrokenCharacter(){
    $brokenLinePattern = "/td/";
    // Byte pairs in this range are swapped for the two-byte sequence below
    // before the BIG5 -> UTF-8 conversion.
    // NOTE(review): presumably these are BIG5 code points that cannot be
    // converted and "\xA1\xBC" is a placeholder glyph - confirm.
    $unconvertiblePattern = "/[\x81-\xA0][\x40-\xFE]/";

    for ($i = 17; $i <= 17; $i++) {
        $pageLabel = sprintf("%02d", $i);
        $fileName = "./data_from_sinica/" . $pageLabel . ".csv";
        $fp = fopen($fileName, "r");
        if ($fp === false) {
            // Nothing to scan for this page; reading from an invalid handle
            // would otherwise loop forever or raise a TypeError.
            continue;
        }
        // Kept from the original: the "_NN.csv" companion file is created or
        // truncated here although nothing is ever written to it.
        $cfp = fopen("./data_from_sinica/_" . $pageLabel . ".csv", "w");

        $j = 0;
        echo "page " . $i . "<br>";
        while (!feof($fp)) {
            $line = fgets($fp);

            if (preg_match($brokenLinePattern, $line)) {
                echo "find the broken character at line " . ($j + 1) . "<br>";
                $correctInfo = getCorrectInfo($i, $j + 1);
                foreach ($correctInfo as $column) {
                    $replaced = preg_replace($unconvertiblePattern, "\xA1\xBC", $column);
                    $column = mb_convert_encoding($replaced, "UTF-8", "BIG5");
                    echo "" . $column . "\t";
                }
                echo "<br>";
                // The following line is consumed without being counted.
                // NOTE(review): presumably a broken row spills over two
                // physical lines - confirm.
                $line = fgets($fp);
            }
            $j++;
        }
        echo "page " . $i . " has " . ($j - 1) . " lines<br><br>";

        // Release both handles for every page, not just once after the loop.
        fclose($fp);
        if ($cfp !== false) {
            fclose($cfp);
        }
    }
}
/**
 * Obsolete helper (only used by findBrokenCharacter()).
 *
 * Downloads one result page of the Sinica place query, picks the anchor that
 * belongs to the requested row, follows it to the detail page and collects
 * the values found there.
 *
 * @param int $page Result page number placed in the query URL.
 * @param int $line 1-based row number on that result page.
 *
 * @return array Index 0 holds the "Source" number taken from the detail link;
 *               indexes 1..n hold the captured cell contents of the detail
 *               page in document order (raw bytes as downloaded, not
 *               re-encoded). An empty array is returned when the page cannot
 *               be downloaded, the row's anchor is missing, or the link does
 *               not look like a detail link.
 */
function getCorrectInfo($page,$line){
    $correctInfo = [];

    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $data = getData($url);
    if (!is_string($data) || $data === "") {
        // Download failed; DOMDocument::loadHTML() rejects empty input.
        return $correctInfo;
    }

    // NOTE(review): the "HTML-ENTITIES" target encoding is deprecated as of
    // PHP 8.2; kept unchanged to preserve the parsed result.
    $data = mb_convert_encoding($data, "HTML-ENTITIES", "BIG5");
    if (!is_string($data) || $data === "") {
        return $correctInfo;
    }

    // Collect libxml parse warnings internally instead of muting with "@".
    $bookListDoc = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $bookListDoc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // NOTE(review): the +5 offset presumably skips anchors that precede the
    // result rows on the page - confirm against the live markup.
    $entry = $bookListDoc->getElementsByTagName("a")->item($line - 1 + 5);
    if ($entry === null) {
        return $correctInfo;
    }
    $href = $entry->attributes->getNamedItem("href");
    if ($href === null) {
        return $correctInfo;
    }
    $link = $href->value;

    $pattern="/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u";
    if (!preg_match($pattern, $link, $match)) {
        return $correctInfo;
    }
    $correctInfo[0] = $match[1];

    $detailHtml = getData("http://webgis.sinica.edu.tw/place/" . $link);
    $pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
    if (preg_match_all($pattern, $detailHtml, $match)) {
        // Only the second capture group (the value cell) is kept.
        foreach ($match[2] as $idx => $info) {
            $correctInfo[$idx + 1] = $info;
        }
    }
    return $correctInfo;
}
/**
 * Concatenates the per-page dumps ./data_from_sinica/01.csv through 71.csv
 * into ./data_from_sinica/all_data.csv, stripping every carriage return.
 *
 * The output file is created or truncated first. A page file that cannot be
 * read contributes nothing to the output. If the output file itself cannot
 * be opened the function returns without doing anything.
 */
function writeAllDataToOneFile(){
    $out = fopen("./data_from_sinica/all_data.csv", "w");
    if ($out === false) {
        return;
    }

    for ($i = 1; $i <= 71; $i++) {
        $pageFile = "./data_from_sinica/" . sprintf("%02d", $i) . ".csv";
        $data = file_get_contents($pageFile);
        if ($data === false) {
            // Unreadable page: skip it explicitly instead of relying on
            // false being coerced to an empty string.
            continue;
        }
        fwrite($out, str_replace("\r", "", $data));
    }

    fclose($out);
}
/**
 * Fetches a URL with cURL and returns the response body with every literal
 * "&nbsp;" removed.
 *
 * @param string $url Address to download.
 *
 * @return string Response body without "&nbsp;". When the transfer fails
 *                curl_exec() yields false, which str_replace() turns into an
 *                empty string.
 */
function getData($url){
    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL            => $url,
        // Hand the body back as a string instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // cURL interprets this value in seconds.
        CURLOPT_CONNECTTIMEOUT => 5000,
    ]);
    $response = curl_exec($handle);
    curl_close($handle);

    return str_replace("&nbsp;", "", $response);
}
/**
 * Counts the rows of ./data_from_sinica/all_data.csv per source id.
 *
 * The file is read as tab-separated values; the second column (index 1) of
 * each row is used as the source id. Rows without a second column, or with
 * an empty one, are ignored.
 *
 * @return array Map of source id => number of rows. Ids 1 to 7 are always
 *               present (starting at 0); any other id found in the file is
 *               appended. If the file cannot be opened, the all-zero map is
 *               returned.
 */
function countBooks(){
    $countArray = [1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];

    $fp = fopen("./data_from_sinica/all_data.csv", "r");
    if ($fp === false) {
        return $countArray;
    }

    // fgetcsv() returns false at end of file, so the loop never indexes
    // into a non-array the way a feof()-driven loop does after the last row.
    while (($book = fgetcsv($fp, 1000000, "\t")) !== false) {
        if (!isset($book[1]) || $book[1] === "") {
            continue;
        }
        $source = $book[1];
        if (!isset($countArray[$source])) {
            // Unknown source id: start counting it explicitly.
            $countArray[$source] = 0;
        }
        $countArray[$source]++;
    }

    fclose($fp);
    return $countArray;
}
98 //writeAllDataToOneFile();
99 ?>
100
101 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
102 <html>
103 <head>
104 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
105 <style type="text/css">
106 .row{
107 display:block;
108 }
109 .column{
110 display:inline-block;
111 width:90px;
112 }
113 table{
114 border-collapse:collapse;
115 }
116 td,th{
117 padding:3px 10px;
118 }
119 th{
120 text-align:left;
121 }
122 </style>
123 <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
124 </head>
125 <body>
126 <table>
127 <tr><th>source<th># of books<th>note
<?php

// Emits one table row per source id: id, number of books, descriptive note.
// One-off maintenance steps, enabled by hand when needed:
//findBrokenCharacter();
//writeAllDataToOneFile();
$countArray=countBooks();
$descrArray=[
    // Most entries carry era-name information; unclear whether the date is
    // the publication or the compilation date.
    1=>"大部份有年號資訊 年代不知為出版或是編纂年代",
    // Most entries have no time information.
    2=>"大部份沒有時間資訊",
    // Dynasty only; no era name or year.
    3=>"只有朝代 沒有年號或年份",
    // Essentially no time information.
    4=>"基本上沒有時間資訊",
    5=>"基本上沒有時間資訊",
    6=>"基本上沒有時間資訊",
    7=>"基本上沒有時間資訊"

];
foreach($countArray as $source=>$count){
    // countBooks() may report ids that have no description; show an empty
    // note for those instead of reading an undefined index.
    $note = isset($descrArray[$source]) ? $descrArray[$source] : "";
    // The id originates from the CSV data, so escape it for HTML output.
    $sourceLabel = htmlspecialchars((string)$source, ENT_QUOTES, "UTF-8");
    echo "<tr><td>".$sourceLabel."<td>".$count."<td>".$note;
}

?>
148 </table>
149 </body>
150 </html>