Mercurial > hg > extraction-interface
diff interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
line wrap: on
line diff
<?php
/**
 * Helpers for the CSV pages stored under ./data_from_sinica/ and a small HTML
 * report that counts how many books belong to each source.
 *
 * The script runs without a time limit or memory limit because it may process
 * all CSV pages in one request.
 */
set_time_limit(0);
ini_set('memory_limit', '-1');

/**
 * Obsolete: scans a CSV page for damaged rows and prints the corrected row.
 *
 * A line containing "td" is treated as a damaged row. For each one the
 * corrected columns are fetched with getCorrectInfo(), converted from BIG5 to
 * UTF-8 and echoed, and the line that follows it in the file is skipped.
 * Only page 17 is processed.
 *
 * @return void
 */
function findBrokenCharacter(){
    for($i=17; $i<=17; $i++){
        $pageName=sprintf("%02d",$i);
        $fp=fopen("./data_from_sinica/".$pageName.".csv","r");
        if($fp===false){
            // fopen() has already raised a warning; looping on feof(false)
            // would never terminate, so skip this page instead.
            continue;
        }
        // Opening with "w" creates/truncates the "_NN.csv" file. Nothing is
        // written to it here; the handle is kept only to preserve that effect.
        $cfp=fopen("./data_from_sinica/_".$pageName.".csv","w");
        $j=0;
        echo "page ".$i."<br>";
        // The feof()-driven loop runs one extra time after the last line,
        // which is why the summary below reports $j-1 lines.
        while(!feof($fp)){
            $line=fgets($fp);
            if(preg_match("/td/",$line)){
                echo "find the broken character at line ".($j+1)."<br>";
                foreach(getCorrectInfo($i,$j+1) as $column){
                    // Replace byte pairs whose lead byte is 0x81-0xA0 with the
                    // pair 0xA1 0xBC before converting from BIG5 to UTF-8.
                    $replaced=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$column);
                    $column=mb_convert_encoding($replaced,"UTF-8","BIG5");
                    echo $column."\t";
                }
                echo "<br>";
                // Skip the line following the damaged one.
                $line=fgets($fp);
            }
            $j++;
        }
        echo "page ".$i." has ".($j-1)." lines<br><br>";

        // Close both handles per page instead of once after the loop.
        fclose($fp);
        if($cfp!==false){
            fclose($cfp);
        }
    }
}

/**
 * Obsolete: fetches the corrected columns of one entry from the website.
 *
 * Loads the query result page $page, takes the anchor at index $line-1+5 and,
 * when its href matches the detail-page pattern, downloads that detail page
 * and collects the value cell of every matching table row.
 *
 * @param int $page Page number passed to the query URL.
 * @param int $line 1-based line number of the entry on that page.
 * @return array Index 0 holds the "Source" number taken from the link and
 *               indexes 1..n hold the value cells; empty when the anchor is
 *               missing or its href does not match.
 */
function getCorrectInfo($page,$line){
    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $correctInfo=array();

    $data=mb_convert_encoding(getData($url),"HTML-ENTITIES","BIG5");
    $bookListDoc=new DOMDocument();
    // The remote markup is not well-formed; parser warnings are suppressed.
    @$bookListDoc->loadHTML($data);

    // NOTE(review): the +5 offset presumably skips anchors that precede the
    // result list on the page - confirm against the live markup.
    $entry=$bookListDoc->getElementsByTagName("a")->item($line-1+5);
    if($entry===null){
        // item() returns null when the index is out of range.
        return $correctInfo;
    }
    $href=$entry->attributes->getNamedItem("href");
    if($href===null){
        return $correctInfo;
    }
    $link=$href->value;

    if(!preg_match("/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u",$link,$match)){
        return $correctInfo;
    }
    $correctInfo[0]=$match[1];

    $data=getData("http://webgis.sinica.edu.tw/place/".$link);
    $pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
    if(preg_match_all($pattern,$data,$match)){
        // $match[2] holds the value cell of each row; the label cell in
        // $match[1] is not needed.
        foreach($match[2] as $idx=>$info){
            $correctInfo[$idx+1]=$info;
        }
    }
    return $correctInfo;
}

/**
 * Concatenates the CSV pages 01.csv .. 71.csv into all_data.csv.
 *
 * Carriage returns are removed from every page before it is appended.
 *
 * @return void
 */
function writeAllDataToOneFile(){
    $fp=fopen("./data_from_sinica/all_data.csv","w");
    if($fp===false){
        // fopen() has already raised a warning; there is nothing to write to.
        return;
    }
    for($i=1; $i<=71; $i++){
        $data=file_get_contents("./data_from_sinica/".sprintf("%02d",$i).".csv");
        if($data===false){
            // Unreadable page: file_get_contents() has already warned.
            continue;
        }
        fwrite($fp,str_replace("\r","",$data));
    }
    fclose($fp);
}

/**
 * Downloads a URL with cURL and returns the response body.
 *
 * @param string $url Address to fetch.
 * @return string Response body with the character below stripped, or an empty
 *                string when the transfer fails.
 */
function getData($url){
    $curl=curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so 5000 is
    // a very long timeout - possibly milliseconds were intended. Left as is.
    $timeout=5000;
    curl_setopt($curl,CURLOPT_URL,$url);
    curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
    curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
    $data=curl_exec($curl);
    curl_close($curl);
    if($data===false){
        // curl_exec() returns false on failure; callers expect a string.
        return "";
    }
    // NOTE(review): the stripped character shows up as a plain space in this
    // copy; it may be a non-breaking space in the repository - confirm.
    return str_replace(" ","",$data);
}

/**
 * Counts the rows of all_data.csv per source.
 *
 * The file is read as tab-separated values and the second column is used as
 * the source key. Rows without a second column or with an empty one are
 * ignored.
 *
 * @return array Map of source => number of rows; sources 1..7 are always
 *               present, any other key found in the file is added.
 */
function countBooks(){
    $countArray=[1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];
    $fp=fopen("./data_from_sinica/all_data.csv", "r");
    if($fp===false){
        // Without this guard a feof() loop on an invalid handle never ends.
        return $countArray;
    }
    // fgetcsv() returns false at end of file, so it drives the loop directly
    // rather than feof(), which needs one extra failing read.
    while(($book=fgetcsv($fp,1000000,"\t"))!==false){
        // A blank line yields an array with a single null and no index 1.
        if(!isset($book[1]) || $book[1]===""){
            continue;
        }
        $source=$book[1];
        if(!isset($countArray[$source])){
            $countArray[$source]=0;
        }
        $countArray[$source]++;
    }
    fclose($fp);
    return $countArray;
}
// Maintenance step, enable manually when the merged file must be rebuilt:
//writeAllDataToOneFile();
?>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
        <style type="text/css">
            .row{
                display:block;
            }
            .column{
                display:inline-block;
                width:90px;
            }
            table{
                border-collapse:collapse;
            }
            td,th{
                padding:3px 10px;
            }
            th{
                text-align:left;
            }
        </style>
        <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
    </head>
    <body>
        <table>
            <tr><th>source<th># of books<th>note
<?php

    // Maintenance steps, enable manually when needed:
    //findBrokenCharacter();
    //writeAllDataToOneFile();
    $countArray=countBooks();
    // Note shown next to each source in the report.
    $descrArray=[
        1=>"大部份有年號資訊 年代不知為出版或是編纂年代",
        2=>"大部份沒有時間資訊",
        3=>"只有朝代 沒有年號或年份",
        4=>"基本上沒有時間資訊",
        5=>"基本上沒有時間資訊",
        6=>"基本上沒有時間資訊",
        7=>"基本上沒有時間資訊"

    ];
    foreach($countArray as $source=>$count){
        // Sources outside 1..7 can come from the CSV: they have no note and
        // their key is escaped before being written into the page.
        $note=isset($descrArray[$source]) ? $descrArray[$source] : "";
        echo "<tr><td>".htmlspecialchars((string)$source,ENT_QUOTES,'UTF-8')."<td>".$count."<td>".$note;
    }

?>
        </table>
    </body>
</html>