Mercurial > hg > extraction-interface
view interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
line wrap: on
line source
<?php
// The maintenance helpers below can run for a long time and load whole CSV
// files into memory, so both the time limit and the memory limit are lifted.
set_time_limit(0);
ini_set('memory_limit', '-1');

/**
 * Obsolete function.
 *
 * Scans the page CSV files under ./data_from_sinica/ (currently only page 17)
 * for lines containing "td", reports each such line, and prints the values
 * re-fetched by getCorrectInfo() for that line, converted from Big5 to UTF-8.
 *
 * @return void
 * @throws RuntimeException when a page CSV cannot be opened for reading
 */
function findBrokenCharacter(){
    for($i=17; $i<=17; $i++){
        $page=sprintf("%02d",$i);
        $fileName="./data_from_sinica/".$page.".csv";
        $fp=fopen($fileName,"r");
        if($fp===false){
            // feof() on an invalid handle never becomes true, so bail out early.
            throw new RuntimeException("Cannot open ".$fileName." for reading");
        }
        // The "_NN.csv" file is created (truncated) exactly as before, but
        // nothing is ever written to it, so the handle is released right away.
        $cfp=fopen("./data_from_sinica/_".$page.".csv","w");
        if($cfp!==false){
            fclose($cfp);
        }

        $j=0;
        echo "page ".$i."<br>";
        while(!feof($fp)){
            $line=fgets($fp);
            if(preg_match("/td/",$line)){
                echo "find the broken character at line ".($j+1)."<br>";
                $correctInfo=getCorrectInfo($i,$j+1);
                foreach($correctInfo as $column){
                    // Byte pairs in the lead-byte range 0x81-0xA0 are replaced
                    // with the fixed pair 0xA1 0xBC before the Big5 -> UTF-8
                    // conversion.
                    $replaced=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$column);
                    $column=mb_convert_encoding($replaced,"UTF-8","BIG5");
                    echo "".$column."\t";
                }
                echo "<br>";
                // Consume one more physical line without counting it.
                // NOTE(review): presumably the broken record spills onto the
                // following line - confirm against the data files.
                $line=fgets($fp);
            }
            $j++;
        }
        echo "page ".$i." has ".($j-1)." lines<br><br>";
        // Close inside the loop so every opened page file is released.
        fclose($fp);
    }
}

/**
 * Obsolete function.
 *
 * Downloads result page $page of the webgis.sinica.edu.tw place query
 * (500 entries per page), picks the anchor that corresponds to $line,
 * follows its detail.asp link and collects the value cells of the detail page.
 *
 * The detail page is returned as fetched (no encoding conversion is done here).
 *
 * @param int $page result page number sent as the "Page" query parameter
 * @param int $line 1-based line number of the entry within that page
 * @return array index 0 holds the "Source" id taken from the link; indexes
 *               1..n hold the captured value cells. Empty when the entry or
 *               its detail link cannot be found.
 */
function getCorrectInfo($page,$line){
    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $data=getData($url);
    $data=mb_convert_encoding($data,"HTML-ENTITIES","BIG5");

    $bookListDoc=new DOMDocument();
    // "@" silences any parser warnings raised by loadHTML().
    @$bookListDoc->loadHTML($data);
    $bookList=$bookListDoc->getElementsByTagName("a");

    $correctInfo=array();

    // NOTE(review): the +5 offset presumably skips anchors that precede the
    // result list on the page - confirm against the live markup.
    $entry=$bookList->item($line-1+5);
    if($entry===null || $entry->attributes===null){
        return $correctInfo;
    }
    $href=$entry->attributes->getNamedItem("href");
    if($href===null){
        return $correctInfo;
    }
    $link=$href->value;

    $pattern="/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u";
    if(!preg_match($pattern,$link,$match)){
        return $correctInfo;
    }
    $correctInfo[0]=$match[1];

    $data=getData("http://webgis.sinica.edu.tw/place/".$link);
    $pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
    if(preg_match_all($pattern,$data,$match)){
        // Only the second capture group (the value cell) is kept.
        foreach($match[2] as $idx=>$info){
            $correctInfo[$idx+1]=$info;
        }
    }
    return $correctInfo;
}

/**
 * Concatenates ./data_from_sinica/01.csv .. 71.csv into
 * ./data_from_sinica/all_data.csv, removing every carriage return.
 *
 * Page files that cannot be read contribute nothing to the output.
 *
 * @return void
 * @throws RuntimeException when the output file cannot be opened for writing
 */
function writeAllDataToOneFile(){
    $fileName="./data_from_sinica/all_data.csv";
    $fp=fopen($fileName,"w");
    if($fp===false){
        throw new RuntimeException("Cannot open ".$fileName." for writing");
    }
    for($i=1; $i<=71; $i++){
        $pageFile="./data_from_sinica/".sprintf("%02d",$i).".csv";
        $data=file_get_contents($pageFile);
        if($data===false){
            // Unreadable page: nothing to append (file_get_contents already warned).
            continue;
        }
        fwrite($fp,str_replace("\r","",$data));
    }
    fclose($fp);
}

/**
 * Fetches $url with cURL and returns the response body after removing every
 * occurrence of the search string below.
 *
 * @param string $url address to download
 * @return string the filtered body, or "" when the transfer fails
 */
function getData($url){
    $curl=curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so 5000 is
    // roughly 83 minutes; if milliseconds were intended, use
    // CURLOPT_CONNECTTIMEOUT_MS - confirm before changing.
    $timeout=5000;
    curl_setopt($curl,CURLOPT_URL,$url);
    curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
    curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
    $data=curl_exec($curl);
    curl_close($curl);
    if($data===false){
        return "";
    }
    // NOTE(review): stripping every plain space also removes the spaces that
    // the detail-page pattern in getCorrectInfo() expects; the search string
    // may originally have been a non-breaking space or an entity - confirm
    // against the repository copy of this file.
    $data=str_replace(" ","",$data);
    return $data;
}

/**
 * Counts the rows of ./data_from_sinica/all_data.csv per value of the second
 * tab-separated column.
 *
 * Rows without a second column (including blank lines) or with an empty one
 * are ignored.
 *
 * @return array map of second-column value => number of rows; keys 1..7 are
 *               always present, other values are added as they are encountered
 * @throws RuntimeException when the CSV file cannot be opened
 */
function countBooks(){
    $fileName="./data_from_sinica/all_data.csv";
    $countArray=[1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];

    $fp=fopen($fileName,"r");
    if($fp===false){
        // Looping on feof() with an invalid handle would never terminate.
        throw new RuntimeException("Cannot open ".$fileName." for reading");
    }
    // fgetcsv() returns false at end of file and a single-null row for blank
    // lines, so both cases are filtered before column 1 is read.
    while(($book=fgetcsv($fp,1000000,"\t"))!==false){
        if(!isset($book[1]) || $book[1]===""){
            continue;
        }
        $source=$book[1];
        if(!isset($countArray[$source])){
            $countArray[$source]=0;
        }
        $countArray[$source]++;
    }
    fclose($fp);
    return $countArray;
}
//writeAllDataToOneFile();
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<style type="text/css">
.row{ display:block; }
.column{ display:inline-block; width:90px; }
table{ border-collapse:collapse; }
td,th{ padding:3px 10px; }
th{ text-align:left; }
</style>
<!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
</head>
<body>
<table>
<tr><th>source<th># of books<th>note
<?php
// One-off maintenance steps; uncomment to run them before rendering the table.
//findBrokenCharacter();
//writeAllDataToOneFile();
$countArray=countBooks();
$descrArray=[
    1=>"大部份有年號資訊 年代不知為出版或是編纂年代",
    2=>"大部份沒有時間資訊",
    3=>"只有朝代 沒有年號或年份",
    4=>"基本上沒有時間資訊",
    5=>"基本上沒有時間資訊",
    6=>"基本上沒有時間資訊",
    7=>"基本上沒有時間資訊"
];
foreach($countArray as $source=>$count){
    // Source ids come from the data file: escape them, and fall back to an
    // empty note for ids that have no description.
    $note=isset($descrArray[$source]) ? $descrArray[$source] : "";
    echo "<tr><td>".htmlspecialchars((string)$source,ENT_QUOTES,"UTF-8")."<td>".$count."<td>".$note;
}
?>
</table>
</body>
</html>