Mercurial > hg > extraction-interface
diff interface/insert_new_columns_into_books/get_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
line wrap: on
line diff
<?php
/**
 * interface/insert_new_columns_into_books/get_data_from_sinica.php
 *
 * Scrapes the place-name query on webgis.sinica.edu.tw: walks result pages
 * 1..71 (500 entries per page), follows every "detail.asp?ID=..&Source=.."
 * link, and extracts the header/value cell pairs of each detail page.
 *
 * Output (relative to the current working directory):
 *   ./data_from_sinica/NN.csv                one row per book, values as UTF-8
 *   ./data_from_sinica/big5_NN.csv           same rows, raw bytes as fetched
 *   ./data_from_sinica/column_name.csv       one row of column names per source, UTF-8
 *   ./data_from_sinica/big5_column_name.csv  same rows, raw bytes as fetched
 * All files are tab-delimited. Every row is also echoed into the HTML page,
 * followed by the total number of books scraped.
 */
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
        <style type="text/css">
            .row{
                display:block;
                margin-bottom:20px;
            }
            .column{
                display:inline-block;
                width:90px;
                vertical-align:top;
            }
        </style>
        <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
    </head>
    <body>
<?php
// The crawl issues one request per result page plus one per book, so lift
// the execution-time and memory limits for this run.
set_time_limit(0);
ini_set('memory_limit', '-1');

// Filled by getBookInfo() through `global`: column names keyed by source id,
// with the raw (unconverted) copies under the 'big5' key.
$columnNameArray=array();
$count=0;
for($i=1; $i<=71; $i++){
    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$i."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $bookInfoArray=getBookInfo($url);
    $fileName=sprintf("%02d",$i).".csv";
    saveBookInfo($bookInfoArray,$fileName);
    // Count books only. The result array also carries the 'big5' sub-array
    // as an extra top-level key, so sizeof($bookInfoArray) was one too high
    // for every page that returned at least one book.
    if(isset($bookInfoArray['big5'])){
        $count+=count($bookInfoArray['big5']);
    }
}
saveColumnName($columnNameArray);
echo "<br><br>".$count."<br>";
?>
    </body>
</html>
<?php
/**
 * Scrape one result page and the detail page of every book linked from it.
 *
 * Side effect: records the column names of each source in the global
 * $columnNameArray ([$source][n] converted to UTF-8, ['big5'][$source][n] raw).
 *
 * @param string $url URL of one result page.
 * @return array Books indexed 0..n-1, each array(0=>id, 1=>source, 2..=>values
 *               converted to UTF-8), plus a 'big5' key holding the same rows
 *               with the values as fetched. Empty array when the page could
 *               not be fetched or contains no detail links.
 */
function getBookInfo($url){
    global $columnNameArray;
    $bookInfoArray=array();

    $listing=getData($url);
    if($listing===""){
        // Fetch failed or returned nothing. DOMDocument::loadHTML() rejects
        // an empty string (ValueError on PHP 8), so stop here.
        return $bookInfoArray;
    }
    // Encode every non-ASCII character as an HTML entity so the DOM parser
    // does not have to guess the page encoding.
    $listing=mb_convert_encoding($listing,"HTML-ENTITIES","BIG5");

    $bookListDoc=new DOMDocument();
    // Collect markup warnings internally instead of silencing them with @.
    $previousSetting=libxml_use_internal_errors(true);
    $bookListDoc->loadHTML($listing);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $linkPattern="/detail.asp\?ID=([0-9]+)&Source=([0-9]+)/u";
    // One header cell followed by one value cell. The header cell is closed
    // with </td> in this pattern; presumably that mirrors the site's markup.
    $cellPattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';

    $bookNumber=0;
    foreach($bookListDoc->getElementsByTagName("a") as $entry){
        // getAttribute() yields "" for anchors without an href, whereas
        // attributes->getNamedItem("href")->value dereferenced null.
        $link=$entry->getAttribute("href");
        if(!preg_match($linkPattern,$link,$match)){
            continue;
        }
        $id=$match[1];
        $source=$match[2];
        // The detail page is matched as raw bytes, without any conversion.
        $detail=getData("http://webgis.sinica.edu.tw/place/".$link);

        $bookInfoArray[$bookNumber][0]=$id;
        $bookInfoArray[$bookNumber][1]=$source;
        $bookInfoArray['big5'][$bookNumber][0]=$id;
        $bookInfoArray['big5'][$bookNumber][1]=$source;
        $columnNameArray[$source][0]=$source;
        $columnNameArray['big5'][$source][0]=$source;

        // Slots 0 and 1 hold id and source; scraped columns start at 2.
        $infoNumber=2;
        if(preg_match_all($cellPattern,$detail,$cells)){
            foreach($cells[1] as $idx=>$th){
                $th=str_replace("\xA1\x47","",$th);//remove colons
                $th=trim($th);
                $columnNameArray['big5'][$source][$infoNumber]=$th;
                $columnNameArray[$source][$infoNumber]=convertBig5ToUtf8($th);

                $info=$cells[2][$idx];
                $bookInfoArray['big5'][$bookNumber][$infoNumber]=$info;
                $bookInfoArray[$bookNumber][$infoNumber]=convertBig5ToUtf8($info);
                $infoNumber++;
            }
        }
        $bookNumber++;
    }
    return $bookInfoArray;
}

/**
 * Convert a Big5 byte string to UTF-8.
 *
 * @param string $text Big5-encoded bytes.
 * @return string UTF-8 text.
 */
function convertBig5ToUtf8($text){
    //replace the self-defined characters with a square
    $text=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$text);
    return mb_convert_encoding($text,"UTF-8","BIG5");
}

/**
 * Write the collected column names to column_name.csv (converted) and
 * big5_column_name.csv (raw), echoing every row into the page.
 *
 * @param array $columnNameArray Rows keyed by source id, raw copies under 'big5'.
 */
function saveColumnName($columnNameArray){
    // The 'big5' key is absent when nothing was scraped at all.
    $big5Rows=isset($columnNameArray['big5']) ? $columnNameArray['big5'] : array();
    unset($columnNameArray['big5']);
    saveAndEchoRows("./data_from_sinica/column_name.csv",$columnNameArray);
    saveAndEchoRows("./data_from_sinica/big5_column_name.csv",$big5Rows);
}

/**
 * Write the books of one result page to <fileName> (converted) and
 * big5_<fileName> (raw), echoing every row into the page.
 *
 * @param array  $bookInfoArray Result of getBookInfo().
 * @param string $fileName      Target file name inside ./data_from_sinica/.
 */
function saveBookInfo($bookInfoArray,$fileName){
    // The 'big5' key is absent when the page yielded no books.
    $big5Rows=isset($bookInfoArray['big5']) ? $bookInfoArray['big5'] : array();
    unset($bookInfoArray['big5']);
    saveAndEchoRows("./data_from_sinica/".$fileName,$bookInfoArray);
    saveAndEchoRows("./data_from_sinica/big5_".$fileName,$big5Rows);
}

/**
 * Write rows to a tab-delimited file and echo each row as a block of
 * .column cells. An empty $rows still creates (truncates) the file.
 *
 * @param string $path Destination file.
 * @param array  $rows List of rows, each an array of cell strings.
 */
function saveAndEchoRows($path,$rows){
    $fp=fopen($path,"w");
    if($fp===false){
        // fopen() has already raised a warning naming the path. Skip this
        // file rather than hand `false` to fputcsv()/fclose().
        return;
    }
    foreach($rows as $row){
        echo "<div class='row'>";
        fputcsv($fp,$row,"\t");
        // NOTE(review): cells are echoed unescaped. They hold fragments
        // captured from the remote pages, and the raw rows are Big5 bytes
        // inside a page declared as UTF-8. Confirm whether that is intended.
        foreach($row as $cell){
            echo "<div class='column'>".$cell."</div>";
        }
        echo "</div>";
    }
    fclose($fp);
}

/**
 * Fetch a URL with cURL.
 *
 * @param string $url Address to fetch.
 * @return string Response body with "&nbsp;" entities removed, or "" when
 *                the transfer failed.
 */
function getData($url){
    $curl=curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so this
    // allows roughly 83 minutes to connect. Confirm that ms was not intended.
    $timeout=5000;
    curl_setopt($curl,CURLOPT_URL,$url);
    curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
    curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
    $data=curl_exec($curl);
    if($data===false){
        // Report the failure instead of letting `false` flow on silently.
        trigger_error("Could not fetch ".$url.": ".curl_error($curl),E_USER_WARNING);
        $data="";
    }
    curl_close($curl);
    // Strip non-breaking-space entities. The code as reviewed stripped every
    // plain space, which also deleted the spaces inside tag attributes that
    // the cell pattern in getBookInfo() has to match.
    // NOTE(review): "&nbsp;" is the presumed original literal - confirm
    // against the repository copy.
    return str_replace("&nbsp;","",$data);
}

?>