Mercurial > hg > extraction-interface
view interface/insert_new_columns_into_books/get_data_from_sinica.php @ 44:3163d8ca0b62 default
closing default branch, we use extractapp branch
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 13 Mar 2015 09:55:07 +0100 |
parents | b12c99b7c3f0 |
children |
line wrap: on
line source
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<style type="text/css">
.row{
    display:block;
    margin-bottom:20px;
}
.column{
    display:inline-block;
    width:90px;
    vertical-align:top;
}
</style>
<!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
</head>
<body>
<?php
// Remove the execution-time and memory limits for this run.
set_time_limit(0);
ini_set('memory_limit', '-1');

// Filled as a side effect of getBookInfo() (via `global`): source => column names,
// plus a parallel copy of the unconverted names under the 'big5' key.
$columnNameArray=array();
$count=0;

// Fetch result pages 1..71 (500 entries requested per page) and write one CSV pair per page.
for($i=1; $i<=71; $i++){
    $url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$i."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
    $bookInfoArray=getBookInfo($url);
    $fileName=sprintf("%02d",$i).".csv";
    saveBookInfo($bookInfoArray,$fileName);

    // The 'big5' entry is a parallel copy of the rows, not a book, so it must
    // not be included in the running total.
    $booksOnPage=sizeof($bookInfoArray);
    if(isset($bookInfoArray['big5'])){
        $booksOnPage--;
    }
    $count+=$booksOnPage;
}
saveColumnName($columnNameArray);
echo "<br><br>".$count."<br>";
?>
</body>
</html>
<?php
/**
 * Downloads one result-list page, follows every "detail.asp?ID=..&Source=.." link on it
 * and extracts the label/value cells of each detail page.
 *
 * Side effect: records the labels seen for each source in the global $columnNameArray
 * (converted labels under [$source], unconverted labels under ['big5'][$source]).
 *
 * @param string $url URL of the result-list page.
 * @return array Rows indexed 0..n-1, each row being array(id, source, value, value, ...)
 *               converted from BIG5 to UTF-8, plus a 'big5' key holding the same rows with
 *               the unconverted values. An empty array when the page could not be
 *               fetched or contains no detail links.
 */
function getBookInfo($url){
    global $columnNameArray;
    $bookInfoArray=array();

    $data=getData($url);
    if($data===""){
        // Nothing fetched; DOMDocument::loadHTML() rejects empty input.
        return $bookInfoArray;
    }
    $data=mb_convert_encoding($data,"HTML-ENTITIES","BIG5");
    if(!is_string($data)||$data===""){
        return $bookInfoArray;
    }

    // Collect parser complaints about malformed markup internally instead of emitting warnings.
    $bookListDoc=new DOMDocument();
    $previousSetting=libxml_use_internal_errors(true);
    $bookListDoc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $linkPattern="/detail.asp\?ID=([0-9]+)&Source=([0-9]+)/u";
    // The label cell is opened with <th> but closed with </td> in the pattern;
    // presumably this mirrors the markup served by the site.
    $rowPattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
    // Byte pairs treated as self-defined characters.
    $selfDefinedPattern="/[\x81-\xA0][\x40-\xFE]/";

    $bookNumber=0;
    foreach($bookListDoc->getElementsByTagName("a") as $entry){
        // getAttribute() yields "" for anchors without an href, so they are simply skipped.
        $link=$entry->getAttribute("href");
        if(!preg_match($linkPattern,$link,$linkMatch)){
            continue;
        }
        $id=$linkMatch[1];
        $source=$linkMatch[2];
        $detail=getData("http://webgis.sinica.edu.tw/place/".$link);

        $bookInfoArray[$bookNumber][0]=$id;
        $bookInfoArray[$bookNumber][1]=$source;
        $bookInfoArray['big5'][$bookNumber][0]=$id;
        $bookInfoArray['big5'][$bookNumber][1]=$source;
        $columnNameArray[$source][0]=$source;
        $columnNameArray['big5'][$source][0]=$source;

        $infoNumber=2;
        if(preg_match_all($rowPattern,$detail,$rowMatch)){
            foreach($rowMatch[1] as $idx=>$th){
                $th=str_replace("\xA1\x47","",$th);//remove colons
                $th=trim($th);
                $columnNameArray['big5'][$source][$infoNumber]=$th;
                $th=preg_replace($selfDefinedPattern,"\xA1\xBC",$th);//replace the self-defined characters with a square
                $th=mb_convert_encoding($th,"UTF-8","BIG5");
                $columnNameArray[$source][$infoNumber]=$th;

                $info=$rowMatch[2][$idx];
                $bookInfoArray['big5'][$bookNumber][$infoNumber]=$info;
                $info=preg_replace($selfDefinedPattern,"\xA1\xBC",$info);//replace the self-defined characters with a square
                $info=mb_convert_encoding($info,"UTF-8","BIG5");
                $bookInfoArray[$bookNumber][$infoNumber]=$info;

                $infoNumber++;
            }
        }
        $bookNumber++;
    }
    return $bookInfoArray;
}

/**
 * Writes the given rows to a tab-separated file and echoes each row as an HTML
 * "row" div containing one "column" div per cell.
 *
 * @param string $path    File to (over)write.
 * @param array  $rows    List of rows; each row is an array of string cells.
 * @param string $charset Character set of the cells, used only for HTML escaping.
 * @return void
 */
function writeRowsToCsvAndEcho($path,$rows,$charset){
    $fp=fopen($path,"w");
    if($fp===false){
        // Without a handle fputcsv()/fclose() would fail; report and skip this file.
        echo "<div class='row'>Could not open ".htmlspecialchars($path,ENT_QUOTES,"UTF-8")." for writing</div>";
        return;
    }
    foreach($rows as $row){
        echo "<div class='row'>";
        fputcsv($fp,$row,"\t");
        foreach($row as $cell){
            // Cells come from a remote page, so escape them before echoing into HTML.
            echo "<div class='column'>".htmlspecialchars($cell,ENT_QUOTES|ENT_SUBSTITUTE,$charset)."</div>";
        }
        echo "</div>";
    }
    fclose($fp);
}

/**
 * Saves the collected column names: converted names to column_name.csv and the
 * unconverted ones (the 'big5' entry) to big5_column_name.csv, echoing both as HTML.
 *
 * @param array $columnNameArray Structure built by getBookInfo().
 * @return void
 */
function saveColumnName($columnNameArray){
    // The 'big5' entry may be missing when no detail page was ever processed.
    $big5Rows=isset($columnNameArray['big5'])?$columnNameArray['big5']:array();
    unset($columnNameArray['big5']);

    writeRowsToCsvAndEcho("./data_from_sinica/column_name.csv",$columnNameArray,"UTF-8");
    writeRowsToCsvAndEcho("./data_from_sinica/big5_column_name.csv",$big5Rows,"BIG5");
}

/**
 * Saves the rows of one result page: converted rows to $fileName and the
 * unconverted ones (the 'big5' entry) to "big5_".$fileName, echoing both as HTML.
 *
 * @param array  $bookInfoArray Structure returned by getBookInfo().
 * @param string $fileName      Base file name inside ./data_from_sinica/.
 * @return void
 */
function saveBookInfo($bookInfoArray,$fileName){
    // The 'big5' entry is absent when the page produced no books.
    $big5Rows=isset($bookInfoArray['big5'])?$bookInfoArray['big5']:array();
    unset($bookInfoArray['big5']);

    writeRowsToCsvAndEcho("./data_from_sinica/".$fileName,$bookInfoArray,"UTF-8");
    writeRowsToCsvAndEcho("./data_from_sinica/big5_".$fileName,$big5Rows,"BIG5");
}

/**
 * Fetches a URL with cURL and returns the body with "&nbsp;" entities removed.
 *
 * @param string $url URL to fetch.
 * @return string Response body, or "" when the transfer failed.
 */
function getData($url){
    $curl=curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is in seconds, so this waits up to 5000 s
    // for a connection - confirm whether milliseconds were intended.
    $timeout=5000;
    curl_setopt($curl,CURLOPT_URL,$url);
    curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
    curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
    $data=curl_exec($curl);
    curl_close($curl);
    if($data===false){
        return "";
    }
    // Only the non-breaking-space entity is stripped. Removing every literal space
    // would turn `<a href=` into `<ahref=` and break the `<th class="calc" ...>`
    // pattern in getBookInfo(), both of which depend on spaces being present.
    // NOTE(review): presumably "&nbsp;" was the original search string - confirm.
    return str_replace("&nbsp;","",$data);
}
?>