diff interface/insert_new_columns_into_books/get_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interface/insert_new_columns_into_books/get_data_from_sinica.php	Mon Jan 19 17:13:49 2015 +0100
@@ -0,0 +1,167 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+        <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+                <style type="text/css">
+			.row{
+				display:block;
+				margin-bottom:20px;
+			}
+			.column{
+				display:inline-block;
+				width:90px;
+				vertical-align:top;
+			}
+		</style>
+		<!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
+        </head>
+        <body>
+<?php
+// Driver: walks result pages 1..71 of the Sinica place query, stores each page's
+// books in ./data_from_sinica/NN.csv (plus a big5_ twin) via saveBookInfo(), then
+// writes the collected column names and prints the total number of books.
+set_time_limit(0);             // no execution time limit: one request is made per book
+ini_set('memory_limit', '-1'); // no memory limit
+// Filled by getBookInfo() through its `global` declaration.
+$columnNameArray=array();
+$count=0;
+for($i=1; $i<=71; $i++){
+	$url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$i."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
+	$bookInfoArray=getBookInfo($url);
+	$fileName=sprintf("%02d",$i).".csv";
+	saveBookInfo($bookInfoArray,$fileName);
+	// getBookInfo() stores a mirror of all rows under the extra 'big5' key;
+	// that key is not a book, so leave it out of the running total.
+	$count+=count($bookInfoArray)-(isset($bookInfoArray['big5'])?1:0);
+}
+saveColumnName($columnNameArray);
+echo "<br><br>".$count."<br>";
+?>
+	</body>
+</html>
+<?php
+/**
+ * Scrape one result-list page and the detail page of every book linked from it.
+ *
+ * The list page is converted from BIG5 to HTML entities and parsed with
+ * DOMDocument; every anchor whose href matches detail.asp?ID=<n>&Source=<n> is
+ * followed, and the header/value cell pairs of the detail page are extracted
+ * with a regular expression.
+ *
+ * Side effect: fills the global $columnNameArray, keyed by source, with the
+ * column names found (UTF-8), and $columnNameArray['big5'] with the same names
+ * as raw bytes. Index 0 holds the source and names start at index 2; index 1 is
+ * never set for column names.
+ *
+ * @param string $url Address of the result-list page.
+ * @return array Rows keyed 0..n-1, each array(0=>id, 1=>source, 2..=>values converted
+ *               to UTF-8), plus a 'big5' key holding the same rows with unconverted
+ *               values. The 'big5' key is absent when no detail link was found.
+ */
+function getBookInfo($url){
+	global $columnNameArray;
+	$bookInfoArray=array();
+	$data=mb_convert_encoding(getData($url),"HTML-ENTITIES","BIG5");
+	// A failed download leaves nothing to parse, and DOMDocument::loadHTML()
+	// rejects empty input (warning on older PHP, ValueError on PHP 8).
+	if(!is_string($data)||$data===""){
+		return $bookInfoArray;
+	}
+	$bookListDoc=new DOMDocument();
+	@$bookListDoc->loadHTML($data); // the scraped markup is not clean HTML; parser warnings are muted
+
+	// Loop-invariant patterns.
+	$linkPattern='/detail\.asp\?ID=([0-9]+)&Source=([0-9]+)/u';
+	$rowPattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
+	// Byte pairs whose lead byte is 0x81-0xA0 (the self-defined characters).
+	$selfDefinedPattern="/[\x81-\xA0][\x40-\xFE]/";
+
+	$bookNumber=0;
+	foreach($bookListDoc->getElementsByTagName("a") as $entry){
+		// getAttribute() returns "" for anchors that carry no href, so such
+		// anchors simply fail the match instead of dereferencing null.
+		$link=$entry->getAttribute("href");
+		if(!preg_match($linkPattern,$link,$match)){
+			continue;
+		}
+		$id=$match[1];
+		$source=$match[2];
+		$detail=getData("http://webgis.sinica.edu.tw/place/".$link);
+
+		$bookInfoArray[$bookNumber][0]=$id;
+		$bookInfoArray[$bookNumber][1]=$source;
+		$bookInfoArray['big5'][$bookNumber][0]=$id;
+		$bookInfoArray['big5'][$bookNumber][1]=$source;
+		$columnNameArray[$source][0]=$source;
+		$columnNameArray['big5'][$source][0]=$source;
+
+		$infoNumber=2;
+		if(preg_match_all($rowPattern,$detail,$cells)){
+			foreach($cells[1] as $idx=>$th){
+				$th=str_replace("\xA1\x47","",$th);//remove colons
+				$th=trim($th);
+				$columnNameArray['big5'][$source][$infoNumber]=$th;
+				$th=preg_replace($selfDefinedPattern,"\xA1\xBC",$th);//replace the self-defined characters with a square
+				$th=mb_convert_encoding($th,"UTF-8","BIG5");
+				$columnNameArray[$source][$infoNumber]=$th;
+
+				$info=$cells[2][$idx];
+				$bookInfoArray['big5'][$bookNumber][$infoNumber]=$info;
+				$info=preg_replace($selfDefinedPattern,"\xA1\xBC",$info);//replace the self-defined characters with a square
+				$info=mb_convert_encoding($info,"UTF-8","BIG5");
+				$bookInfoArray[$bookNumber][$infoNumber]=$info;
+				$infoNumber++;
+			}
+		}
+		$bookNumber++;
+	}
+	return $bookInfoArray;
+}
+/**
+ * Write the collected column names to disk and echo them as HTML rows.
+ *
+ * Every entry except the one under the 'big5' key goes to
+ * ./data_from_sinica/column_name.csv; the entries stored under 'big5' go to
+ * ./data_from_sinica/big5_column_name.csv. Both files are tab-separated.
+ * Each row is also echoed as a <div class='row'> of <div class='column'> cells;
+ * the values are echoed as-is, without HTML escaping.
+ *
+ * @param array $columnNameArray Rows of column names, optionally with a 'big5' key
+ *                               holding a second set of rows.
+ * @return void
+ */
+function saveColumnName($columnNameArray){
+	// The 'big5' key only exists once at least one book has been scraped.
+	$big5Rows=array();
+	if(isset($columnNameArray['big5'])&&is_array($columnNameArray['big5'])){
+		$big5Rows=$columnNameArray['big5'];
+	}
+	$plainRows=$columnNameArray;
+	unset($plainRows['big5']);
+
+	$targets=array(
+		"./data_from_sinica/column_name.csv"=>$plainRows,
+		"./data_from_sinica/big5_column_name.csv"=>$big5Rows,
+	);
+	foreach($targets as $path=>$rows){
+		// fopen() already raises a warning when it fails; the rows are then
+		// still echoed, only the file output is skipped.
+		$fp=fopen($path,"w");
+		foreach($rows as $columnName){
+			echo "<div class='row'>";
+			if($fp!==false){
+				fputcsv($fp,$columnName,"\t");
+			}
+			foreach($columnName as $name){
+				echo "<div class='column'>".$name."</div>";	
+			}
+			echo "</div>";
+		}
+		if($fp!==false){
+			fclose($fp);
+		}
+	}
+}
+/**
+ * Write one page of scraped books to disk and echo them as HTML rows.
+ *
+ * Every entry except the one under the 'big5' key goes to
+ * ./data_from_sinica/<fileName>; the entries stored under 'big5' go to
+ * ./data_from_sinica/big5_<fileName>. Both files are tab-separated.
+ * Each row is also echoed as a <div class='row'> of <div class='column'> cells;
+ * the values are echoed as-is, without HTML escaping.
+ *
+ * @param array  $bookInfoArray Book rows, optionally with a 'big5' key holding a
+ *                              second set of rows.
+ * @param string $fileName      Name of the output file inside ./data_from_sinica/.
+ * @return void
+ */
+function saveBookInfo($bookInfoArray,$fileName){
+	// A page without any book has no 'big5' key at all.
+	$big5Books=array();
+	if(isset($bookInfoArray['big5'])&&is_array($bookInfoArray['big5'])){
+		$big5Books=$bookInfoArray['big5'];
+	}
+	$plainBooks=$bookInfoArray;
+	unset($plainBooks['big5']);
+
+	$targets=array(
+		"./data_from_sinica/".$fileName=>$plainBooks,
+		"./data_from_sinica/big5_".$fileName=>$big5Books,
+	);
+	foreach($targets as $path=>$books){
+		// fopen() already raises a warning when it fails; the rows are then
+		// still echoed, only the file output is skipped.
+		$fp=fopen($path,"w");
+		foreach($books as $book){
+			echo "<div class='row'>";
+			if($fp!==false){
+				fputcsv($fp,$book,"\t");
+			}
+			foreach($book as $info){
+				echo "<div class='column'>".$info."</div>";
+			}
+			echo "</div>";
+		}
+		if($fp!==false){
+			fclose($fp);
+		}
+	}
+}
+/**
+ * Fetch a URL with cURL and return the response body with every "&nbsp;" removed.
+ *
+ * A failed transfer is reported with a warning that names the URL and the cURL
+ * error; the function then returns "" (the same value callers received before,
+ * when the failure went unreported).
+ *
+ * @param string $url Address to request.
+ * @return string Response body without "&nbsp;" entities, or "" on failure.
+ */
+function getData($url){
+	$curl=curl_init();
+	if($curl===false){
+		trigger_error("getData: could not initialise cURL for ".$url,E_USER_WARNING);
+		return "";
+	}
+	// NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so this allows
+	// 5000 s for the connection phase; confirm that is intended.
+	$timeout=5000;
+	curl_setopt($curl,CURLOPT_URL,$url);
+	curl_setopt($curl,CURLOPT_RETURNTRANSFER,1); // return the body instead of printing it
+	curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
+	$data=curl_exec($curl);
+	if($data===false){
+		// Read the error before the handle is closed.
+		$reason=curl_error($curl);
+		curl_close($curl);
+		trigger_error("getData: request to ".$url." failed: ".$reason,E_USER_WARNING);
+		return "";
+	}
+	curl_close($curl);
+	return str_replace("&nbsp;","",$data);
+}
+
+?>