view interface/insert_new_columns_into_books/get_data_from_sinica.php @ 44:3163d8ca0b62 default

closing default branch, we use extractapp branch
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Fri, 13 Mar 2015 09:55:07 +0100
parents b12c99b7c3f0
children
line wrap: on
line source

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
        <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
                <style type="text/css">
			.row{
				display:block;
				margin-bottom:20px;
			}
			.column{
				display:inline-block;
				width:90px;
				vertical-align:top;
			}
		</style>
		<!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
        </head>
        <body>
<?php
// Driver: walk every result-list page of the Sinica place query, scrape the
// books linked from each page, write one pair of CSV files per page, and
// finally write the column names collected along the way.

// The crawl issues one request per book, so lift PHP's execution-time and memory limits.
set_time_limit(0);
ini_set('memory_limit', '-1');

// Filled by getBookInfo() through `global`; must exist before the first call.
$columnNameArray=array();
$count=0;

// NOTE(review): 71 pages of 500 entries is hard-coded; presumably it matched
// the size of the result set when the script was written - confirm before re-running.
for($i=1; $i<=71; $i++){
	$url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$i."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
	$bookInfoArray=getBookInfo($url);
	// Output files are named 01.csv, 02.csv, ... after the page number.
	$fileName=sprintf("%02d",$i).".csv";
	saveBookInfo($bookInfoArray,$fileName);
	// The 'big5' entry is a parallel copy of the rows, not a book of its own,
	// so it must not be included in the running total.
	$count+=sizeof($bookInfoArray)-(isset($bookInfoArray['big5'])?1:0);
}
saveColumnName($columnNameArray);
// Total number of books scraped over all pages.
echo "<br><br>".$count."<br>";
?>
	</body>
</html>
<?php
/**
 * Scrape one result-list page and the detail page of every book linked from it.
 *
 * Every <a> element whose href matches detail.asp?ID=<digits>&Source=<digits>
 * counts as one book. Its detail page is downloaded with getData() and the
 * header/value cell pairs are pulled out of the raw bytes with a regular
 * expression; the bytes are treated as Big5 when converting to UTF-8.
 *
 * Layout of the returned array:
 *   $result[$n]         = array($id, $source, value, value, ...) with values in UTF-8
 *   $result['big5'][$n] = the same row with the values left unconverted
 * The 'big5' key exists only when at least one book was found.
 *
 * Side effect: the global $columnNameArray receives, per source number, the
 * header cell texts ($columnNameArray[$source] in UTF-8 and
 * $columnNameArray['big5'][$source] unconverted); index 0 holds the source number.
 *
 * @param string $url URL of a result-list page.
 * @return array Rows as described above; empty when the page could not be fetched.
 */
function getBookInfo($url){
	global $columnNameArray;
	$bookInfoArray=array();

	$data=getData($url);
	// Nothing to parse after a failed download; DOMDocument::loadHTML()
	// refuses an empty string (ValueError on PHP 8).
	if(!is_string($data) || $data===""){
		return $bookInfoArray;
	}
	// Turn every non-ASCII character into a numeric entity so the parser does
	// not have to guess the encoding. This replaces the "HTML-ENTITIES"
	// mbstring target, which is deprecated since PHP 8.2.
	$data=mb_encode_numericentity(
		mb_convert_encoding($data,"UTF-8","BIG5"),
		array(0x80,0x10FFFF,0,0x1FFFFF),
		"UTF-8"
	);
	if(!is_string($data) || $data===""){
		return $bookInfoArray;
	}

	$bookListDoc=new DOMDocument();
	// '@' silences the libxml warnings raised for malformed markup.
	@$bookListDoc->loadHTML($data);

	$linkPattern='/detail\.asp\?ID=([0-9]+)&Source=([0-9]+)/u';
	// The header cell is opened with <th> but closed with </td>; the pattern
	// follows that markup literally.
	$cellPattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';

	$bookNumber=0;
	foreach($bookListDoc->getElementsByTagName("a") as $entry){
		// getAttribute() yields "" for anchors without an href, so such
		// anchors simply fail the pattern instead of dereferencing null.
		$link=$entry->getAttribute("href");
		if(!preg_match($linkPattern,$link,$linkMatch)){
			continue;
		}
		$id=$linkMatch[1];
		$source=$linkMatch[2];
		$detail=getData("http://webgis.sinica.edu.tw/place/".$link);

		$bookInfoArray[$bookNumber][0]=$id;
		$bookInfoArray[$bookNumber][1]=$source;
		$bookInfoArray['big5'][$bookNumber][0]=$id;
		$bookInfoArray['big5'][$bookNumber][1]=$source;
		$columnNameArray[$source][0]=$source;
		$columnNameArray['big5'][$source][0]=$source;

		// Slots 0 and 1 are taken by id and source; scraped fields start at 2.
		$infoNumber=2;
		if(is_string($detail) && preg_match_all($cellPattern,$detail,$cells)){
			foreach($cells[1] as $idx=>$th){
				$th=str_replace("\xA1\x47","",$th);//remove colons
				$th=trim($th);
				$columnNameArray['big5'][$source][$infoNumber]=$th;
				$columnNameArray[$source][$infoNumber]=sinicaBig5ToUtf8($th);

				$info=$cells[2][$idx];
				$bookInfoArray['big5'][$bookNumber][$infoNumber]=$info;
				$bookInfoArray[$bookNumber][$infoNumber]=sinicaBig5ToUtf8($info);
				$infoNumber++;
			}
		}
		$bookNumber++;
	}
	return $bookInfoArray;
}

/**
 * Convert a scraped byte string (treated as Big5) to UTF-8.
 *
 * Two-byte sequences with a lead byte in 0x81-0xA0 (the self-defined
 * characters) are first replaced with the square character "\xA1\xBC".
 *
 * @param string $text Unconverted cell text.
 * @return string The text in UTF-8.
 */
function sinicaBig5ToUtf8($text){
	$text=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$text);//replace the self-defined characters with a square
	return mb_convert_encoding($text,"UTF-8","BIG5");
}
/**
 * Write the collected column names to two tab-separated files and echo them as HTML.
 *
 * Every entry of $columnNameArray except the one under the 'big5' key is one
 * row of ./data_from_sinica/column_name.csv; the entries of
 * $columnNameArray['big5'] are the rows of ./data_from_sinica/big5_column_name.csv.
 * Each row is also echoed as a <div class='row'> of <div class='column'> cells.
 * Cell values are echoed verbatim (no HTML escaping).
 *
 * A missing 'big5' entry is treated as "no rows". If a file cannot be opened
 * the rows are still echoed and only the file output is skipped (fopen()
 * reports its own warning).
 *
 * @param array $columnNameArray Rows keyed by source number, plus an optional 'big5' sub-array.
 * @return void
 */
function saveColumnName($columnNameArray){
	$big5Rows=array();
	if(isset($columnNameArray['big5']) && is_array($columnNameArray['big5'])){
		$big5Rows=$columnNameArray['big5'];
	}
	// $columnNameArray is a by-value copy, so dropping the key does not affect the caller.
	unset($columnNameArray['big5']);

	$outputs=array(
		"./data_from_sinica/column_name.csv"=>$columnNameArray,
		"./data_from_sinica/big5_column_name.csv"=>$big5Rows,
	);
	foreach($outputs as $path=>$rows){
		$fp=fopen($path,"w");
		foreach($rows as $columnName){
			echo "<div class='row'>";
			if($fp!==false){
				// Enclosure and escape are spelled out (they equal the historical
				// defaults) because relying on the implicit escape default is deprecated.
				fputcsv($fp,$columnName,"\t",'"',"\\");
			}
			foreach($columnName as $name){
				echo "<div class='column'>".$name."</div>";
			}
			echo "</div>";
		}
		if($fp!==false){
			fclose($fp);
		}
	}
}
/**
 * Write the books of one result page to two tab-separated files and echo them as HTML.
 *
 * Every entry of $bookInfoArray except the one under the 'big5' key is one row
 * of ./data_from_sinica/<fileName>; the entries of $bookInfoArray['big5'] are
 * the rows of ./data_from_sinica/big5_<fileName>. Each row is also echoed as a
 * <div class='row'> of <div class='column'> cells. Cell values are echoed
 * verbatim (no HTML escaping).
 *
 * A missing 'big5' entry (a page on which no book was found) is treated as
 * "no rows". If a file cannot be opened the rows are still echoed and only the
 * file output is skipped (fopen() reports its own warning).
 *
 * @param array  $bookInfoArray Rows indexed by book number, plus an optional 'big5' sub-array.
 * @param string $fileName      Base file name, e.g. "01.csv".
 * @return void
 */
function saveBookInfo($bookInfoArray,$fileName){
	$big5Books=array();
	if(isset($bookInfoArray['big5']) && is_array($bookInfoArray['big5'])){
		$big5Books=$bookInfoArray['big5'];
	}
	// $bookInfoArray is a by-value copy, so dropping the key does not affect the caller.
	unset($bookInfoArray['big5']);

	$outputs=array(
		"./data_from_sinica/".$fileName=>$bookInfoArray,
		"./data_from_sinica/big5_".$fileName=>$big5Books,
	);
	foreach($outputs as $path=>$books){
		$fp=fopen($path,"w");
		foreach($books as $book){
			echo "<div class='row'>";
			if($fp!==false){
				// Enclosure and escape are spelled out (they equal the historical
				// defaults) because relying on the implicit escape default is deprecated.
				fputcsv($fp,$book,"\t",'"',"\\");
			}
			foreach($book as $info){
				echo "<div class='column'>".$info."</div>";
			}
			echo "</div>";
		}
		if($fp!==false){
			fclose($fp);
		}
	}
}
/**
 * Download a URL with cURL and return the body with every "&nbsp;" removed.
 *
 * Failures are reported with an E_USER_WARNING naming the URL and the cURL
 * error, and an empty string is returned, so callers always receive a string.
 *
 * @param string $url Address to fetch.
 * @return string Response body without "&nbsp;", or "" on failure.
 */
function getData($url){
	$curl=curl_init();
	if($curl===false){
		trigger_error("getData: could not initialise cURL for ".$url,E_USER_WARNING);
		return "";
	}
	// NOTE(review): CURLOPT_CONNECTTIMEOUT is measured in seconds, so this
	// allows 5000 s for the connection; milliseconds may have been intended
	// (CURLOPT_CONNECTTIMEOUT_MS). Left unchanged - confirm before tightening.
	$timeout=5000;
	curl_setopt($curl,CURLOPT_URL,$url);
	curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
	curl_setopt($curl,CURLOPT_CONNECTTIMEOUT,$timeout);
	$data=curl_exec($curl);
	// Read the error text before the handle is released.
	$error=($data===false)?curl_error($curl):"";
	curl_close($curl);
	if(!is_string($data)){
		// curl_exec() returns false on failure; do not let it leak into str_replace().
		trigger_error("getData: request failed for ".$url.": ".$error,E_USER_WARNING);
		return "";
	}
	return str_replace("&nbsp;","",$data);
}

?>