view interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
line wrap: on
line source

<?php
// Remove the script execution time limit (0 means "no limit").
set_time_limit(0);
// Lift PHP's memory cap for this request ('-1' means "unlimited").
ini_set('memory_limit', '-1');
/**
 * Obsolete: scans downloaded CSV pages for rows that contain the text "td"
 * and prints the record re-fetched for each such row.
 *
 * For every page file ./data_from_sinica/NN.csv (the loop currently covers
 * page 17 only) the function echoes the number of each matching row followed
 * by the tab-separated columns returned by getCorrectInfo(), converted from
 * Big5 to UTF-8. The physical line that follows a matching row is read and
 * discarded without being counted.
 *
 * Side effect kept from the original implementation: the companion file
 * ./data_from_sinica/_NN.csv is created/truncated, but nothing is written
 * to it.
 *
 * @return void
 */
function findBrokenCharacter(){
	$brokenRowPattern="/td/";
	// Two-byte sequences with a lead byte of 0x81-0xA0 are swapped for the
	// bytes A1 BC before conversion (presumably Big5 code points that cannot
	// be mapped to UTF-8, replaced by a placeholder glyph - TODO confirm).
	$unmappablePattern="/[\x81-\xA0][\x40-\xFE]/";

	for($i=17; $i<=17; $i++){
		$page=sprintf("%02d",$i);
		$fp=fopen("./data_from_sinica/".$page.".csv","r");
		if($fp===false){
			// Without this guard the read loop below would never terminate
			// (or would raise a TypeError on PHP 8).
			echo "page ".$i." could not be opened<br><br>";
			continue;
		}
		$cfp=fopen("./data_from_sinica/_".$page.".csv","w");

		$j=0;
		echo "page ".$i."<br>";
		// Reading until fgets() fails avoids the extra empty iteration that
		// a feof()-driven loop performs at end of file.
		while(($line=fgets($fp))!==false){
			if(preg_match($brokenRowPattern,$line)){
				echo "find the broken character at line ".($j+1)."<br>";
				$correctInfo=getCorrectInfo($i,$j+1);
				foreach($correctInfo as $column){
					$replaced=preg_replace($unmappablePattern,"\xA1\xBC",$column);
					$column=mb_convert_encoding($replaced,"UTF-8","BIG5");
					echo $column."\t";
				}
				echo "<br>";
				// Skip the line that follows the matching row.
				fgets($fp);
			}
			$j++;
		}
		echo "page ".$i." has ".$j." lines<br><br>";

		// Release both handles for this page before moving to the next one.
		fclose($fp);
		if($cfp!==false){
			fclose($cfp);
		}
	}
}
/**
 * Obsolete: re-fetches one record from the Sinica place-name query site.
 *
 * Downloads result page $page (500 rows per page), picks the anchor that
 * belongs to row $line, follows its detail link and captures the cell values
 * of the detail page.
 *
 * @param int $page Result page number inserted into the query URL.
 * @param int $line 1-based row number within that result page.
 * @return array Index 0 holds the "Source" number parsed from the detail
 *               link; indexes 1..n hold the captured detail values in page
 *               order (raw bytes as downloaded, not re-encoded here). An
 *               empty array is returned when the page cannot be downloaded,
 *               the row has no anchor/href, or the link is not a detail link.
 */
function getCorrectInfo($page,$line){
	$url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
	$correctInfo=array();

	$data=getData($url);
	if(!is_string($data) || $data===""){
		// Nothing was downloaded; DOMDocument::loadHTML() rejects empty input.
		return $correctInfo;
	}
	$data=mb_convert_encoding($data,"HTML-ENTITIES","BIG5");

	// Collect libxml parse warnings internally instead of silencing every
	// error with "@", then restore the previous setting.
	$bookListDoc=new DOMDocument();
	$previousSetting=libxml_use_internal_errors(true);
	$bookListDoc->loadHTML($data);
	libxml_clear_errors();
	libxml_use_internal_errors($previousSetting);

	// Row N maps to anchor index N-1+5, i.e. the first five anchors of the
	// page are skipped (presumably navigation links - TODO confirm).
	$entry=$bookListDoc->getElementsByTagName("a")->item($line-1+5);
	if($entry===null || !$entry->hasAttribute("href")){
		return $correctInfo;
	}
	$link=$entry->getAttribute("href");

	$pattern="/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u";
	if(preg_match($pattern,$link,$match)){
		$correctInfo[0]=$match[1];
		$link="http://webgis.sinica.edu.tw/place/".$link;
		// The second argument is ignored by getData() as defined in this file.
		$data=getData($link,true);
		// The header cell is matched up to a closing </td>, exactly as in the
		// original pattern (presumably mirroring the site's markup).
		$pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
		if(preg_match_all($pattern,$data,$match)){
			foreach($match[2] as $idx=>$info){
				$correctInfo[$idx+1]=$info;
			}
		}
	}
	return $correctInfo;
}
/**
 * Merges the per-page CSV files into ./data_from_sinica/all_data.csv.
 *
 * Pages 01.csv through 71.csv are appended in numeric order; every carriage
 * return is stripped from their content before it is written. The target
 * file is truncated first.
 *
 * @return void
 */
function writeAllDataToOneFile(){
	$merged=fopen("./data_from_sinica/all_data.csv","w");
	foreach(range(1,71) as $pageNo){
		$pagePath="./data_from_sinica/".sprintf("%02d",$pageNo).".csv";
		$contents=file_get_contents($pagePath);
		fwrite($merged,str_replace("\r","",$contents));
	}
	fclose($merged);
}
/**
 * Downloads a URL with cURL and strips every "&nbsp;" from the response.
 *
 * The transfer is returned as a string instead of being printed, and the
 * connect timeout option is set to 5000 (CURLOPT_CONNECTTIMEOUT is expressed
 * in seconds).
 *
 * @param string $url Address to fetch.
 * @return string Response body with all "&nbsp;" entities removed.
 */
function getData($url){
	$handle=curl_init();
	curl_setopt_array($handle,array(
		CURLOPT_URL=>$url,
		CURLOPT_RETURNTRANSFER=>1,
		CURLOPT_CONNECTTIMEOUT=>5000,
	));
	$body=curl_exec($handle);
	curl_close($handle);
	return str_replace("&nbsp;","",$body);
}
/**
 * Counts the rows of ./data_from_sinica/all_data.csv per source id.
 *
 * The file is parsed as tab-separated values and the second column
 * (index 1) is used as the source id. Rows without a second column, or with
 * an empty one, are ignored.
 *
 * @return array Map of source id => row count. Keys 1..7 are always present
 *               (0 when no row uses them); any other id found in the file is
 *               appended with its own count. When the file cannot be opened
 *               the zero-filled map is returned.
 */
function countBooks(){
	$countArray=[1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];

	$fp=fopen("./data_from_sinica/all_data.csv", "r");
	if($fp===false){
		// A failed open would otherwise make the read loop spin forever
		// (PHP 7) or raise a TypeError (PHP 8).
		return $countArray;
	}

	// fgetcsv() returns a non-array value at end of file, which ends the
	// loop without indexing into it. Enclosure and escape are spelled out
	// with their historical defaults so newer PHP versions do not warn.
	while(is_array($book=fgetcsv($fp,1000000,"\t",'"',"\\"))){
		// Blank lines come back as array(null); short rows have no index 1.
		if(!isset($book[1]) || $book[1]===""){
			continue;
		}
		$source=$book[1];
		if(!isset($countArray[$source])){
			// Source ids outside 1..7 are still counted, starting from zero.
			$countArray[$source]=0;
		}
		$countArray[$source]++;
	}
	fclose($fp);

	return $countArray;
}
//writeAllDataToOneFile();
?>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html> 
        <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
                <style type="text/css">
                        .row{  
                                display:block;
                        }
                        .column{
                                display:inline-block;
                                width:90px;
                        }
			table{
				border-collapse:collapse;
			}
			td,th{
				padding:3px 10px;
			}
			th{
				text-align:left;
			}
                </style>
                <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
        </head>
        <body>
		<table>
		<tr><th>source<th># of books<th>note
<?php

	// Manual maintenance steps, left disabled:
	//findBrokenCharacter();
	//writeAllDataToOneFile();

	// Emit one table row per source id: id, number of books, free-text note.
	$countArray=countBooks();
	$descrArray=[
		1=>"大部份有年號資訊 年代不知為出版或是編纂年代",
		2=>"大部份沒有時間資訊",
		3=>"只有朝代 沒有年號或年份",
		4=>"基本上沒有時間資訊",
		5=>"基本上沒有時間資訊",
		6=>"基本上沒有時間資訊",
		7=>"基本上沒有時間資訊"
	];
	foreach($countArray as $source=>$count){
		// countBooks() may report source ids that have no note; show an
		// empty cell for those instead of reading an undefined index.
		$note=isset($descrArray[$source]) ? $descrArray[$source] : "";
		// The source id originates from the CSV file, so escape it for HTML.
		echo "<tr><td>".htmlspecialchars((string)$source,ENT_QUOTES,"UTF-8")
			."<td>".(int)$count
			."<td>".htmlspecialchars($note,ENT_QUOTES,"UTF-8");
	}

?>
		</table>
        </body>
</html>