Mercurial > hg > extraction-interface

diff interface/insert_new_columns_into_books/analyze_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author: Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date: Mon, 19 Jan 2015 17:13:49 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interface/insert_new_columns_into_books/analyze_data_from_sinica.php	Mon Jan 19 17:13:49 2015 +0100
@@ -0,0 +1,150 @@
+<?php
+set_time_limit(0); // 0 disables the maximum execution time for this script
+ini_set('memory_limit', '-1'); // -1 removes the PHP memory limit for this script
+/**
+ * Obsolete. Scans the per-page CSV dumps for lines containing "td" (treated
+ * as the trace of a broken character), re-fetches that row through
+ * getCorrectInfo() and prints it converted from Big5 to UTF-8.
+ *
+ * Only page 17 is processed (hard-coded loop bounds). Output is HTML
+ * fragments written with echo; nothing is returned.
+ */
+function findBrokenCharacter(){
+	for($i=17; $i<=17; $i++){
+		$page=sprintf("%02d",$i);
+		$fp=fopen("./data_from_sinica/".$page.".csv","r");
+		if($fp===false){
+			// Without this guard feof()/fgets() would be called on false.
+			echo "page ".$i." could not be opened<br>";
+			continue;
+		}
+		// Mode "w" creates or truncates the "_NN.csv" file; this function never writes to it.
+		$cfp=fopen("./data_from_sinica/_".$page.".csv","w");
+		$j=0;
+		echo "page ".$i."<br>";
+		while(!feof($fp)){
+			$line=fgets($fp);
+			if(preg_match("/td/",$line)){
+				echo "find the broken character at line ".($j+1)."<br>";
+				$correctInfo=getCorrectInfo($i,$j+1);
+				foreach($correctInfo as $column){
+					// Two-byte sequences with a lead byte in 0x81-0xA0 are swapped for
+					// "\xA1\xBC" before conversion (presumably a Big5 placeholder glyph
+					// for characters that cannot be converted - TODO confirm).
+					$replaced=preg_replace("/[\x81-\xA0][\x40-\xFE]/","\xA1\xBC",$column);
+					$column=mb_convert_encoding($replaced,"UTF-8","BIG5");
+					echo "".$column."\t";
+				}
+				echo "<br>";
+				// Consume the following line as well; it is not counted in $j.
+				$line=fgets($fp);
+			}
+			$j++;
+		}
+		// $j is one too high because the final fgets() at end of file is counted as well.
+		echo "page ".$i." has ".($j-1)." lines<br><br>";
+		// Release both handles for this page before moving on to the next one.
+		fclose($fp);
+		if($cfp!==false){
+			fclose($cfp);
+		}
+	}
+}
+/**
+ * Obsolete. Re-fetches one row of the Sinica place-query result list and
+ * returns the fields scraped from that row's detail page.
+ *
+ * @param int $page Result page number passed as the "Page" query parameter.
+ * @param int $line 1-based row number within that page.
+ * @return array Index 0 holds the "Source" id taken from the detail link;
+ *               indexes 1..n hold the value cells matched on the detail page,
+ *               as raw bytes (not re-encoded here). Empty array when the list
+ *               page, the row's link or the expected link format is missing.
+ */
+function getCorrectInfo($page,$line){
+	$url="http://webgis.sinica.edu.tw/place/query.asp?Page=".$page."&Page_setup=500&A1=%AC%D9%A5%F7&B1=containing&C1=&D1=AND&A2=99&B2=containing&C2=&D2=AND&A3=99&B3=containing&C3=";
+	$correctInfo=array();
+	$data=getData($url);
+	if($data===false || $data===""){
+		// Nothing was fetched; DOMDocument::loadHTML() must not be given an empty string.
+		return $correctInfo;
+	}
+	$data=mb_convert_encoding($data,"HTML-ENTITIES","BIG5");
+	$bookListDoc=new DOMDocument();
+	// Collect markup warnings internally instead of silencing everything with "@".
+	$previousSetting=libxml_use_internal_errors(true);
+	$bookListDoc->loadHTML($data);
+	libxml_clear_errors();
+	libxml_use_internal_errors($previousSetting);
+	// The row's anchor is looked up 5 positions after its 0-based index
+	// (presumably the first five anchors on the page are not result rows - TODO confirm).
+	$entry=$bookListDoc->getElementsByTagName("a")->item($line-1+5);
+	if($entry===null){
+		return $correctInfo;
+	}
+	$href=$entry->attributes->getNamedItem("href");
+	if($href===null){
+		return $correctInfo;
+	}
+	$link=$href->value;
+	$pattern="/detail.asp\?ID=[0-9]+&Source=([0-9]+)/u";
+	if(preg_match($pattern,$link,$match)){
+		$correctInfo[0]=$match[1];
+		$data=getData("http://webgis.sinica.edu.tw/place/".$link);
+		// The pattern expects the label cell to open with <th> and close with </td>.
+		$pattern='/<th class="calc" align="right" valign="top" width="100">(.*)<\/td>[\s]*<td class="calc" align="left" valign="top">(.*)<\/td>/';
+		if(preg_match_all($pattern,$data,$match)){
+			// Only the value cells (second capture group) are kept, shifted by one
+			// so that index 0 stays reserved for the source id.
+			foreach($match[2] as $idx=>$info){
+				$correctInfo[$idx+1]=$info;
+			}
+		}
+	}
+	return $correctInfo;
+}
+/**
+ * Concatenates the page dumps 01.csv .. 71.csv from ./data_from_sinica/
+ * into ./data_from_sinica/all_data.csv, removing every carriage return.
+ *
+ * The target file is created or truncated. Pages that cannot be read are
+ * skipped; if the target cannot be opened nothing is written.
+ */
+function writeAllDataToOneFile(){
+	$fp=fopen("./data_from_sinica/all_data.csv","w");
+	if($fp===false){
+		// fopen() has already raised a warning; fwrite() on false would only add errors.
+		return;
+	}
+	for($i=1; $i<=71; $i++){
+		$data=file_get_contents("./data_from_sinica/".sprintf("%02d",$i).".csv");
+		if($data===false){
+			// Unreadable page: contribute nothing rather than passing false on.
+			continue;
+		}
+		fwrite($fp,str_replace("\r","",$data));
+	}
+	fclose($fp);
+}
+/**
+ * Downloads a URL with cURL and returns the response body with every
+ * "&nbsp;" entity removed.
+ *
+ * @param string $url Address to fetch.
+ * @return string Response body without "&nbsp;"; a failed transfer
+ *                (curl_exec() returning false) comes back as an empty string
+ *                because the result is passed through str_replace().
+ */
+function getData($url){
+	$handle=curl_init();
+	$options=array(
+		CURLOPT_URL=>$url,
+		CURLOPT_RETURNTRANSFER=>1,
+		// CURLOPT_CONNECTTIMEOUT is expressed in seconds.
+		CURLOPT_CONNECTTIMEOUT=>5000
+	);
+	foreach($options as $option=>$value){
+		curl_setopt($handle,$option,$value);
+	}
+	$response=curl_exec($handle);
+	curl_close($handle);
+	return str_replace("&nbsp;","",$response);
+}
+/**
+ * Counts the rows of ./data_from_sinica/all_data.csv per source id.
+ *
+ * The file is read as tab-separated values; the second column (index 1) is
+ * used as the source id. Rows without a second column or with an empty one
+ * are ignored.
+ *
+ * @return array Map of source id => number of rows. Ids 1..7 are always
+ *               present (starting at 0); any other id found in the file is
+ *               added on first sight. All zero when the file cannot be opened.
+ */
+function countBooks(){
+	$countArray=[1=>0,2=>0,3=>0,4=>0,5=>0,6=>0,7=>0];
+	$fp=fopen("./data_from_sinica/all_data.csv", "r");
+	if($fp===false){
+		return $countArray;
+	}
+	// fgetcsv() returns false at end of file, so the row is never indexed when it is not an array.
+	while(($book=fgetcsv($fp,1000000,"\t"))!==false){
+		// A blank line yields a single null field, so column 1 may be absent.
+		if(!isset($book[1]) || $book[1]===""){
+			continue;
+		}
+		$source=$book[1];
+		if(!isset($countArray[$source])){
+			// Unknown source id: start it at zero instead of incrementing an undefined key.
+			$countArray[$source]=0;
+		}
+		$countArray[$source]++;
+	}
+	fclose($fp);
+	return $countArray;
+}
+//writeAllDataToOneFile();
+?>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html> 
+        <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+                <style type="text/css">
+                        .row{  
+                                display:block;
+                        }
+                        .column{
+                                display:inline-block;
+                                width:90px;
+                        }
+			table{
+				border-collapse:collapse;
+			}
+			td,th{
+				padding:3px 10px;
+			}
+			th{
+				text-align:left;
+			}
+                </style>
+                <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
+        </head>
+        <body>
+		<table>
+		<tr><th>source<th># of books<th>note
+<?php
+
+	// One-off preparation steps, left disabled:
+	//findBrokenCharacter();
+	//writeAllDataToOneFile();
+
+	// Row counts per source id, read from the combined CSV.
+	$countArray=countBooks();
+	// Note shown next to each source id (the texts are emitted as-is).
+	$descrArray=[
+		1=>"大部份有年號資訊 年代不知為出版或是編纂年代", // mostly has era-name info; unclear whether the date is publication or compilation
+		2=>"大部份沒有時間資訊", // mostly no time information
+		3=>"只有朝代 沒有年號或年份", // dynasty only, no era name or year
+		4=>"基本上沒有時間資訊", // essentially no time information
+		5=>"基本上沒有時間資訊",
+		6=>"基本上沒有時間資訊",
+		7=>"基本上沒有時間資訊"
+	];
+	foreach($countArray as $source=>$count){
+		// countBooks() may report ids that have no note; show an empty cell for those.
+		$descr=isset($descrArray[$source]) ? $descrArray[$source] : "";
+		// Source ids come from the CSV file, so escape them before putting them into markup.
+		echo "<tr><td>".htmlspecialchars((string)$source,ENT_QUOTES,"UTF-8")."<td>".$count."<td>".htmlspecialchars($descr,ENT_QUOTES,"UTF-8");
+	}
+
+?>
+		</table>
+        </body>
+</html>
author	Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date	Mon, 19 Jan 2015 17:13:49 +0100
parents
children