diff interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interface/insert_new_columns_into_books/parse_data_from_sinica.php	Mon Jan 19 17:13:49 2015 +0100
@@ -0,0 +1,169 @@
+<?php
+// Remove the script execution time limit (0 = unlimited).
+// NOTE(review): presumably needed because the CSV processing below is slow — confirm.
+set_time_limit(0);
+// Remove the PHP memory cap (-1 = no limit) for this request.
+ini_set('memory_limit', '-1');
+/**
+ * Group consecutive duplicate books of source 1 and write them to
+ * ./data_from_sinica/merged_books.csv.
+ *
+ * Input: ./data_from_sinica/all_data.csv, tab separated, with the columns
+ * 0=>id, 1=>source, 2=>province, 3=>region, 4=>gazetteer title,
+ * 5=>number of volumes, 6=>period of the compiler, 7=>person name,
+ * 8=>period, 9=>common-era year, 10=>edition type, 11=>holding location,
+ * 12=>note.
+ *
+ * Each row is remapped to the output column order (level1, level2, year,
+ * title, volume, timeOfTheAuthor, author, time, commonEra, edition,
+ * location, note, source, id). A leading "〔year〕" in the title is split off
+ * into the separate "year" column. Consecutive rows with identical year and
+ * title form one group; every group is written as a header row
+ * ("#", level1, level2, year, title, number of rows) followed by its rows.
+ *
+ * Reading stops at the first row whose source is greater than 1; that row is
+ * not written. The number of groups is echoed at the end.
+ */
+function mergeBooks(){
+	$inputName="./data_from_sinica/all_data.csv";
+	$outputName="./data_from_sinica/merged_books.csv";
+	$inputFp=fopen($inputName, "r");
+	if($inputFp===false){
+		echo "cannot open ".$inputName."<br>";
+		return;
+	}
+	$outputFp=fopen($outputName, "w");
+	if($outputFp===false){
+		fclose($inputFp);
+		echo "cannot open ".$outputName."<br>";
+		return;
+	}
+	//input column index => output column name; the non-numeric key marks a
+	//column that does not exist in the input and starts out empty
+	$columnNameMapping=[2=>'level1', 3=>'level2', 'year'=>'year',
+			4=>'title', 5=>'volume', 6=>'timeOfTheAuthor', 7=>'author',
+			8=>'time', 9=>'commonEra', 10=>'edition', 11=>'location', 12=>'note', 1=>'source',0=>'id'];
+	$bookArray=array();
+	$bookCount=0;
+	//write the buffered group (header row + its rows) and reset the buffer
+	$flushGroup=function() use ($outputFp,&$bookArray,&$bookCount){
+		if(count($bookArray)==0){
+			return;
+		}
+		$lastBook=end($bookArray);
+		$header=["#",$lastBook['level1'],$lastBook['level2'],$lastBook['year'],$lastBook['title'],count($bookArray)];
+		fputcsv($outputFp,$header,"\t");
+		foreach($bookArray as $bookInfo){
+			fputcsv($outputFp,$bookInfo,"\t");
+		}
+		$bookArray=array();
+		$bookCount++;
+	};
+	//fgetcsv returns false at end of file; testing its result (instead of
+	//feof) avoids processing a bogus empty row after the last line
+	while(($data=fgetcsv($inputFp,1000000,"\t"))!==false){
+		if($data===array(null)){
+			continue;//blank line
+		}
+		$book=array();
+		foreach($columnNameMapping as $oldIdx=>$newIdx){
+			//short rows get empty strings for the missing columns
+			$book[$newIdx]=(is_int($oldIdx) && isset($data[$oldIdx])) ? $data[$oldIdx] : '';
+		}
+		//only source 1 is merged: stop before a row of another source can
+		//be mixed into the current group
+		if(is_numeric($book['source']) && $book['source']>1){
+			break;
+		}
+		$book['timeOfTheAuthor']=str_replace(["(",")"],"",$book['timeOfTheAuthor']);
+		//normalise the bracket variants around the year to 〔 〕
+		$book['title']=str_replace(["﹝"],"〔",$book['title']);
+		$book['title']=str_replace(["﹞","}"],"〕",$book['title']);
+		//0=whole,1=year,2=title
+		if(preg_match("/〔(.{2,4})〕(.+)/u",$book['title'],$match)){
+			$book['year']=$match[1];
+			$book['title']=$match[2];
+		}
+		if(count($bookArray)>0){
+			$lastBook=$bookArray[count($bookArray)-1];
+			if($lastBook['year']!==$book['year'] || $lastBook['title']!==$book['title']){
+				//different book: flush out the buffer
+				$flushGroup();
+			}
+		}
+		//push into the buffer
+		$bookArray[]=$book;
+	}
+	//the last group is still buffered when the loop ends
+	$flushGroup();
+	echo "# of books: ".$bookCount."<br>";
+	fclose($inputFp);
+	fclose($outputFp);
+}
+/**
+ * Print ./data_from_sinica/merged_books.csv as an HTML table.
+ *
+ * The file is expected to consist of groups: a header row whose first field
+ * is "#" and whose sixth field is the number of rows in the group, followed
+ * by that many book rows. Every group is preceded by a "separator" table row.
+ *
+ * Highlighting, by cell index within a book row:
+ *  - index 4 (under the "volume" header) and index 6 (under the "author"
+ *    header) get class "different" when they differ from the previous row of
+ *    the same group;
+ *  - index 2 (under the "year" header) gets class "empty" when it is empty.
+ *
+ * Cell values are HTML-escaped before being printed.
+ */
+function printMergedBooks(){
+	$fileName="./data_from_sinica/merged_books.csv";
+	$fp=fopen($fileName, "r");
+	if($fp===false){
+		echo "cannot open ".$fileName."<br>";
+		return;
+	}
+	//css class => caption, in the column order of the merged file
+	$headers=[['level','level 1'],['level','level 2'],['time','year'],['title','title'],
+			['volume','volume'],['time','time of the author'],['author','author'],
+			['time','time'],['time','ce'],['edition','edition'],['location','location'],
+			['note','note'],['source','src'],['source','id']];
+	echo "<table>";
+	echo "<tr>";
+	foreach($headers as $header){
+		echo "<th class='".$header[0]."'>".$header[1];
+	}
+	//fgetcsv returns false at end of file; testing its result (instead of
+	//feof) avoids indexing into false after the last line
+	while(($data=fgetcsv($fp,100000,"\t"))!==false){
+		if($data[0]!=="#"){
+			continue;//not a group header
+		}
+		$groupSize=isset($data[5]) ? (int)$data[5] : 0;
+		$lastBook=array();
+		for($i=0; $i<$groupSize; $i++){
+			$book=fgetcsv($fp,100000,"\t");
+			if($book===false){
+				break;//file ended before the announced number of rows
+			}
+			if($i==0){
+				//one empty cell per column so the separator spans the table
+				echo "<tr class='separator'>";
+				echo str_repeat("<td>",count($book));
+			}
+			echo "<tr>";
+			foreach($book as $idx=>$item){
+				$item=(string)$item;
+				$class='';
+				if($i!=0 && ($idx==4 || $idx==6)){
+					$previous=isset($lastBook[$idx]) ? (string)$lastBook[$idx] : '';
+					if($previous!==$item){
+						$class="different";
+					}
+				}
+				if($idx==2 && $item===''){
+					$class="empty";
+				}
+				echo "<td class='".$class."'>".htmlspecialchars($item,ENT_QUOTES,'UTF-8');
+			}
+			$lastBook=$book;
+		}
+	}
+	fclose($fp);
+	echo "</table>";
+}
+
+?>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html> 
+        <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+                <style type="text/css">
+			table{
+				border-collapse:collapse;
+			}
+			td,th{
+				border:1px solid #eee;
+				padding:5px 5px;
+			}
+			td{vertical-align:top;}
+			.separator{background:#000;}
+			.book{background:#ddd;}
+			.empty{background:#F6CECE;}
+			.different{background:#F3E2A9;}
+			.level{width:80px;}
+			.time{width:40px;}
+			.title{width:100px;}
+			.volume{width:120px;}
+			.author{width:200px;}
+			.edition,.location{width:200px;}
+			.note{width:200px;}
+			.source{width:20px;}
+                </style>
+                <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
+        </head>
+        <body>
+<?php
+
+	// mergeBooks() rewrites ./data_from_sinica/merged_books.csv from all_data.csv;
+	// it is disabled here, uncomment the call to regenerate that file.
+	//mergeBooks();
+	// Render the merged file as an HTML table inside the page body.
+	printMergedBooks();
+
+?>
+        </body>
+</html>