view interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0

commit for previous development
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 19 Jan 2015 17:13:49 +0100
parents
children
line wrap: on
line source

<?php
// Remove the execution time limit for this script (0 means "no limit").
set_time_limit(0);
// Remove the PHP memory limit for this script (-1 means "no limit").
ini_set('memory_limit', '-1');
/**
 * Group the duplicate books in source 1.
 *
 * Reads the tab-separated file ./data_from_sinica/all_data.csv and writes
 * ./data_from_sinica/merged_books.csv.
 *
 * Input columns:
 * 0=>id, 1=>source, 2=>province, 3=>region, 4=>local gazetteer title,
 * 5=>number of volumes, 6=>period of the compiler, 7=>person name, 8=>period,
 * 9=>year (common era), 10=>edition type, 11=>holding location, 12=>note.
 *
 * Each row is turned into a book record whose fields are ordered
 * level1, level2, year, title, volume, timeOfTheAuthor, author, time,
 * commonEra, edition, location, note, source, id. A leading "〔year〕"
 * (or "﹝year﹞") marker inside the title is moved into the 'year' field.
 *
 * Consecutive records with the same year and title form one group. Every
 * group is written as a marker row
 * ("#", level1, level2, year, title, number of records) followed by the
 * records themselves. Reading stops at the first row whose source is
 * greater than 1; that row is not written.
 *
 * Echoes "# of books: N<br>" where N is the number of groups written.
 *
 * @throws RuntimeException if the input or the output file cannot be opened
 */
function mergeBooks(){
	$inputName="./data_from_sinica/all_data.csv";
	$outputName="./data_from_sinica/merged_books.csv";
	$inputFp=fopen($inputName, "r");
	if($inputFp===false){
		// without this check feof(false) would never become true and the loop would spin forever
		throw new RuntimeException("Cannot open ".$inputName." for reading");
	}
	$outputFp=fopen($outputName, "w");
	if($outputFp===false){
		fclose($inputFp);
		throw new RuntimeException("Cannot open ".$outputName." for writing");
	}
	//input column index => output field name; the order of this array is the order of the output columns
	//'year' has no input column, it is extracted from the title below
	$columnNameMapping=[2=>'level1', 3=>'level2', 'year'=>'year',
			4=>'title', 5=>'volume', 6=>'timeOfTheAuthor', 7=>'author',
			8=>'time', 9=>'commonEra', 10=>'edition', 11=>'location', 12=>'note', 1=>'source',0=>'id'];
	//writes one group: the marker row followed by every record of the group
	$writeGroup=function(array $group) use ($outputFp){
		$lastBook=$group[count($group)-1];
		$marker=["#",$lastBook['level1'],$lastBook['level2'],$lastBook['year'],$lastBook['title'],count($group)];
		fputcsv($outputFp,$marker,"\t");
		foreach($group as $bookInfo){
			fputcsv($outputFp,$bookInfo,"\t");
		}
	};
	$bookArray=array();//buffer holding the records of the group currently being collected
	$bookCount=0;
	//fgetcsv() returns false at the end of the file, so test its result instead of feof()
	while(($data=fgetcsv($inputFp,1000000,"\t"))!==false){
		//a blank line is reported as array(null): there is no book on it
		if($data===array(null))	continue;
		$book=array();
		foreach($columnNameMapping as $oldIdx=>$newIdx){
			//fields without an input column, and columns missing from a short row, become ''
			$book[$newIdx]=(is_int($oldIdx) && isset($data[$oldIdx])) ? $data[$oldIdx] : '';
		}
		$book['timeOfTheAuthor']=str_replace(["(",")"],"",$book['timeOfTheAuthor']);
		//normalise the bracket variants so that a single pattern covers all spellings
		$book['title']=str_replace(["﹝"],"〔",$book['title']);
		$book['title']=str_replace(["﹞","}"],"〕",$book['title']);
		//0=whole,1=year,2=title
		if(preg_match("/〔(.{2,4})〕(.+)/u",$book['title'],$match)){
			$book['year']=$match[1];
			$book['title']=$match[2];
		}
		//only source 1 is merged; stop before buffering a row of a later source.
		//is_numeric() keeps a non-numeric value (e.g. a header cell) from being
		//compared as a string, which PHP 8 would otherwise do
		if(is_numeric($book['source']) && $book['source']>1)	break;
		if(!empty($bookArray)){
			$lastBook=$bookArray[count($bookArray)-1];
			if($lastBook['year']!==$book['year'] || $lastBook['title']!==$book['title']){//different books
				//flush out the buffer
				$writeGroup($bookArray);
				$bookArray=array();
				$bookCount++;
			}
		}
		//push into the buffer
		$bookArray[]=$book;
	}
	//the last group is still in the buffer when the input ends or the loop stops
	if(!empty($bookArray)){
		$writeGroup($bookArray);
		$bookCount++;
	}
	echo "# of books: ".$bookCount."<br>";
	fclose($inputFp);
	fclose($outputFp);
}
/**
 * Print ./data_from_sinica/merged_books.csv as an HTML table.
 *
 * The file is tab-separated. A row whose first field is "#" starts a group and
 * carries the number of book rows that follow it in its sixth field (index 5).
 * For every group a separator row is printed, then one table row per book.
 *
 * Cell highlighting (column indices follow the table heading order):
 * - class "empty" when column 2 (year) is empty;
 * - class "different" when column 4 (volume) or column 6 (author) differs from
 *   the previous book of the same group.
 *
 * Cell values are HTML-escaped before they are printed.
 *
 * @throws RuntimeException if the file cannot be opened
 */
function printMergedBooks(){
	$fileName="./data_from_sinica/merged_books.csv";
	//open the file before any output so that a failure does not leave half a table behind
	$fp=fopen($fileName, "r");
	if($fp===false){
		throw new RuntimeException("Cannot open ".$fileName." for reading");
	}
	//css class => heading text, in column order
	$headings=[
		['level','level 1'], ['level','level 2'], ['time','year'], ['title','title'],
		['volume','volume'], ['time','time of the author'], ['author','author'],
		['time','time'], ['time','ce'], ['edition','edition'], ['location','location'],
		['note','note'], ['source','src'], ['source','id'],
	];
	echo "<table>";
	echo "<tr>";
	foreach($headings as $heading){
		echo "<th class='".$heading[0]."'>".$heading[1];
	}
	//fgetcsv() returns false at the end of the file, so test its result instead of feof()
	while(($data=fgetcsv($fp,100000,"\t"))!==false){
		//only group marker rows are handled here; isset() also skips blank lines (array(null))
		if(!isset($data[0]) || $data[0]!=="#")	continue;
		$groupSize=isset($data[5]) ? (int)$data[5] : 0;
		$lastBook=array();
		for($i=0; $i<$groupSize; $i++){
			$book=fgetcsv($fp,100000,"\t");
			//the file ended before the announced number of books was read
			if($book===false)	break;
			if($i==0){
				echo "<tr class='separator'>";
				foreach($book as $item){
					echo "<td>";
				}
			}
			echo "<tr>";
			foreach($book as $idx=>$item){
				$class='';
				if($i!=0 && ($idx==4 || $idx==6)){
					$previous=isset($lastBook[$idx]) ? $lastBook[$idx] : '';
					//compare as strings: a loose comparison treats "10" and "010" as equal
					if((string)$previous!==(string)$item){
						$class="different";
					}
				}
				if($idx==2 && (string)$item==="") $class="empty";
				//escape the value so that characters such as < or & cannot break the markup
				echo "<td class='".$class."'>".htmlspecialchars((string)$item, ENT_QUOTES|ENT_SUBSTITUTE, 'UTF-8');
			}
			$lastBook=$book;
		}
	}
	fclose($fp);
	echo "</table>";
}

?>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html> 
        <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
                <style type="text/css">
			table{
				border-collapse:collapse;
			}
			td,th{
				border:1px solid #eee;
				padding:5px 5px;
			}
			td{vertical-align:top;}
			.separator{background:#000;}
			.book{background:#ddd;}
			.empty{background:#F6CECE;}
			.different{background:#F3E2A9;}
			.level{width:80px;}
			.time{width:40px;}
			.title{width:100px;}
			.volume{width:120px;}
			.author{width:200px;}
			.edition,.location{width:200px;}
			.note{width:200px;}
			.source{width:20px;}
                </style>
                <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
        </head>
        <body>
<?php

	// mergeBooks() writes ./data_from_sinica/merged_books.csv, the file that
	// printMergedBooks() reads. It is left commented out here, so this page
	// only renders the existing merged file.
	//mergeBooks();
	// Print the merged books as an HTML table inside <body>.
	printMergedBooks();

?>
        </body>
</html>