Mercurial > hg > extraction-interface
diff interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
line wrap: on
line diff
<?php
// Recovered from the added-lines diff of
// interface/insert_new_columns_into_books/parse_data_from_sinica.php
// (Mercurial revision 0:b12c99b7c3f0).
//
// The page groups consecutive duplicate books of source 1 into
// merged_books.csv (mergeBooks) and renders that file as an HTML table
// (printMergedBooks).

set_time_limit(0);
ini_set('memory_limit', '-1');

/**
 * Convert one raw tab-separated row into a book record keyed by column name.
 *
 * Raw column layout:
 * 0=>id, 1=>source, 2=>province, 3=>region, 4=>gazetteer title,
 * 5=>number of volumes, 6=>era of the compiler, 7=>person name, 8=>era,
 * 9=>Western (CE) year, 10=>nature, 11=>holding location, 12=>note
 *
 * @param array $data              fields returned by fgetcsv()
 * @param array $columnNameMapping raw index => output key; a non-numeric key
 *                                 marks an output column with no raw
 *                                 counterpart, which starts out empty
 * @return array book record in the order of $columnNameMapping
 */
function normalizeSinicaRow(array $data, array $columnNameMapping){
    $book=[];
    foreach($columnNameMapping as $oldIdx=>$newIdx){
        if(!is_numeric($oldIdx)){
            $book[$newIdx]='';
            continue;
        }
        // Short rows yield empty fields instead of undefined-offset notices.
        $book[$newIdx]=isset($data[$oldIdx]) ? (string)$data[$oldIdx] : '';
    }

    $book['timeOfTheAuthor']=str_replace(["(",")"],"",$book['timeOfTheAuthor']);

    // Normalise the bracket variants to 〔 〕 so one pattern covers them all.
    $book['title']=str_replace(["﹝"],"〔",$book['title']);
    $book['title']=str_replace(["﹞","}"],"〕",$book['title']);

    // A title of the form 〔year〕title is split: 0=whole, 1=year, 2=title.
    if(preg_match("/〔(.{2,4})〕(.+)/u",$book['title'],$match)){
        $book['year']=$match[1];
        $book['title']=$match[2];
    }
    return $book;
}

/**
 * Write one group of identical books: a "#" header row followed by its members.
 *
 * Header row: "#", level1, level2, year, title, number of member rows.
 *
 * @param resource $outputFp writable handle of the merged CSV
 * @param array    $group    non-empty list of book records
 */
function flushBookGroup($outputFp, array $group){
    $lastBook=$group[count($group)-1];
    $header=["#",$lastBook['level1'],$lastBook['level2'],$lastBook['year'],$lastBook['title'],count($group)];
    fputcsv($outputFp,$header,"\t");
    foreach($group as $bookInfo){
        fputcsv($outputFp,$bookInfo,"\t");
    }
}

/**
 * Group the duplicate books in source 1.
 *
 * Reads ./data_from_sinica/all_data.csv (tab separated), collects consecutive
 * rows that share the same year and title, and writes each group to
 * ./data_from_sinica/merged_books.csv. Reading stops at the first row whose
 * source is greater than 1. Echoes the number of groups written.
 *
 * @throws RuntimeException when either file cannot be opened
 */
function mergeBooks(){
    $inputName="./data_from_sinica/all_data.csv";
    $outputName="./data_from_sinica/merged_books.csv";

    // Without these checks a failed fopen() makes the read loop spin forever
    // on PHP < 8, because feof(false) never becomes true.
    $inputFp=fopen($inputName, "r");
    if($inputFp===false){
        throw new RuntimeException("Cannot open ".$inputName." for reading");
    }
    $outputFp=fopen($outputName, "w");
    if($outputFp===false){
        fclose($inputFp);
        throw new RuntimeException("Cannot open ".$outputName." for writing");
    }

    $columnNameMapping=[2=>'level1', 3=>'level2', 'year'=>'year',
                        4=>'title', 5=>'volume', 6=>'timeOfTheAuthor', 7=>'author',
                        8=>'time', 9=>'commonEra', 10=>'edition', 11=>'location', 12=>'note', 1=>'source',0=>'id'];

    $group=[];      // buffer of consecutive rows describing the same book
    $bookCount=0;

    // Test the fgetcsv() result itself: with a feof() loop the read that hits
    // the end of the file returns false and would be processed as a row.
    while(($data=fgetcsv($inputFp,1000000,"\t"))!==false){
        if($data===[null]){
            continue; // fgetcsv() reports a blank line as a single null field
        }
        $book=normalizeSinicaRow($data,$columnNameMapping);

        // Only source 1 is merged; later sources end the run.
        if((int)$book['source']>1){
            break;
        }

        if($group){
            $lastBook=$group[count($group)-1];
            // Strict comparison so that numeric-looking strings such as
            // "10" and "10.0" are not treated as the same year or title.
            if($lastBook['year']!==$book['year'] || $lastBook['title']!==$book['title']){
                // different book: flush out the buffer
                flushBookGroup($outputFp,$group);
                $bookCount++;
                $group=[];
            }
        }
        // push into the buffer
        $group[]=$book;
    }

    // The last group has no following row to trigger its flush.
    if($group){
        flushBookGroup($outputFp,$group);
        $bookCount++;
    }

    echo "# of books: ".$bookCount."<br>";
    fclose($inputFp);
    fclose($outputFp);
}

/**
 * Render ./data_from_sinica/merged_books.csv as an HTML table.
 *
 * Each "#" header row announces how many member rows follow. Every group is
 * preceded by a separator row. Within a group, the volume (column 4) and
 * author (column 6) cells that differ from the previous row get the class
 * "different"; an empty year cell (column 2) gets the class "empty".
 *
 * @throws RuntimeException when the merged file cannot be opened
 */
function printMergedBooks(){
    $fileName="./data_from_sinica/merged_books.csv";
    // Open before emitting any markup so a failure leaves no half-built table.
    $fp=fopen($fileName, "r");
    if($fp===false){
        throw new RuntimeException("Cannot open ".$fileName." for reading");
    }

    echo "<table>";
    echo "<tr>";
    echo "<th class='level'>level 1";
    echo "<th class='level'>level 2";
    echo "<th class='time'>year";
    echo "<th class='title'>title";
    echo "<th class='volume'>volume";
    echo "<th class='time'>time of the author";
    echo "<th class='author'>author";
    echo "<th class='time'>time";
    echo "<th class='time'>ce";
    echo "<th class='edition'>edition";
    echo "<th class='location'>location";
    echo "<th class='note'>note";
    echo "<th class='source'>src";
    echo "<th class='source'>id";

    while(($data=fgetcsv($fp,100000,"\t"))!==false){
        // Only "#" rows start a group; anything else is skipped.
        if(!isset($data[0],$data[5]) || $data[0]!=="#"){
            continue;
        }
        $groupSize=(int)$data[5];
        $lastBook=[];
        for($i=0; $i<$groupSize; $i++){
            $book=fgetcsv($fp,100000,"\t");
            if($book===false){
                break; // file ended before the announced number of rows
            }

            if($i===0){
                // One empty cell per column forms the separator bar.
                echo "<tr class='separator'>";
                echo str_repeat("<td>",count($book));
            }
            echo "<tr>";
            foreach($book as $idx=>$item){
                $item=(string)$item;
                $class='';
                if($i!==0 && ($idx===4 || $idx===6)){
                    $previous=isset($lastBook[$idx]) ? (string)$lastBook[$idx] : '';
                    if($previous!==$item){
                        $class="different";
                    }
                }
                if($idx===2 && $item===""){
                    $class="empty";
                }
                // The cell text comes from the data file, so escape it.
                echo "<td class='".$class."'>".htmlspecialchars($item,ENT_QUOTES|ENT_SUBSTITUTE,'UTF-8');
            }
            $lastBook=$book;
        }
    }
    echo "</table>";
    fclose($fp);
}

?>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
        <style type="text/css">
            table{
                border-collapse:collapse;
            }
            td,th{
                border:1px solid #eee;
                padding:5px 5px;
            }
            td{vertical-align:top;}
            .separator{background:#000;}
            .book{background:#ddd;}
            .empty{background:#F6CECE;}
            .different{background:#F3E2A9;}
            .level{width:80px;}
            .time{width:40px;}
            .title{width:100px;}
            .volume{width:120px;}
            .author{width:200px;}
            .edition,.location{width:200px;}
            .note{width:200px;}
            .source{width:20px;}
        </style>
        <!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
    </head>
    <body>
<?php

    //mergeBooks();
    printMergedBooks();

?>
    </body>
</html>