Mercurial > hg > extraction-interface
view interface/insert_new_columns_into_books/parse_data_from_sinica.php @ 0:b12c99b7c3f0
commit for previous development
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 19 Jan 2015 17:13:49 +0100 |
parents | |
children |
line wrap: on
line source
<?php
// The merge pass may run over the complete data set, so the execution time
// limit and the memory limit are lifted for this script.
set_time_limit(0);
ini_set('memory_limit', '-1');

/**
 * Opens a file and fails loudly when that is not possible.
 *
 * Looping on feof() with an invalid handle never terminates (or raises a
 * TypeError on PHP 8), so every fopen() result is checked here.
 *
 * @param string $fileName path of the file to open
 * @param string $mode     fopen() mode
 * @return resource the open file handle
 * @throws RuntimeException when the file cannot be opened
 */
function openFileOrFail($fileName, $mode)
{
    $fp = fopen($fileName, $mode);
    if ($fp === false) {
        throw new RuntimeException("Unable to open '" . $fileName . "' (mode '" . $mode . "')");
    }
    return $fp;
}

/**
 * Converts one raw row of all_data.csv into a book record.
 *
 * Input columns: 0=>id, 1=>source, 2=>province, 3=>region, 4=>gazetteer title,
 * 5=>number of volumes, 6=>period of the compiler, 7=>person name, 8=>period,
 * 9=>common era year, 10=>nature (edition), 11=>holding location, 12=>note.
 *
 * The record keys are created in the order in which they are later written to
 * merged_books.csv. 'year' has no input column: it starts empty and is filled
 * from a leading "〔year〕" marker in the title when one is present.
 *
 * @param array $data one row as returned by fgetcsv()
 * @return array book record; every value is a string
 */
function buildBookRecord(array $data)
{
    // record key => input column index (null: no input column)
    $columns = [
        'level1'          => 2,
        'level2'          => 3,
        'year'            => null,
        'title'           => 4,
        'volume'          => 5,
        'timeOfTheAuthor' => 6,
        'author'          => 7,
        'time'            => 8,
        'commonEra'       => 9,
        'edition'         => 10,
        'location'        => 11,
        'note'            => 12,
        'source'          => 1,
        'id'              => 0,
    ];

    $book = [];
    foreach ($columns as $name => $idx) {
        // Short rows yield empty strings instead of undefined-offset notices.
        $book[$name] = ($idx !== null && isset($data[$idx])) ? (string)$data[$idx] : '';
    }

    $book['timeOfTheAuthor'] = str_replace(["(", ")"], "", $book['timeOfTheAuthor']);

    // Normalise the bracket variants found in the data to 〔 … 〕.
    $book['title'] = str_replace(["﹝"], "〔", $book['title']);
    $book['title'] = str_replace(["﹞", "}"], "〕", $book['title']);

    // 0=whole, 1=year, 2=title
    // A second match against ﹝…﹞ is not needed: those brackets were replaced
    // just above, so such a pattern could never match.
    if (preg_match("/〔(.{2,4})〕(.+)/u", $book['title'], $match)) {
        $book['year'] = $match[1];
        $book['title'] = $match[2];
    }

    return $book;
}

/**
 * Writes one group of identical books to the merged file.
 *
 * A group is a marker row ("#", level1, level2, year, title, number of rows)
 * followed by the rows of the group. printMergedBooks() relies on the row
 * count in the marker row to read the group back.
 *
 * @param resource $outputFp  handle of the merged file
 * @param array    $bookArray non-empty list of book records
 * @return void
 */
function writeBookGroup($outputFp, array $bookArray)
{
    $count = count($bookArray);
    $lastBook = $bookArray[$count - 1];

    $marker = ["#", $lastBook['level1'], $lastBook['level2'], $lastBook['year'], $lastBook['title'], $count];
    fputcsv($outputFp, $marker, "\t");

    foreach ($bookArray as $bookInfo) {
        fputcsv($outputFp, $bookInfo, "\t");
    }
}

/**
 * Groups the duplicate books in source 1.
 *
 * Reads ./data_from_sinica/all_data.csv (tab separated), collects consecutive
 * rows that share the same year and title, and writes each group to
 * ./data_from_sinica/merged_books.csv. Reading stops at the first row whose
 * source is greater than 1. Finally the number of groups is echoed.
 *
 * NOTE(review): only consecutive rows are grouped, so this presumably expects
 * the input to be sorted by source and title - confirm against the data.
 *
 * @return void
 * @throws RuntimeException when one of the files cannot be opened
 */
function mergeBooks()
{
    $inputFp = openFileOrFail("./data_from_sinica/all_data.csv", "r");
    try {
        $outputFp = openFileOrFail("./data_from_sinica/merged_books.csv", "w");
    } catch (RuntimeException $e) {
        fclose($inputFp);
        throw $e;
    }

    $bookArray = [];
    $bookCount = 0;

    // Loop on the fgetcsv() result rather than on feof(): after the last line
    // feof() is still false and fgetcsv() returns false, which used to be
    // treated as an (empty) book. Length 0 means "no line length limit".
    while (($data = fgetcsv($inputFp, 0, "\t")) !== false) {
        if ($data === [null]) {
            continue; // blank line
        }

        $book = buildBookRecord($data);

        // Only source 1 is merged. The check happens before buffering so that
        // a row of another source can neither join nor discard the last group.
        if (is_numeric($book['source']) && $book['source'] > 1) {
            break;
        }

        if ($bookArray !== []) {
            $lastBook = end($bookArray);
            // Strict comparison: with == numeric-looking strings such as
            // "0123" and "123" would count as the same title.
            $sameBook = $lastBook['year'] === $book['year'] && $lastBook['title'] === $book['title'];
            if (!$sameBook) {
                // different books: flush out the buffer
                writeBookGroup($outputFp, $bookArray);
                $bookCount++;
                $bookArray = [];
            }
        }

        // push into the buffer
        $bookArray[] = $book;
    }

    // The group that is still buffered when the input ends must be written too.
    if ($bookArray !== []) {
        writeBookGroup($outputFp, $bookArray);
        $bookCount++;
    }

    echo "# of books: " . $bookCount . "<br>";

    fclose($inputFp);
    fclose($outputFp);
}

/**
 * Prints the content of ./data_from_sinica/merged_books.csv as an HTML table.
 *
 * Every group starts with a black separator row. Within a group, a volume or
 * author cell that differs from the row above gets the class "different"; an
 * empty year cell gets the class "empty". Cell values are HTML-escaped.
 *
 * @return void
 * @throws RuntimeException when the merged file cannot be opened
 */
function printMergedBooks()
{
    // Open the file first so that a failure does not leave a half-written table.
    $fp = openFileOrFail("./data_from_sinica/merged_books.csv", "r");

    echo "<table>";
    echo "<tr>";
    echo "<th class='level'>level 1";
    echo "<th class='level'>level 2";
    echo "<th class='time'>year";
    echo "<th class='title'>title";
    echo "<th class='volume'>volume";
    echo "<th class='time'>time of the author";
    echo "<th class='author'>author";
    echo "<th class='time'>time";
    echo "<th class='time'>ce";
    echo "<th class='edition'>edition";
    echo "<th class='location'>location";
    echo "<th class='note'>note";
    echo "<th class='source'>src";
    echo "<th class='source'>id";

    // Column positions inside a book row of the merged file.
    $yearIdx = 2;
    $volumeIdx = 4;
    $authorIdx = 6;

    while (($data = fgetcsv($fp, 0, "\t")) !== false) {
        // Only marker rows ("#", level1, level2, year, title, row count) start a group.
        if (!isset($data[0], $data[5]) || $data[0] !== "#") {
            continue;
        }

        $groupSize = (int)$data[5];
        $lastBook = [];
        for ($i = 0; $i < $groupSize; $i++) {
            $book = fgetcsv($fp, 0, "\t");
            if (!is_array($book)) {
                break; // file ends in the middle of a group
            }

            if ($i === 0) {
                echo "<tr class='separator'>";
                foreach ($book as $item) {
                    echo "<td>";
                }
            }

            echo "<tr>";
            foreach ($book as $idx => $item) {
                $item = (string)$item;
                $class = '';
                if ($i !== 0 && ($idx === $volumeIdx || $idx === $authorIdx)) {
                    $previous = isset($lastBook[$idx]) ? (string)$lastBook[$idx] : '';
                    if ($previous !== $item) {
                        $class = "different";
                    }
                }
                if ($idx === $yearIdx && $item === "") {
                    $class = "empty";
                }
                // The values come straight from the data file, so escape them.
                echo "<td class='" . $class . "'>" . htmlspecialchars($item, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            }
            $lastBook = $book;
        }
    }

    echo "</table>";
    fclose($fp);
}
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<style type="text/css">
table{
	border-collapse:collapse;
}
td,th{
	border:1px solid #eee;
	padding:5px 5px;
}
td{vertical-align:top;}
.separator{background:#000;}
.book{background:#ddd;}
.empty{background:#F6CECE;}
.different{background:#F3E2A9;}
.level{width:80px;}
.time{width:40px;}
.title{width:100px;}
.volume{width:120px;}
.author{width:200px;}
.edition,.location{width:200px;}
.note{width:200px;}
.source{width:20px;}
</style>
<!--<script src="js/check_sections.js" charset="utf-8"></script>!-->
</head>
<body>
<?php
// mergeBooks() rewrites merged_books.csv from all_data.csv; it is left
// disabled here, so the page only renders the merged file that already exists.
//mergeBooks();
printMergedBooks();
?>
</body>
</html>