comparison models/extractapp.php @ 78:960ba96efce1 extractapp

Update: click to popup remove-tag-window; select to popup tag-window
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 20 Apr 2015 15:44:54 +0200
parents 97c1e5102a22
children e6e213b26454
comparison
equal deleted inserted replaced
77:97c1e5102a22 78:960ba96efce1
41 41
42 $this->lg_text = $lg_text; 42 $this->lg_text = $lg_text;
43 43
44 } 44 }
45 45
46 public function SetBookMetaDataBySectionId() { 46 public function SetBookMetadataBySectionId() {
47 47
48 // get book_meta from $$this->get_section_metadata_by_sectionId_url 48 // get book_meta from $$this->get_section_metadata_by_sectionId_url
49 $section_meta_url = $this->get_section_metadata_by_sectionId_url.$this->section_id; 49 $section_meta_url = $this->get_section_metadata_by_sectionId_url.$this->section_id;
50 $section_meta = json_decode(file_get_contents($section_meta_url), true); 50 $section_meta = json_decode(file_get_contents($section_meta_url), true);
51 51
180 $stringInput = $this->lg_text; 180 $stringInput = $this->lg_text;
181 181
182 $data = array(); // data array to be passed to view 182 $data = array(); // data array to be passed to view
183 183
184 //$taglistArray = $this->GetTaglistArray(); 184 //$taglistArray = $this->GetTaglistArray();
185 //for GetTaglistByTopicID: 185 //for GetTaglistByTopicId:
186 $taglistArray = $this->GetTaglistByTopicID($this->GetTopic()); 186 $taglistArray = $this->GetTaglistByTopicId($this->GetTopic());
187 187
188 // $this->taglist_infile is set (1) from file or (2) from _postdata['taglistArray'] which comes from frontend that user decided 188 // $this->taglist_infile is set (1) from file or (2) from _postdata['taglistArray'] which comes from frontend that user decided
189 // $this->taglist_infile is the most up-to-date taglist decided by user. Should be written into file. 189 // $this->taglist_infile is the most up-to-date taglist decided by user. Should be written into file.
190 if( $this->TaglistSubsetIn($this->taglist_infile, $taglistArray) ) { // TaglistSubsetIn($l1,$l2): $l1 is a subset of $l2 or not 190 if( $this->TaglistSubsetIn($this->taglist_infile, $taglistArray) ) { // TaglistSubsetIn($l1,$l2): $l1 is a subset of $l2 or not
191 $this->taglist_infile = ""; 191 $this->taglist_infile = "";
258 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require); 258 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require);
259 } 259 }
260 260
261 } 261 }
262 262
263 public function UpdateInfoByResponseFromLGService($response) { 263 public function UpdateInfoResponsedFromLGService($response) {
264 264
265 if (isset($response["file"])) { 265 if (isset($response["file"])) {
266 $response_file = $response["file"]; 266 $response_file = $response["file"];
267 } 267 }
268 if (isset($response["branch"])) { 268 if (isset($response["branch"])) {
294 $this->current_fileId = 0; 294 $this->current_fileId = 0;
295 } 295 }
296 } 296 }
297 297
298 // This is only called by SaveFullTextToLGService() 298 // This is only called by SaveFullTextToLGService()
299 private function AppendMetaData($text_content) { 299 private function AppendMetadata($text_content) {
300 $text = '<?xml version="1.0" encoding="UTF-8"?>'; 300 $text = '<?xml version="1.0" encoding="UTF-8"?>';
301 $text .= "\n<text>\n"; 301 $text .= "\n<text>\n";
302 // --- topic --- 302 // --- topic ---
303 $text .= "<topic>".$this->topic."</topic>\n"; 303 $text .= "<topic>".$this->topic."</topic>\n";
304 // --- book meta data --- 304 // --- book meta data ---
329 $text .= "</section>\n"; 329 $text .= "</section>\n";
330 330
331 // ---- taglist --- 331 // ---- taglist ---
332 // $taglist = $this->taglist_infile; 332 // $taglist = $this->taglist_infile;
333 // obtain the latest taglist from db 333 // obtain the latest taglist from db
334 $taglist = $this->GetTaglistByTopicID($this->GetTopic()); 334 $taglist = $this->GetTaglistByTopicId($this->GetTopic());
335 335
336 foreach ($taglist as $tagitem) { 336 foreach ($taglist as $tagitem) {
337 $text .= "<tagitem>\n"; 337 $text .= "<tagitem>\n";
338 $text .= "<id>".$tagitem[0]."</id>\n"; 338 $text .= "<id>".$tagitem[0]."</id>\n";
339 $text .= "<name>".$tagitem[1]."</name>\n"; 339 $text .= "<name>".$tagitem[1]."</name>\n";
347 $text .= "</text>"; 347 $text .= "</text>";
348 // ---- 348 // ----
349 349
350 return $text; 350 return $text;
351 } 351 }
352
352 public function SaveFullTextToLGService($_postdata) { 353 public function SaveFullTextToLGService($_postdata) {
353 // save tagged text (full text) by Jorge's API to lg service 354 // save tagged text (full text) by Jorge's API to lg service
354 // -------- 355 // --------
355 global $AT_LOCAL; 356 global $AT_LOCAL;
356 357
383 // -- new branch case 384 // -- new branch case
384 $require = "<text_content>".$require."</text_content>\n"; 385 $require = "<text_content>".$require."</text_content>\n";
385 } 386 }
386 */ 387 */
387 388
388 $require = $this->AppendMetaData($require); 389 $require = $this->AppendMetadata($require);
389 390
390 //saving in my local machine in developing phrase 391 //saving in my local machine in developing phrase
391 if ($AT_LOCAL) { 392 if ($AT_LOCAL) {
392 file_put_contents("data/parsing_files/".$_postdata['sectionId'].".txt", $require); 393 file_put_contents("data/parsing_files/".$_postdata['sectionId'].".txt", $require);
393 } 394 }
452 return $response; 453 return $response;
453 454
454 } 455 }
455 456
456 // --- for regex ---- 457 // --- for regex ----
457 public function GetRegexFilenameById($topic_id) { 458 private function GetRegexFilenameById($topic_id) {
458 $query = sprintf("SELECT regex_filename FROM `topic_regex_relation` WHERE topic_id=\"%s\"", $topic_id); 459 $query = sprintf("SELECT regex_filename FROM `topic_regex_relation` WHERE topic_id=\"%s\"", $topic_id);
459 $result = mysql_query($query); 460 $result = mysql_query($query);
460 if (!$result) { 461 if (!$result) {
461 return json_encode("Failed during selecting topic_regex_relation table."); 462 return json_encode("Failed during selecting topic_regex_relation table.");
462 } 463 }
466 array_push($filenames, $row['regex_filename']); 467 array_push($filenames, $row['regex_filename']);
467 } 468 }
468 469
469 return $filenames; 470 return $filenames;
470 } 471 }
471 public function SmartRegexLoad($topic_id) { 472 public function LoadSmartRegex($topic_id) {
472 473
473 // Load regex file based on current topic. Only shows the regex in this topic -- 474 // Load regex file based on current topic. Only shows the regex in this topic --
474 $filenames = $this->GetRegexFilenameById($topic_id); 475 $filenames = $this->GetRegexFilenameById($topic_id);
475 476
476 // Get regex file from filesystem ---- 477 // Get regex file from filesystem ----
490 echo json_encode($returnArray); 491 echo json_encode($returnArray);
491 return; 492 return;
492 } 493 }
493 494
494 495
495 public function SmartRegexSave($_postdata) { 496 public function SaveSmartRegex($_postdata) {
496 if ($_postdata['text']){ 497 if ($_postdata['text']){
497 498
498 // --- update topic_regex_relation table --- 499 // --- update topic_regex_relation table ---
499 $topic_id = $_postdata['topic_id']; 500 $topic_id = $_postdata['topic_id'];
500 $filename = $_postdata['filename'].'.txt'; 501 $filename = $_postdata['filename'].'.txt';
557 } 558 }
558 $outputTableArray[0]["other"] = "其他"; 559 $outputTableArray[0]["other"] = "其他";
559 $outputTableArray[0]["page"] = "頁數"; 560 $outputTableArray[0]["page"] = "頁數";
560 $outputTableArray[0]["full"] = "全文"; 561 $outputTableArray[0]["full"] = "全文";
561 562
563 // id, name, tag, color in _taglistArray
562 foreach ( $_taglistArray as $tagValue ) { 564 foreach ( $_taglistArray as $tagValue ) {
563 $content = preg_replace("/<\/".$tagValue[2].">○*<".$tagValue[2].">/u", "", $_content); 565 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name
564 $content = preg_replace("/<".$tagValue[2].">[ ]*<\/".$tagValue[2].">/u", "", $_content); 566 $content = preg_replace("/<\/".$tag_name.">○*<".$tag_name.">/u", "", $_content);
567 $content = preg_replace("/<".$tag_name.">[ ]*<\/".$tag_name.">/u", "", $_content);
565 } 568 }
566 569
567 $contentLineArray = explode( "<br>", $content ); 570 $contentLineArray = explode( "<br>", $content );
568 571
569 $count=0; 572 $count=0;
571 foreach ( $contentLineArray as $value ) { 574 foreach ( $contentLineArray as $value ) {
572 $count++; 575 $count++;
573 $recordString = $value; 576 $recordString = $value;
574 $otherString = $recordString; 577 $otherString = $recordString;
575 //echo $recordString."<br>\n"; 578 //echo $recordString."<br>\n";
579 // find hyper link in pattern with <a>...</a>
576 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) { 580 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) {
577 $pageNow = $matches[2]; 581 $pageNow = $matches[2];
578 } 582 }
579 foreach ( $_taglistArray as $tagValue ) { 583 foreach ( $_taglistArray as $tagValue ) {
580 if ( preg_match_all("/<".$tagValue[2].">(.*?)<\/".$tagValue[2].">/u", $recordString, $matches, PREG_SET_ORDER) ) { 584 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name
585
586 if ( preg_match_all("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", $recordString, $matches, PREG_SET_ORDER) ) {
581 foreach ( $matches as $matchesValue ) { 587 foreach ( $matches as $matchesValue ) {
582 $matchesValue[1] = preg_replace("/○/u", "", $matchesValue[1]); 588 $matchesValue[1] = preg_replace("/○/u", "", $matchesValue[1]);
589
583 if ( preg_match_all("/〈(.*?)〉/u", $matchesValue[1], $matches2, PREG_SET_ORDER) ) { 590 if ( preg_match_all("/〈(.*?)〉/u", $matchesValue[1], $matches2, PREG_SET_ORDER) ) {
584 foreach ( $matches2 as $matches2Value ) { 591 foreach ( $matches2 as $matches2Value ) {
585 if ( isset($outputTableArray[$count][0][$tagValue[2]]) ) { 592 if ( isset($outputTableArray[$count][0][$tag_name]) ) {
586 $outputTableArray[$count][0][$tagValue[2]] .= ";".$matches2Value[1]; 593 $outputTableArray[$count][0][$tag_name] .= ";".$matches2Value[1];
587 } else { 594 } else {
588 $outputTableArray[$count][0][$tagValue[2]] = $matches2Value[1]; 595 $outputTableArray[$count][0][$tag_name] = $matches2Value[1];
589 } 596 }
590 } 597 }
591 } else { 598 } else {
592 if ( isset($outputTableArray[$count][0][$tagValue[2]]) ) { 599 if ( isset($outputTableArray[$count][0][$tag_name]) ) {
593 $outputTableArray[$count][0][$tagValue[2]] .= ";".$matchesValue[1]; 600 $outputTableArray[$count][0][$tag_name] .= ";".$matchesValue[1];
594 } else { 601 } else {
595 $outputTableArray[$count][0][$tagValue[2]] = $matchesValue[1]; 602 $outputTableArray[$count][0][$tag_name] = $matchesValue[1];
596 } 603 }
597 } 604 }
598 } 605 }
599 $otherString = preg_replace("/<".$tagValue[2].">(.*?)<\/".$tagValue[2].">/u", " ", $otherString); 606 $otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString);
600 } 607 }
601 } 608 }
602 $otherString = preg_replace("/○/u", "", $otherString); 609 $otherString = preg_replace("/○/u", "", $otherString);
603 $outputTableArray[$count]["other"] = $otherString; 610 $outputTableArray[$count]["other"] = $otherString;
604 $outputTableArray[$count]["page"] = $pageNow; 611 $outputTableArray[$count]["page"] = $pageNow;
643 $sectionName = $postdata['sectionName']; 650 $sectionName = $postdata['sectionName'];
644 $bookId = $postdata['bookId']; 651 $bookId = $postdata['bookId'];
645 $bookName = $postdata['bookName']; 652 $bookName = $postdata['bookName'];
646 653
647 //$taglistArray = $this->GetTaglistArray(); 654 //$taglistArray = $this->GetTaglistArray();
648 $taglistArray = $this->GetTaglistByTopicID($topic); 655 $taglistArray = $this->GetTaglistByTopicId($topic);
649 } 656 }
650 // ===== 657 // =====
651 658
652 $topic_tag = $this->GetTopicTag($topic); 659 $topic_tag = $this->GetTopicTag($topic);
653 660
681 $row = mysql_fetch_assoc($result); 688 $row = mysql_fetch_assoc($result);
682 $largest_id = $row['AUTO_INCREMENT']-1; 689 $largest_id = $row['AUTO_INCREMENT']-1;
683 690
684 691
685 $topic_id = $_postdata['topic_id']; 692 $topic_id = $_postdata['topic_id'];
686 $result = $this->GetTaglistByTopicID($topic_id); 693 $result = $this->GetTaglistByTopicId($topic_id);
687 694
688 $taglistArray = array(); 695 $taglistArray = array();
689 696
690 foreach ($result as $row) { 697 foreach ($result as $row) {
691 $taglistArray[$row[0]] = array($row[1], $row[2], $row[3]); 698 $taglistArray[$row[0]] = array($row[1], $row[2], $row[3]);
782 /** 789 /**
783 * 790 *
784 */ 791 */
785 792
786 $topic = $postdata['topic']; 793 $topic = $postdata['topic'];
787 $result = $this->GetTopicByID($topic); 794 $result = $this->GetTopicById($topic);
788 $row = mysql_fetch_assoc($result); 795 $row = mysql_fetch_assoc($result);
789 $topic_name_en = $row['name_en']; 796 $topic_name_en = $row['name_en'];
790 $topic_name_ch = $row['name_ch']; 797 $topic_name_ch = $row['name_ch'];
791 $topic_name_pinyin = $row['name_pinyin']; 798 $topic_name_pinyin = $row['name_pinyin'];
792 799
969 */ 976 */
970 977
971 } 978 }
972 979
973 980
974 private function GetTaglistByTopicID($topic_id) { 981 private function GetTaglistByTopicId($topic_id) {
975 $taglistArray = array(); 982 $taglistArray = array();
976 // select taglist ids from topic_tag_relation table 983 // select taglist ids from topic_tag_relation table
977 $query = sprintf("SELECT * FROM `topic_tag_relation` WHERE `topic_id`='%s'", $topic_id); 984 $query = sprintf("SELECT * FROM `topic_tag_relation` WHERE `topic_id`='%s'", $topic_id);
978 $result = mysql_query($query); 985 $result = mysql_query($query);
979 if (!$result) { 986 if (!$result) {
1003 1010
1004 public function SetTopic($topic) { 1011 public function SetTopic($topic) {
1005 $this->topic = $topic; 1012 $this->topic = $topic;
1006 } 1013 }
1007 1014
1008 public function GetTopic() { 1015 private function GetTopic() {
1009 return $this->topic; 1016 return $this->topic;
1010 } 1017 }
1011 1018
1012 private function GetTopiclistArray() { 1019 private function GetTopiclistArray() {
1013 $topiclistArray = array(); 1020 $topiclistArray = array();
1014 $result = $this->GetTopiclist(); 1021 $result = $this->GetTopicList();
1015 while ($row = mysql_fetch_assoc($result)) { 1022 while ($row = mysql_fetch_assoc($result)) {
1016 //array_push($topiclistArray, array('id'=>$row['id'],'name'=>$row['name'],'tag'=>$row['tag'])); 1023 //array_push($topiclistArray, array('id'=>$row['id'],'name'=>$row['name'],'tag'=>$row['tag']));
1017 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],)); 1024 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],));
1018 } 1025 }
1019 return $topiclistArray; 1026 return $topiclistArray;
1020 } 1027 }
1021 1028
1022 1029
1023 private function GetTopicTag($topic_id) { 1030 private function GetTopicTag($topic_id) {
1024 $result = $this->GetTopicByID($topic_id); 1031 $result = $this->GetTopicById($topic_id);
1025 $row = mysql_fetch_assoc($result); 1032 $row = mysql_fetch_assoc($result);
1026 $tag = $row['tag']; 1033 $tag = $row['tag'];
1027 return $tag; 1034 return $tag;
1028 1035
1029 } 1036 }
1030 private function GetTopicName($topic_id) { 1037 private function GetTopicName($topic_id) {
1031 $result = $this->GetTopicByID($topic_id); 1038 $result = $this->GetTopicById($topic_id);
1032 $row = mysql_fetch_assoc($result); 1039 $row = mysql_fetch_assoc($result);
1033 //$name = $row['name']; 1040 //$name = $row['name'];
1034 $name = array('name_en'=>$row['name_en'], 'name_ch'=>$row['name_ch'], 'name_pinyin'=>$row['name_pinyin']); 1041 $name = array('name_en'=>$row['name_en'], 'name_ch'=>$row['name_ch'], 'name_pinyin'=>$row['name_pinyin']);
1035 return $name; 1042 return $name;
1036 1043
1119 */ 1126 */
1120 } 1127 }
1121 1128
1122 1129
1123 // ======================================= 1130 // =======================================
1124 1131
1125 public function sortFunction($a,$b) { 1132 private function sortFunction($a,$b) {
1126 return strlen($b)-strlen($a); 1133 return strlen($b)-strlen($a);
1127 } 1134 }
1128 1135
1129 private function GetSectionId() { 1136 private function GetSectionId() {
1130 1137
1139 $section_id = $this->GetSectionId(); 1146 $section_id = $this->GetSectionId();
1140 if (!is_numeric($section_id)){ 1147 if (!is_numeric($section_id)){
1141 return $section_id; 1148 return $section_id;
1142 } 1149 }
1143 1150
1144 $result = $this->GetSectionsByID($section_id); 1151 $result = $this->GetSectionsById($section_id);
1145 1152
1146 1153
1147 while ($row = mysql_fetch_assoc($result)) { 1154 while ($row = mysql_fetch_assoc($result)) {
1148 $bookId=$row['books_id']; 1155 $bookId=$row['books_id'];
1149 $startPage=$row['start_page']; 1156 $startPage=$row['start_page'];
1265 } 1272 }
1266 1273
1267 /* 1274 /*
1268 // get book_meta from books table in db on localhost 1275 // get book_meta from books table in db on localhost
1269 $book_meta = array(); 1276 $book_meta = array();
1270 $books_result = $this->GetBooksByID($bookId); 1277 $books_result = $this->GetBooksById($bookId);
1271 while ($row = mysql_fetch_assoc($books_result)) { 1278 while ($row = mysql_fetch_assoc($books_result)) {
1272 array_push($book_meta, array($row['id'],$row['name'],$row['author'],(string)$row['start_year'],(string)$row['line'],(string)$row['dynasty'])); 1279 array_push($book_meta, array($row['id'],$row['name'],$row['author'],(string)$row['start_year'],(string)$row['line'],(string)$row['dynasty']));
1273 // use 'start_year' as year, 'line' is pagenumber 1280 // use 'start_year' as year, 'line' is pagenumber
1274 } 1281 }
1275 $this->book_meta = $book_meta; 1282 $this->book_meta = $book_meta;
1321 1328
1322 return $wordlistArray; 1329 return $wordlistArray;
1323 1330
1324 } 1331 }
1325 1332
1326 protected function GetBooksInfo($bookId) { 1333 private function GetBooksInfo($bookId) {
1327 $result = $this->GetSectionsByID($bookId); 1334 $result = $this->GetSectionsById($bookId);
1328 while ($row = mysql_fetch_assoc($result)) { 1335 while ($row = mysql_fetch_assoc($result)) {
1329 $bookName = $row['name']; 1336 $bookName = $row['name'];
1330 } 1337 }
1331 1338
1332 $data = array(); 1339 $data = array();