comparison models/extractapp.php @ 115:507070df62e6 extractapp

add escape rule for regular expression
author Calvin Yeh <cyeh@mpiwg-berlin.mpg.de>
date Thu, 28 Sep 2017 17:13:39 +0200
parents 7bdbc7fc7936
children
comparison
equal deleted inserted replaced
114:7d6a107c37da 115:507070df62e6
62 62
63 63
64 $stringInput = $lg_text; 64 $stringInput = $lg_text;
65 $stringInput = preg_replace("/ /u", "○", $stringInput); 65 $stringInput = preg_replace("/ /u", "○", $stringInput);
66 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); 66 $stringInput = preg_replace("/\n/u", "<br>", $stringInput);
67 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); 67 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput);
68 $lg_text = $stringInput; 68 $lg_text = $stringInput;
69 69
70 $this->lg_text = $lg_text; 70 $this->lg_text = $lg_text;
71 71
72 } 72 }
100 $book_meta->level2 = $b['level2']; 100 $book_meta->level2 = $b['level2'];
101 $book_meta->period = $b['period']; 101 $book_meta->period = $b['period'];
102 $book_meta->dynasty = $b['dynasty']; 102 $book_meta->dynasty = $b['dynasty'];
103 $book_meta->start_year = $b['start_year']; 103 $book_meta->start_year = $b['start_year'];
104 $book_meta->end_year = $b['end_year']; 104 $book_meta->end_year = $b['end_year'];
105 $book_meta->book_year = $b['book_year'];
106 $book_meta->edition_year = $b['edition_year'];
105 $book_meta->line = $b['line']; 107 $book_meta->line = $b['line'];
106 $book_meta->volume = $b['volume']; 108 $book_meta->volume = $b['volume'];
107 $book_meta->author = $b['author']; 109 $book_meta->author = $b['author'];
108 $book_meta->edition = $b['edition']; 110 $book_meta->edition = $b['edition'];
109 $book_meta->in_jibengujiku = $b['in_jibengujiku']; 111 $book_meta->in_jibengujiku = $b['in_jibengujiku'];
110 $book_meta->admin_type = $b['admin_type']; 112 $book_meta->admin_type = $b['admin_type'];
113 $book_meta->book_year = $b['book_year'];
114 $book_meta->edition_year = $b['edition_year'];
111 115
112 $coordinates = $section_meta['section']['coordinates_books']; 116 $coordinates = $section_meta['section']['coordinates_books'];
113 $book_meta->x = $coordinates['x']; 117 $book_meta->x = $coordinates['x'];
114 $book_meta->y = $coordinates['y']; 118 $book_meta->y = $coordinates['y'];
115 $book_meta->place_name = $coordinates['place_name']; 119 $book_meta->place_name = $coordinates['place_name'];
145 149
146 $stringInput = $lg_text; 150 $stringInput = $lg_text;
147 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); 151 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput);
148 $stringInput = preg_replace("/ /u", "○", $stringInput); 152 $stringInput = preg_replace("/ /u", "○", $stringInput);
149 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); 153 $stringInput = preg_replace("/\n/u", "<br>", $stringInput);
150 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); 154 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput);
151 $lg_text = $stringInput; 155 $lg_text = $stringInput;
152 156
153 $this->lg_text = $lg_text; 157 $this->lg_text = $lg_text;
154 } 158 }
155 159
158 $this->section_id = $_id; 162 $this->section_id = $_id;
159 //$this->branch_id = 1; // local test sets branch_id to 1 163 //$this->branch_id = 1; // local test sets branch_id to 1
160 $this->messages['debug'] .= "[Debug] from my local"."<br>"; 164 $this->messages['debug'] .= "[Debug] from my local"."<br>";
161 $this->lg_text = $this->GetSectionContent(); 165 $this->lg_text = $this->GetSectionContent();
162 } 166 }
163
164 167
165 public function SetInfoFromPreviousPage($_postdata) { 168 public function SetInfoFromPreviousPage($_postdata) {
166 /** 169 /**
167 * Get information from post data passsed by the previous page, and set variables in $this. 170 * Get information from post data passsed by the previous page, and set variables in $this.
168 * The previous page could be: 171 * The previous page could be:
211 if (isset($_postdata['book_meta'])) { 214 if (isset($_postdata['book_meta'])) {
212 $this->book_meta = json_decode($_postdata['book_meta']); 215 $this->book_meta = json_decode($_postdata['book_meta']);
213 } 216 }
214 217
215 } 218 }
216
217 219
218 private function TaglistSubsetIn($list1, $list2) { // $l1 is a subset of $l2 or not 220 private function TaglistSubsetIn($list1, $list2) { // $l1 is a subset of $l2 or not
219 // l1 and l2: array( $row['id'], $row['name'], $row['tag'], $row['color'] ) 221 // l1 and l2: array( $row['id'], $row['name'], $row['tag'], $row['color'] )
220 $cnt_list1 = count($list1); 222 $cnt_list1 = count($list1);
221 $cnt_list2 = count($list2); 223 $cnt_list2 = count($list2);
302 304
303 305
304 $this->lg_text = $lg_text; 306 $this->lg_text = $lg_text;
305 307
306 } 308 }
309
307 public function StartTagging() { 310 public function StartTagging() {
308 /** 311 /**
309 * This is the main method for tagging text. It passes all the information to "views/Extractapp/TaggingText.php" view. 312 * This is the main method for tagging text. It passes all the information to "views/Extractapp/TaggingText.php" view.
310 * The information contain the text string, taglist array, wordlis array, topic, etc. 313 * The information contain the text string, taglist array, wordlis array, topic, etc.
311 */ 314 */
398 $require = $postdata['text']; 401 $require = $postdata['text'];
399 } 402 }
400 403
401 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); 404 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require);
402 $require = preg_replace('/&amp;/u', "&", $require); 405 $require = preg_replace('/&amp;/u', "&", $require);
403 $require = preg_replace('/&nbsp;/u', " ", $require); //avoiding invaild xml format 406 $require = preg_replace('/&nbsp;/u', " ", $require);
407 $require = preg_replace('/<font [^><]*>/u', "", $require);
408 $require = preg_replace('/<\/font>/u', "", $require);
409 $require = preg_replace('/ style="background-color: transparent;"/u', "", $require);
404 $require = preg_replace("/○/u", " ", $require); 410 $require = preg_replace("/○/u", " ", $require);
405 $require = preg_replace("/<br>/u", "\n", $require); 411 $require = preg_replace("/<br>/u", "\n", $require);
406 $require = preg_replace("/<br>/u", "\n", $require); 412 $require = preg_replace("/<br>/u", "\n", $require);
407 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require); 413 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require);
408 } 414 }
409 415
410 } 416 }
411
412 417
413 public function UpdateInfoResponsedFromLGService($response) { 418 public function UpdateInfoResponsedFromLGService($response) {
414 /** 419 /**
415 * 420 *
416 */ 421 */
466 $text .= "<level2>".$book->level2."</level2>\n"; 471 $text .= "<level2>".$book->level2."</level2>\n";
467 $text .= "<period>".$book->period."</period>\n"; 472 $text .= "<period>".$book->period."</period>\n";
468 $text .= "<dynasty>".$book->dynasty."</dynasty>\n"; 473 $text .= "<dynasty>".$book->dynasty."</dynasty>\n";
469 $text .= "<start_year>".$book->start_year."</start_year>\n"; 474 $text .= "<start_year>".$book->start_year."</start_year>\n";
470 $text .= "<end_year>".$book->end_year."</end_year>\n"; 475 $text .= "<end_year>".$book->end_year."</end_year>\n";
476 $text .= "<book_year>".$book->book_year."</book_year>\n";
477 $text .= "<edition_year>".$book->edition_year."</edition_year>\n";
471 $text .= "<line>".$book->line."</line>\n"; 478 $text .= "<line>".$book->line."</line>\n";
472 $text .= "<volume>".$book->volume."</volume>\n"; 479 $text .= "<volume>".$book->volume."</volume>\n";
473 $text .= "<author>".$book->author."</author>\n"; 480 $text .= "<author>".$book->author."</author>\n";
474 $text .= "<edition>".$book->edition."</edition>\n"; 481 $text .= "<edition>".$book->edition."</edition>\n";
475 $text .= "<in_jibengujiku>".$book->in_jibengujiku."</in_jibengujiku>\n"; 482 $text .= "<in_jibengujiku>".$book->in_jibengujiku."</in_jibengujiku>\n";
538 $require = $_postdata['text']; 545 $require = $_postdata['text'];
539 } 546 }
540 547
541 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); 548 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require);
542 $require = preg_replace('/&amp;/u', "&", $require); 549 $require = preg_replace('/&amp;/u', "&", $require);
543 $require = preg_replace('/&nbsp;/u', " ", $require); //avoiding invaild xml format 550 $require = preg_replace('/&nbsp;/u', " ", $require);
551 $require = preg_replace('/<font [^><]*>/u', "", $require);
552 $require = preg_replace('/<\/font>/u', "", $require);
553 $require = preg_replace('/ style="background-color: transparent;"/u', "", $require);
544 $require = preg_replace("/○/u", " ", $require); 554 $require = preg_replace("/○/u", " ", $require);
545 $require = preg_replace("/<br(.*?)>/u", "\n", $require); 555 $require = preg_replace("/<br(.*?)>/u", "\n", $require);
546 //$require = preg_replace("/<br>/u", "\n", $require); 556 //$require = preg_replace("/<br>/u", "\n", $require);
547 557
548 $require = "<text_content>".$require."</text_content>\n"; 558 $require = "<text_content>".$require."</text_content>\n";
636 array_push($filenames, $row['regex_filename']); 646 array_push($filenames, $row['regex_filename']);
637 } 647 }
638 648
639 return $filenames; 649 return $filenames;
640 } 650 }
651
641 public function LoadSmartRegex($topic_id) { 652 public function LoadSmartRegex($topic_id) {
642 653
643 // Load regex file based on current topic. Only shows the regex in this topic -- 654 // Load regex file based on current topic. Only shows the regex in this topic --
644 $filenames = $this->GetRegexFilenameById($topic_id); 655 $filenames = $this->GetRegexFilenameById($topic_id);
645 656
659 } 670 }
660 echo json_encode($returnArray); 671 echo json_encode($returnArray);
661 return; 672 return;
662 } 673 }
663 674
664
665 public function SaveSmartRegex($_postdata) { 675 public function SaveSmartRegex($_postdata) {
666 if ($_postdata['text']){ 676 if ($_postdata['text']){
667 677
668 // --- update topic_regex_relation table --- 678 // --- update topic_regex_relation table ---
669 $topic_id = $_postdata['topic_id']; 679 $topic_id = $_postdata['topic_id'];
711 file_put_contents( $data_path."regex_files/".$_postdata['filename'].".txt", $require); 721 file_put_contents( $data_path."regex_files/".$_postdata['filename'].".txt", $require);
712 722
713 723
714 } 724 }
715 } 725 }
716
717 726
718 private function GetTableArray($_taglistArray, $_topic_tag, $_content) { 727 private function GetTableArray($_taglistArray, $_topic_tag, $_content) {
719 728
720 $outputTableArray = array(); 729 $outputTableArray = array();
721 $outputTableArray[0]=array(); 730 $outputTableArray[0]=array();
723 $outputTableArray[0][1]=array(); 732 $outputTableArray[0][1]=array();
724 foreach ( $_taglistArray as $value ) { 733 foreach ( $_taglistArray as $value ) {
725 $outputTableArray[0][0][$value[2]] = $value[1]; 734 $outputTableArray[0][0][$value[2]] = $value[1];
726 $outputTableArray[0][1][$value[2]] = $value[1]."(Title)"; 735 $outputTableArray[0][1][$value[2]] = $value[1]."(Title)";
727 } 736 }
728 $outputTableArray[0]["other"] = "其他"; 737 //remove other, full columns
738 //$outputTableArray[0]["other"] = "其他";
729 $outputTableArray[0]["page"] = "頁數"; 739 $outputTableArray[0]["page"] = "頁數";
730 $outputTableArray[0]["full"] = "全文"; 740 //$outputTableArray[0]["full"] = "全文";
731 741
732 // id, name, tag, color in _taglistArray 742 // id, name, tag, color in _taglistArray
733 foreach ( $_taglistArray as $tagValue ) { 743 foreach ( $_taglistArray as $tagValue ) {
734 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name 744 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name
735 $content = preg_replace("/<\/".$tag_name.">○*<".$tag_name.">/u", "", $_content); 745 $content = preg_replace("/<\/".$tag_name.">○*<".$tag_name.">/u", "", $_content);
741 $count=0; 751 $count=0;
742 $pageNow=NULL; 752 $pageNow=NULL;
743 foreach ( $contentLineArray as $value ) { 753 foreach ( $contentLineArray as $value ) {
744 $count++; 754 $count++;
745 $recordString = $value; 755 $recordString = $value;
746 $otherString = $recordString; 756 //$otherString = $recordString;
747 //echo $recordString."<br>\n"; 757 //echo $recordString."<br>\n";
748 // find hyper link in pattern with <a>...</a> 758 // find hyper link in pattern with <a>...</a>
749 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) { 759 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) {
750 $pageNow = $matches[2]; 760 $pageNow = $matches[2];
751 } 761 }
762
752 foreach ( $_taglistArray as $tagValue ) { 763 foreach ( $_taglistArray as $tagValue ) {
753 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name 764 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name
754 765
755 if ( preg_match_all("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", $recordString, $matches, PREG_SET_ORDER) ) { 766 if ( preg_match_all("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", $recordString, $matches, PREG_SET_ORDER) ) {
756 foreach ( $matches as $matchesValue ) { 767 foreach ( $matches as $matchesValue ) {
770 } else { 781 } else {
771 $outputTableArray[$count][0][$tag_name] = $matchesValue[1]; 782 $outputTableArray[$count][0][$tag_name] = $matchesValue[1];
772 } 783 }
773 } 784 }
774 } 785 }
775 $otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString); 786 //$otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString);
776 } 787 }
777 } 788 }
778 $otherString = preg_replace("/○/u", "", $otherString); 789
779 $outputTableArray[$count]["other"] = $otherString; 790 //$otherString = preg_replace("/○/u", "", $otherString);
791 //$outputTableArray[$count]["other"] = $otherString;
780 $outputTableArray[$count]["page"] = $pageNow; 792 $outputTableArray[$count]["page"] = $pageNow;
781 $value = preg_replace("/>/u", "&gt;", $value); 793 //$value = preg_replace("/>/u", "&gt;", $value);
782 $value = preg_replace("/</u", "&lt;", $value); 794 //$value = preg_replace("/</u", "&lt;", $value);
783 $outputTableArray[$count]["full"] = $value; 795 //$outputTableArray[$count]["full"] = $value;
784 } 796 }
785 797
786 798
787 799
788 foreach ( $outputTableArray as $arrayIndex => $arrayValue ) { 800 foreach ( $outputTableArray as $arrayIndex => $arrayValue ) {
962 $this->GetSQLValueString($tag, "text"), 974 $this->GetSQLValueString($tag, "text"),
963 $this->GetSQLValueString($color, "text"), 975 $this->GetSQLValueString($color, "text"),
964 $this->GetSQLValueString($date, "date"), 976 $this->GetSQLValueString($date, "date"),
965 $this->GetSQLValueString($id, "int")); 977 $this->GetSQLValueString($id, "int"));
966 $result = mysql_query($query); 978 $result = mysql_query($query);
979 if (!$result) {
980 echo 0;
981 }
967 982
968 } 983 }
969 984
970 } 985 }
971 986
1163 $data['tag_others'] = $tag_others; 1178 $data['tag_others'] = $tag_others;
1164 1179
1165 return $data; 1180 return $data;
1166 } 1181 }
1167 1182
1168
1169
1170 public function UpdateTagsInTopic($_postdata) { 1183 public function UpdateTagsInTopic($_postdata) {
1171 $topic_id = $_postdata['topic_id']; 1184 $topic_id = $_postdata['topic_id'];
1172 $tag_ids = json_decode(str_replace('\\', '', $_postdata['ids'])); 1185 $tag_ids = json_decode(str_replace('\\', '', $_postdata['ids']));
1173 1186
1174 // update topic_tag_relation by tags_ids array as `tag_id` and topic_id as `topic_id` 1187 // update topic_tag_relation by tags_ids array as `tag_id` and topic_id as `topic_id`
1244 } 1257 }
1245 1258
1246 1259
1247 } 1260 }
1248 1261
1249
1250 private function GetTaglistByTopicId($topic_id) { 1262 private function GetTaglistByTopicId($topic_id) {
1251 $taglistArray = array(); 1263 $taglistArray = array();
1252 // select taglist ids from topic_tag_relation table 1264 // select taglist ids from topic_tag_relation table
1253 $query = sprintf("SELECT * FROM `TopicTagRelation` WHERE `topicId`='%s'", $topic_id); 1265 $query = sprintf("SELECT * FROM `TopicTagRelation` WHERE `topicId`='%s'", $topic_id);
1254 $result = mysql_query($query); 1266 $result = mysql_query($query);
1293 //array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],)); 1305 //array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],));
1294 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['nameEn'],'name_ch'=>$row['nameCh'],'name_pinyin'=>$row['namePinyin'],)); 1306 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['nameEn'],'name_ch'=>$row['nameCh'],'name_pinyin'=>$row['namePinyin'],));
1295 } 1307 }
1296 return $topiclistArray; 1308 return $topiclistArray;
1297 } 1309 }
1298
1299 1310
1300 private function GetTopicTag($topic_id) { 1311 private function GetTopicTag($topic_id) {
1301 $result = $this->GetTopicById($topic_id); 1312 $result = $this->GetTopicById($topic_id);
1302 $row = mysql_fetch_assoc($result); 1313 $row = mysql_fetch_assoc($result);
1303 $tag = $row['tag']; 1314 $tag = $row['tag'];
1540 // ---- 1551 // ----
1541 1552
1542 // if the text is from file system 1553 // if the text is from file system
1543 $stringInput = preg_replace("/ /u", "○", $stringInput); 1554 $stringInput = preg_replace("/ /u", "○", $stringInput);
1544 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); 1555 $stringInput = preg_replace("/\n/u", "<br>", $stringInput);
1545 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); 1556 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput);
1546 1557
1547 $this->branch_id = 1; // testing at local 1558 $this->branch_id = 1; // testing at local
1548 1559
1549 } else { 1560 } else {
1550 $query = sprintf("SELECT `content`, `line`, `books_id` FROM `contents` WHERE `books_id`=\"%s\" AND `line`>=%d AND `line`<=%d", $bookId, $startPage, $endPage); 1561 $query = sprintf("SELECT `content`, `line`, `books_id` FROM `contents` WHERE `books_id`=\"%s\" AND `line`>=%d AND `line`<=%d", $bookId, $startPage, $endPage);
1559 // the text is from database 1570 // the text is from database
1560 $stringInput = $contentString; 1571 $stringInput = $contentString;
1561 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); 1572 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput);
1562 $stringInput = preg_replace("/ /u", "○", $stringInput); 1573 $stringInput = preg_replace("/ /u", "○", $stringInput);
1563 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); 1574 $stringInput = preg_replace("/\n/u", "<br>", $stringInput);
1564 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); 1575 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput);
1565 } 1576 }
1566 1577
1567 return $stringInput; 1578 return $stringInput;
1568 1579
1569 } 1580 }