Mercurial > hg > extraction-interface
comparison models/extractapp.php @ 115:507070df62e6 extractapp
add escape rule for regular expression
author | Calvin Yeh <cyeh@mpiwg-berlin.mpg.de> |
---|---|
date | Thu, 28 Sep 2017 17:13:39 +0200 |
parents | 7bdbc7fc7936 |
children |
comparison
equal
deleted
inserted
replaced
114:7d6a107c37da | 115:507070df62e6 |
---|---|
62 | 62 |
63 | 63 |
64 $stringInput = $lg_text; | 64 $stringInput = $lg_text; |
65 $stringInput = preg_replace("/ /u", "○", $stringInput); | 65 $stringInput = preg_replace("/ /u", "○", $stringInput); |
66 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); | 66 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); |
67 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); | 67 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); |
68 $lg_text = $stringInput; | 68 $lg_text = $stringInput; |
69 | 69 |
70 $this->lg_text = $lg_text; | 70 $this->lg_text = $lg_text; |
71 | 71 |
72 } | 72 } |
100 $book_meta->level2 = $b['level2']; | 100 $book_meta->level2 = $b['level2']; |
101 $book_meta->period = $b['period']; | 101 $book_meta->period = $b['period']; |
102 $book_meta->dynasty = $b['dynasty']; | 102 $book_meta->dynasty = $b['dynasty']; |
103 $book_meta->start_year = $b['start_year']; | 103 $book_meta->start_year = $b['start_year']; |
104 $book_meta->end_year = $b['end_year']; | 104 $book_meta->end_year = $b['end_year']; |
105 $book_meta->book_year = $b['book_year']; | |
106 $book_meta->edition_year = $b['edition_year']; | |
105 $book_meta->line = $b['line']; | 107 $book_meta->line = $b['line']; |
106 $book_meta->volume = $b['volume']; | 108 $book_meta->volume = $b['volume']; |
107 $book_meta->author = $b['author']; | 109 $book_meta->author = $b['author']; |
108 $book_meta->edition = $b['edition']; | 110 $book_meta->edition = $b['edition']; |
109 $book_meta->in_jibengujiku = $b['in_jibengujiku']; | 111 $book_meta->in_jibengujiku = $b['in_jibengujiku']; |
110 $book_meta->admin_type = $b['admin_type']; | 112 $book_meta->admin_type = $b['admin_type']; |
113 $book_meta->book_year = $b['book_year']; | |
114 $book_meta->edition_year = $b['edition_year']; | |
111 | 115 |
112 $coordinates = $section_meta['section']['coordinates_books']; | 116 $coordinates = $section_meta['section']['coordinates_books']; |
113 $book_meta->x = $coordinates['x']; | 117 $book_meta->x = $coordinates['x']; |
114 $book_meta->y = $coordinates['y']; | 118 $book_meta->y = $coordinates['y']; |
115 $book_meta->place_name = $coordinates['place_name']; | 119 $book_meta->place_name = $coordinates['place_name']; |
145 | 149 |
146 $stringInput = $lg_text; | 150 $stringInput = $lg_text; |
147 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); | 151 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); |
148 $stringInput = preg_replace("/ /u", "○", $stringInput); | 152 $stringInput = preg_replace("/ /u", "○", $stringInput); |
149 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); | 153 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); |
150 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); | 154 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); |
151 $lg_text = $stringInput; | 155 $lg_text = $stringInput; |
152 | 156 |
153 $this->lg_text = $lg_text; | 157 $this->lg_text = $lg_text; |
154 } | 158 } |
155 | 159 |
158 $this->section_id = $_id; | 162 $this->section_id = $_id; |
159 //$this->branch_id = 1; // local test sets branch_id to 1 | 163 //$this->branch_id = 1; // local test sets branch_id to 1 |
160 $this->messages['debug'] .= "[Debug] from my local"."<br>"; | 164 $this->messages['debug'] .= "[Debug] from my local"."<br>"; |
161 $this->lg_text = $this->GetSectionContent(); | 165 $this->lg_text = $this->GetSectionContent(); |
162 } | 166 } |
163 | |
164 | 167 |
165 public function SetInfoFromPreviousPage($_postdata) { | 168 public function SetInfoFromPreviousPage($_postdata) { |
166 /** | 169 /** |
167 * Get information from post data passed by the previous page, and set variables in $this. | 170 * Get information from post data passed by the previous page, and set variables in $this. |
168 * The previous page could be: | 171 * The previous page could be: |
211 if (isset($_postdata['book_meta'])) { | 214 if (isset($_postdata['book_meta'])) { |
212 $this->book_meta = json_decode($_postdata['book_meta']); | 215 $this->book_meta = json_decode($_postdata['book_meta']); |
213 } | 216 } |
214 | 217 |
215 } | 218 } |
216 | |
217 | 219 |
218 private function TaglistSubsetIn($list1, $list2) { // $l1 is a subset of $l2 or not | 220 private function TaglistSubsetIn($list1, $list2) { // $l1 is a subset of $l2 or not |
219 // l1 and l2: array( $row['id'], $row['name'], $row['tag'], $row['color'] ) | 221 // l1 and l2: array( $row['id'], $row['name'], $row['tag'], $row['color'] ) |
220 $cnt_list1 = count($list1); | 222 $cnt_list1 = count($list1); |
221 $cnt_list2 = count($list2); | 223 $cnt_list2 = count($list2); |
302 | 304 |
303 | 305 |
304 $this->lg_text = $lg_text; | 306 $this->lg_text = $lg_text; |
305 | 307 |
306 } | 308 } |
309 | |
307 public function StartTagging() { | 310 public function StartTagging() { |
308 /** | 311 /** |
309 * This is the main method for tagging text. It passes all the information to "views/Extractapp/TaggingText.php" view. | 312 * This is the main method for tagging text. It passes all the information to "views/Extractapp/TaggingText.php" view. |
310 * The information contains the text string, taglist array, wordlist array, topic, etc. | 313 * The information contains the text string, taglist array, wordlist array, topic, etc. |
311 */ | 314 */ |
398 $require = $postdata['text']; | 401 $require = $postdata['text']; |
399 } | 402 } |
400 | 403 |
401 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); | 404 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); |
402 $require = preg_replace('/&/u', "&", $require); | 405 $require = preg_replace('/&/u', "&", $require); |
403 $require = preg_replace('/ /u', " ", $require); //avoiding invaild xml format | 406 $require = preg_replace('/ /u', " ", $require); |
407 $require = preg_replace('/<font [^><]*>/u', "", $require); | |
408 $require = preg_replace('/<\/font>/u', "", $require); | |
409 $require = preg_replace('/ style="background-color: transparent;"/u', "", $require); | |
404 $require = preg_replace("/○/u", " ", $require); | 410 $require = preg_replace("/○/u", " ", $require); |
405 $require = preg_replace("/<br>/u", "\n", $require); | 411 $require = preg_replace("/<br>/u", "\n", $require); |
406 $require = preg_replace("/<br>/u", "\n", $require); | 412 $require = preg_replace("/<br>/u", "\n", $require); |
407 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require); | 413 file_put_contents("data/parsing_files/".$postdata['filename'].".txt", $require); |
408 } | 414 } |
409 | 415 |
410 } | 416 } |
411 | |
412 | 417 |
413 public function UpdateInfoResponsedFromLGService($response) { | 418 public function UpdateInfoResponsedFromLGService($response) { |
414 /** | 419 /** |
415 * | 420 * |
416 */ | 421 */ |
466 $text .= "<level2>".$book->level2."</level2>\n"; | 471 $text .= "<level2>".$book->level2."</level2>\n"; |
467 $text .= "<period>".$book->period."</period>\n"; | 472 $text .= "<period>".$book->period."</period>\n"; |
468 $text .= "<dynasty>".$book->dynasty."</dynasty>\n"; | 473 $text .= "<dynasty>".$book->dynasty."</dynasty>\n"; |
469 $text .= "<start_year>".$book->start_year."</start_year>\n"; | 474 $text .= "<start_year>".$book->start_year."</start_year>\n"; |
470 $text .= "<end_year>".$book->end_year."</end_year>\n"; | 475 $text .= "<end_year>".$book->end_year."</end_year>\n"; |
476 $text .= "<book_year>".$book->book_year."</book_year>\n"; | |
477 $text .= "<edition_year>".$book->edition_year."</edition_year>\n"; | |
471 $text .= "<line>".$book->line."</line>\n"; | 478 $text .= "<line>".$book->line."</line>\n"; |
472 $text .= "<volume>".$book->volume."</volume>\n"; | 479 $text .= "<volume>".$book->volume."</volume>\n"; |
473 $text .= "<author>".$book->author."</author>\n"; | 480 $text .= "<author>".$book->author."</author>\n"; |
474 $text .= "<edition>".$book->edition."</edition>\n"; | 481 $text .= "<edition>".$book->edition."</edition>\n"; |
475 $text .= "<in_jibengujiku>".$book->in_jibengujiku."</in_jibengujiku>\n"; | 482 $text .= "<in_jibengujiku>".$book->in_jibengujiku."</in_jibengujiku>\n"; |
538 $require = $_postdata['text']; | 545 $require = $_postdata['text']; |
539 } | 546 } |
540 | 547 |
541 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); | 548 $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); |
542 $require = preg_replace('/&/u', "&", $require); | 549 $require = preg_replace('/&/u', "&", $require); |
543 $require = preg_replace('/ /u', " ", $require); //avoiding invaild xml format | 550 $require = preg_replace('/ /u', " ", $require); |
551 $require = preg_replace('/<font [^><]*>/u', "", $require); | |
552 $require = preg_replace('/<\/font>/u', "", $require); | |
553 $require = preg_replace('/ style="background-color: transparent;"/u', "", $require); | |
544 $require = preg_replace("/○/u", " ", $require); | 554 $require = preg_replace("/○/u", " ", $require); |
545 $require = preg_replace("/<br(.*?)>/u", "\n", $require); | 555 $require = preg_replace("/<br(.*?)>/u", "\n", $require); |
546 //$require = preg_replace("/<br>/u", "\n", $require); | 556 //$require = preg_replace("/<br>/u", "\n", $require); |
547 | 557 |
548 $require = "<text_content>".$require."</text_content>\n"; | 558 $require = "<text_content>".$require."</text_content>\n"; |
636 array_push($filenames, $row['regex_filename']); | 646 array_push($filenames, $row['regex_filename']); |
637 } | 647 } |
638 | 648 |
639 return $filenames; | 649 return $filenames; |
640 } | 650 } |
651 | |
641 public function LoadSmartRegex($topic_id) { | 652 public function LoadSmartRegex($topic_id) { |
642 | 653 |
643 // Load regex file based on current topic. Only shows the regex in this topic -- | 654 // Load regex file based on current topic. Only shows the regex in this topic -- |
644 $filenames = $this->GetRegexFilenameById($topic_id); | 655 $filenames = $this->GetRegexFilenameById($topic_id); |
645 | 656 |
659 } | 670 } |
660 echo json_encode($returnArray); | 671 echo json_encode($returnArray); |
661 return; | 672 return; |
662 } | 673 } |
663 | 674 |
664 | |
665 public function SaveSmartRegex($_postdata) { | 675 public function SaveSmartRegex($_postdata) { |
666 if ($_postdata['text']){ | 676 if ($_postdata['text']){ |
667 | 677 |
668 // --- update topic_regex_relation table --- | 678 // --- update topic_regex_relation table --- |
669 $topic_id = $_postdata['topic_id']; | 679 $topic_id = $_postdata['topic_id']; |
711 file_put_contents( $data_path."regex_files/".$_postdata['filename'].".txt", $require); | 721 file_put_contents( $data_path."regex_files/".$_postdata['filename'].".txt", $require); |
712 | 722 |
713 | 723 |
714 } | 724 } |
715 } | 725 } |
716 | |
717 | 726 |
718 private function GetTableArray($_taglistArray, $_topic_tag, $_content) { | 727 private function GetTableArray($_taglistArray, $_topic_tag, $_content) { |
719 | 728 |
720 $outputTableArray = array(); | 729 $outputTableArray = array(); |
721 $outputTableArray[0]=array(); | 730 $outputTableArray[0]=array(); |
723 $outputTableArray[0][1]=array(); | 732 $outputTableArray[0][1]=array(); |
724 foreach ( $_taglistArray as $value ) { | 733 foreach ( $_taglistArray as $value ) { |
725 $outputTableArray[0][0][$value[2]] = $value[1]; | 734 $outputTableArray[0][0][$value[2]] = $value[1]; |
726 $outputTableArray[0][1][$value[2]] = $value[1]."(Title)"; | 735 $outputTableArray[0][1][$value[2]] = $value[1]."(Title)"; |
727 } | 736 } |
728 $outputTableArray[0]["other"] = "其他"; | 737 //remove other, full columns |
738 //$outputTableArray[0]["other"] = "其他"; | |
729 $outputTableArray[0]["page"] = "頁數"; | 739 $outputTableArray[0]["page"] = "頁數"; |
730 $outputTableArray[0]["full"] = "全文"; | 740 //$outputTableArray[0]["full"] = "全文"; |
731 | 741 |
732 // id, name, tag, color in _taglistArray | 742 // id, name, tag, color in _taglistArray |
733 foreach ( $_taglistArray as $tagValue ) { | 743 foreach ( $_taglistArray as $tagValue ) { |
734 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name | 744 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name |
735 $content = preg_replace("/<\/".$tag_name.">○*<".$tag_name.">/u", "", $_content); | 745 $content = preg_replace("/<\/".$tag_name.">○*<".$tag_name.">/u", "", $_content); |
741 $count=0; | 751 $count=0; |
742 $pageNow=NULL; | 752 $pageNow=NULL; |
743 foreach ( $contentLineArray as $value ) { | 753 foreach ( $contentLineArray as $value ) { |
744 $count++; | 754 $count++; |
745 $recordString = $value; | 755 $recordString = $value; |
746 $otherString = $recordString; | 756 //$otherString = $recordString; |
747 //echo $recordString."<br>\n"; | 757 //echo $recordString."<br>\n"; |
748 // find hyper link in pattern with <a>...</a> | 758 // find hyper link in pattern with <a>...</a> |
749 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) { | 759 if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) { |
750 $pageNow = $matches[2]; | 760 $pageNow = $matches[2]; |
751 } | 761 } |
762 | |
752 foreach ( $_taglistArray as $tagValue ) { | 763 foreach ( $_taglistArray as $tagValue ) { |
753 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name | 764 $tag_name = $tagValue[2]; // $tagValue[2] is tag_name |
754 | 765 |
755 if ( preg_match_all("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", $recordString, $matches, PREG_SET_ORDER) ) { | 766 if ( preg_match_all("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", $recordString, $matches, PREG_SET_ORDER) ) { |
756 foreach ( $matches as $matchesValue ) { | 767 foreach ( $matches as $matchesValue ) { |
770 } else { | 781 } else { |
771 $outputTableArray[$count][0][$tag_name] = $matchesValue[1]; | 782 $outputTableArray[$count][0][$tag_name] = $matchesValue[1]; |
772 } | 783 } |
773 } | 784 } |
774 } | 785 } |
775 $otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString); | 786 //$otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString); |
776 } | 787 } |
777 } | 788 } |
778 $otherString = preg_replace("/○/u", "", $otherString); | 789 |
779 $outputTableArray[$count]["other"] = $otherString; | 790 //$otherString = preg_replace("/○/u", "", $otherString); |
791 //$outputTableArray[$count]["other"] = $otherString; | |
780 $outputTableArray[$count]["page"] = $pageNow; | 792 $outputTableArray[$count]["page"] = $pageNow; |
781 $value = preg_replace("/>/u", ">", $value); | 793 //$value = preg_replace("/>/u", ">", $value); |
782 $value = preg_replace("/</u", "<", $value); | 794 //$value = preg_replace("/</u", "<", $value); |
783 $outputTableArray[$count]["full"] = $value; | 795 //$outputTableArray[$count]["full"] = $value; |
784 } | 796 } |
785 | 797 |
786 | 798 |
787 | 799 |
788 foreach ( $outputTableArray as $arrayIndex => $arrayValue ) { | 800 foreach ( $outputTableArray as $arrayIndex => $arrayValue ) { |
962 $this->GetSQLValueString($tag, "text"), | 974 $this->GetSQLValueString($tag, "text"), |
963 $this->GetSQLValueString($color, "text"), | 975 $this->GetSQLValueString($color, "text"), |
964 $this->GetSQLValueString($date, "date"), | 976 $this->GetSQLValueString($date, "date"), |
965 $this->GetSQLValueString($id, "int")); | 977 $this->GetSQLValueString($id, "int")); |
966 $result = mysql_query($query); | 978 $result = mysql_query($query); |
979 if (!$result) { | |
980 echo 0; | |
981 } | |
967 | 982 |
968 } | 983 } |
969 | 984 |
970 } | 985 } |
971 | 986 |
1163 $data['tag_others'] = $tag_others; | 1178 $data['tag_others'] = $tag_others; |
1164 | 1179 |
1165 return $data; | 1180 return $data; |
1166 } | 1181 } |
1167 | 1182 |
1168 | |
1169 | |
1170 public function UpdateTagsInTopic($_postdata) { | 1183 public function UpdateTagsInTopic($_postdata) { |
1171 $topic_id = $_postdata['topic_id']; | 1184 $topic_id = $_postdata['topic_id']; |
1172 $tag_ids = json_decode(str_replace('\\', '', $_postdata['ids'])); | 1185 $tag_ids = json_decode(str_replace('\\', '', $_postdata['ids'])); |
1173 | 1186 |
1174 // update topic_tag_relation by tags_ids array as `tag_id` and topic_id as `topic_id` | 1187 // update topic_tag_relation by tags_ids array as `tag_id` and topic_id as `topic_id` |
1244 } | 1257 } |
1245 | 1258 |
1246 | 1259 |
1247 } | 1260 } |
1248 | 1261 |
1249 | |
1250 private function GetTaglistByTopicId($topic_id) { | 1262 private function GetTaglistByTopicId($topic_id) { |
1251 $taglistArray = array(); | 1263 $taglistArray = array(); |
1252 // select taglist ids from topic_tag_relation table | 1264 // select taglist ids from topic_tag_relation table |
1253 $query = sprintf("SELECT * FROM `TopicTagRelation` WHERE `topicId`='%s'", $topic_id); | 1265 $query = sprintf("SELECT * FROM `TopicTagRelation` WHERE `topicId`='%s'", $topic_id); |
1254 $result = mysql_query($query); | 1266 $result = mysql_query($query); |
1293 //array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],)); | 1305 //array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['name_en'],'name_ch'=>$row['name_ch'],'name_pinyin'=>$row['name_pinyin'],)); |
1294 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['nameEn'],'name_ch'=>$row['nameCh'],'name_pinyin'=>$row['namePinyin'],)); | 1306 array_push($topiclistArray, array('id'=>$row['id'],'tag'=>$row['tag'],'name_en'=>$row['nameEn'],'name_ch'=>$row['nameCh'],'name_pinyin'=>$row['namePinyin'],)); |
1295 } | 1307 } |
1296 return $topiclistArray; | 1308 return $topiclistArray; |
1297 } | 1309 } |
1298 | |
1299 | 1310 |
1300 private function GetTopicTag($topic_id) { | 1311 private function GetTopicTag($topic_id) { |
1301 $result = $this->GetTopicById($topic_id); | 1312 $result = $this->GetTopicById($topic_id); |
1302 $row = mysql_fetch_assoc($result); | 1313 $row = mysql_fetch_assoc($result); |
1303 $tag = $row['tag']; | 1314 $tag = $row['tag']; |
1540 // ---- | 1551 // ---- |
1541 | 1552 |
1542 // if the text is from file system | 1553 // if the text is from file system |
1543 $stringInput = preg_replace("/ /u", "○", $stringInput); | 1554 $stringInput = preg_replace("/ /u", "○", $stringInput); |
1544 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); | 1555 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); |
1545 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); | 1556 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); |
1546 | 1557 |
1547 $this->branch_id = 1; // testing at local | 1558 $this->branch_id = 1; // testing at local |
1548 | 1559 |
1549 } else { | 1560 } else { |
1550 $query = sprintf("SELECT `content`, `line`, `books_id` FROM `contents` WHERE `books_id`=\"%s\" AND `line`>=%d AND `line`<=%d", $bookId, $startPage, $endPage); | 1561 $query = sprintf("SELECT `content`, `line`, `books_id` FROM `contents` WHERE `books_id`=\"%s\" AND `line`>=%d AND `line`<=%d", $bookId, $startPage, $endPage); |
1559 // the text is from database | 1570 // the text is from database |
1560 $stringInput = $contentString; | 1571 $stringInput = $contentString; |
1561 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); | 1572 $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); |
1562 $stringInput = preg_replace("/ /u", "○", $stringInput); | 1573 $stringInput = preg_replace("/ /u", "○", $stringInput); |
1563 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); | 1574 $stringInput = preg_replace("/\n/u", "<br>", $stringInput); |
1564 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); | 1575 $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); |
1565 } | 1576 } |
1566 | 1577 |
1567 return $stringInput; | 1578 return $stringInput; |
1568 | 1579 |
1569 } | 1580 } |