Mercurial > hg > extraction-interface
changeset 115:507070df62e6 extractapp
add escape rule for regular expression
author | Calvin Yeh <cyeh@mpiwg-berlin.mpg.de> |
---|---|
date | Thu, 28 Sep 2017 17:13:39 +0200 |
parents | 7d6a107c37da |
children | 0ef426b5a1b9 |
files | models/extractapp.php |
diffstat | 1 files changed, 35 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/models/extractapp.php Thu Sep 28 14:01:59 2017 +0200 +++ b/models/extractapp.php Thu Sep 28 17:13:39 2017 +0200 @@ -64,7 +64,7 @@ $stringInput = $lg_text; $stringInput = preg_replace("/ /u", "○", $stringInput); $stringInput = preg_replace("/\n/u", "<br>", $stringInput); - $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); + $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); $lg_text = $stringInput; $this->lg_text = $lg_text; @@ -102,12 +102,16 @@ $book_meta->dynasty = $b['dynasty']; $book_meta->start_year = $b['start_year']; $book_meta->end_year = $b['end_year']; + $book_meta->book_year = $b['book_year']; + $book_meta->edition_year = $b['edition_year']; $book_meta->line = $b['line']; $book_meta->volume = $b['volume']; $book_meta->author = $b['author']; $book_meta->edition = $b['edition']; $book_meta->in_jibengujiku = $b['in_jibengujiku']; $book_meta->admin_type = $b['admin_type']; + $book_meta->book_year = $b['book_year']; + $book_meta->edition_year = $b['edition_year']; $coordinates = $section_meta['section']['coordinates_books']; $book_meta->x = $coordinates['x']; @@ -147,7 +151,7 @@ $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); $stringInput = preg_replace("/ /u", "○", $stringInput); $stringInput = preg_replace("/\n/u", "<br>", $stringInput); - $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$this->book_id."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); + $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); $lg_text = $stringInput; $this->lg_text = $lg_text; @@ -161,7 +165,6 @@ $this->lg_text = $this->GetSectionContent(); } - public function SetInfoFromPreviousPage($_postdata) { /** * Get information from post data passsed by the previous page, and set variables in $this. @@ -214,7 +217,6 @@ } - private function TaglistSubsetIn($list1, $list2) { // $l1 is a subset of $l2 or not // l1 and l2: array( $row['id'], $row['name'], $row['tag'], $row['color'] ) $cnt_list1 = count($list1); @@ -304,6 +306,7 @@ $this->lg_text = $lg_text; } + public function StartTagging() { /** * This is the main method for tagging text. It passes all the information to "views/Extractapp/TaggingText.php" view. @@ -400,7 +403,10 @@ $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); $require = preg_replace('/&/u', "&", $require); - $require = preg_replace('/ /u', " ", $require); //avoiding invaild xml format + $require = preg_replace('/ /u', " ", $require); + $require = preg_replace('/<font [^><]*>/u', "", $require); + $require = preg_replace('/<\/font>/u', "", $require); + $require = preg_replace('/ style="background-color: transparent;"/u', "", $require); $require = preg_replace("/○/u", " ", $require); $require = preg_replace("/<br>/u", "\n", $require); $require = preg_replace("/<br>/u", "\n", $require); @@ -409,7 +415,6 @@ } - public function UpdateInfoResponsedFromLGService($response) { /** * @@ -468,6 +473,8 @@ $text .= "<dynasty>".$book->dynasty."</dynasty>\n"; $text .= "<start_year>".$book->start_year."</start_year>\n"; $text .= "<end_year>".$book->end_year."</end_year>\n"; + $text .= "<book_year>".$book->book_year."</book_year>\n"; + $text .= "<edition_year>".$book->edition_year."</edition_year>\n"; $text .= "<line>".$book->line."</line>\n"; $text .= "<volume>".$book->volume."</volume>\n"; $text .= "<author>".$book->author."</author>\n"; @@ -540,7 +547,10 @@ $require = preg_replace("/【<a(.*?)>(.*?)<\/a>】/u", "【\\2】", $require); $require = preg_replace('/&/u', "&", $require); - $require = preg_replace('/ /u', " ", $require); //avoiding invaild xml format + $require = preg_replace('/ /u', " ", $require); + $require = preg_replace('/<font [^><]*>/u', "", $require); + $require = preg_replace('/<\/font>/u', "", $require); + $require = preg_replace('/ style="background-color: transparent;"/u', "", $require); $require = preg_replace("/○/u", " ", $require); $require = preg_replace("/<br(.*?)>/u", "\n", $require); //$require = preg_replace("/<br>/u", "\n", $require); @@ -638,6 +648,7 @@ return $filenames; } + public function LoadSmartRegex($topic_id) { // Load regex file based on current topic. Only shows the regex in this topic -- @@ -661,7 +672,6 @@ return; } - public function SaveSmartRegex($_postdata) { if ($_postdata['text']){ @@ -714,7 +724,6 @@ } } - private function GetTableArray($_taglistArray, $_topic_tag, $_content) { $outputTableArray = array(); @@ -725,9 +734,10 @@ $outputTableArray[0][0][$value[2]] = $value[1]; $outputTableArray[0][1][$value[2]] = $value[1]."(Title)"; } - $outputTableArray[0]["other"] = "其他"; + //remove other, full columns + //$outputTableArray[0]["other"] = "其他"; $outputTableArray[0]["page"] = "頁數"; - $outputTableArray[0]["full"] = "全文"; + //$outputTableArray[0]["full"] = "全文"; // id, name, tag, color in _taglistArray foreach ( $_taglistArray as $tagValue ) { @@ -743,12 +753,13 @@ foreach ( $contentLineArray as $value ) { $count++; $recordString = $value; - $otherString = $recordString; + //$otherString = $recordString; //echo $recordString."<br>\n"; // find hyper link in pattern with <a>...</a> if ( preg_match("/【<a(.*?)>(.*?)<\/a>】/u", $recordString, $matches) ) { $pageNow = $matches[2]; } + foreach ( $_taglistArray as $tagValue ) { $tag_name = $tagValue[2]; // $tagValue[2] is tag_name @@ -772,15 +783,16 @@ } } } - $otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString); + //$otherString = preg_replace("/<".$tag_name.">(.*?)<\/".$tag_name.">/u", " ", $otherString); } } - $otherString = preg_replace("/○/u", "", $otherString); - $outputTableArray[$count]["other"] = $otherString; + + //$otherString = preg_replace("/○/u", "", $otherString); + //$outputTableArray[$count]["other"] = $otherString; $outputTableArray[$count]["page"] = $pageNow; - $value = preg_replace("/>/u", ">", $value); - $value = preg_replace("/</u", "<", $value); - $outputTableArray[$count]["full"] = $value; + //$value = preg_replace("/>/u", ">", $value); + //$value = preg_replace("/</u", "<", $value); + //$outputTableArray[$count]["full"] = $value; } @@ -964,6 +976,9 @@ $this->GetSQLValueString($date, "date"), $this->GetSQLValueString($id, "int")); $result = mysql_query($query); + if (!$result) { + echo 0; + } } @@ -1165,8 +1180,6 @@ return $data; } - - public function UpdateTagsInTopic($_postdata) { $topic_id = $_postdata['topic_id']; $tag_ids = json_decode(str_replace('\\', '', $_postdata['ids'])); @@ -1246,7 +1259,6 @@ } - private function GetTaglistByTopicId($topic_id) { $taglistArray = array(); // select taglist ids from topic_tag_relation table @@ -1296,7 +1308,6 @@ return $topiclistArray; } - private function GetTopicTag($topic_id) { $result = $this->GetTopicById($topic_id); $row = mysql_fetch_assoc($result); @@ -1542,7 +1553,7 @@ // if the text is from file system $stringInput = preg_replace("/ /u", "○", $stringInput); $stringInput = preg_replace("/\n/u", "<br>", $stringInput); - $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); + $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); $this->branch_id = 1; // testing at local @@ -1561,7 +1572,7 @@ $stringInput = preg_replace("/<(.*?)>/u", "○", $stringInput); $stringInput = preg_replace("/ /u", "○", $stringInput); $stringInput = preg_replace("/\n/u", "<br>", $stringInput); - $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"review_index_xml_images.php?books_id=".$bookId."&pages=\\1&entry=0\" target=\"_bookImg\">\\1</a>】", $stringInput); + $stringInput = preg_replace("/【(.*?)】/u", "【<a href=\"#\">\\1</a>】", $stringInput); } return $stringInput;