Mercurial > hg > extraction-interface
comparison js/taggingtext.js @ 89:e681d693240e extractapp
new: generated regex to SmartRegex
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Wed, 03 Jun 2015 16:54:09 +0200 |
| parents | fb5049fc5dd7 |
| children | a05491461199 |
comparison (legend for the two-column diff below: equal, deleted, inserted, replaced)
| 88:61593b047289 | 89:e681d693240e |
|---|---|
| 576 }); | 576 }); |
| 577 | 577 |
| 578 function genRegexWindowOpen(){ | 578 function genRegexWindowOpen(){ |
| 579 var btn_state = $('#regex_generator').css('display'); | 579 var btn_state = $('#regex_generator').css('display'); |
| 580 if (btn_state == "block") { | 580 if (btn_state == "block") { |
| 581 $("#regex_generator").css("display", "none"); | 581 genRegexWindowClose(); |
| 582 $("#gen_regex_window_open_id").text("Open Gen Regex"); | |
| 583 } else { | 582 } else { |
| 584 $('#regex_generator').css("display", "block"); | 583 $('#regex_generator').css("display", "block"); |
| 585 $("#gen_regex_window_open_id").text("Close Gen Regex"); | 584 $("#gen_regex_window_open_id").text("Close Gen Regex"); |
| 586 } | 585 } |
| 587 } | 586 } |
| 588 function genRegexWindowClose(){ | 587 function genRegexWindowClose(){ |
| 589 $('#regex_generator').css("display", "none"); | 588 $('#regex_generator').css("display", "none"); |
| 589 $("#gen_regex_window_open_id").text("Open Gen Regex"); | |
| 590 } | 590 } |
| 591 | 591 |
| 592 function sharedStart_(array){ | 592 function sharedStart_(array){ |
| 593 var A= array.concat().sort(), | 593 var A= array.concat().sort(), |
| 594 a1= A[0], a2= A[A.length-1], L= a1.length, i= 0; | 594 a1= A[0], a2= A[A.length-1], L= a1.length, i= 0; |
| 595 while(i<L && a1.charAt(i)=== a2.charAt(i)) i++; | 595 while(i<L && a1.charAt(i)=== a2.charAt(i)) i++; |
| 596 return a1.substring(0, i); | 596 return a1.substring(0, i); |
| 597 } | |
| 598 | |
| 599 function getTagNameByTag(tag){ | |
| 600 var name = ""; | |
| 601 // taglistArray is a global variable | |
| 602 for (var i = 0; i < taglistArray.length; i++) { | |
| 603 var taglistValue = taglistArray[i]; | |
| 604 | |
| 605 var _tag = taglistValue[2]; | |
| 606 var _name = taglistValue[1]; | |
| 607 if (_tag == tag) { | |
| 608 name = _name; | |
| 609 } | |
| 610 } | |
| 611 | |
| 612 return name; | |
| 613 } | |
| 614 | |
| 615 function genRegexAddToSmartRegex() { | |
| 616 // append blocks of generated regex to smart regex | |
| 617 | |
| 618 smartRegexEmpty(); // clear | |
| 619 | |
| 620 var reg_obj = getSuggestedRegex(); | |
| 621 console.log(reg_obj); | |
| 622 | |
| 623 for (var i = 0; i < reg_obj.length; i++) { | |
| 624 if (reg_obj[i].txt != "") { | |
| 625 | |
| 626 var newdiv = document.createElement("span"); | |
| 627 | |
| 628 $(newdiv).css("border", "1px solid black"); | |
| 629 $(newdiv).css("width", "100px"); | |
| 630 | |
| 631 if (reg_obj[i].tag == null) { | |
| 632 if (reg_obj[i].txt == "○") { | |
| 633 $(newdiv).text("空白"); | |
| 634 } else { | |
| 635 $(newdiv).text(reg_obj[i].txt); | |
| 636 } | |
| 637 $(newdiv).attr("class", "span_NOTAG"); | |
| 638 $(newdiv).attr("regexReplace","NOTAG"); | |
| 639 } else { | |
| 640 | |
| 641 var name = getTagNameByTag(reg_obj[i].tag); | |
| 642 $(newdiv).text(name+"名"); | |
| 643 | |
| 644 $(newdiv).attr("class", "span_"+reg_obj[i].tag); | |
| 645 $(newdiv).attr("regexReplace", reg_obj[i].tag); | |
| 646 | |
| 647 } | |
| 648 | |
| 649 regex_element_index += 1; | |
| 650 $(newdiv).attr("id", "regex_elem_"+regex_element_index); | |
| 651 | |
| 652 $(newdiv).attr("regexText", reg_obj[i].txt); | |
| 653 | |
| 654 $('#smartRegexShowDiv').append(newdiv); | |
| 655 | |
| 656 } | |
| 657 } | |
| 658 | |
| 597 } | 659 } |
| 598 | 660 |
| 599 function longestCommonSubstring_(string1, string2){ | 661 function longestCommonSubstring_(string1, string2){ |
| 600 // init max value | 662 // init max value |
| 601 var longestCommonSubstring = 0; | 663 var longestCommonSubstring = 0; |
| 656 } | 718 } |
| 657 return s1.substring(start_idx, (start_idx + max_len)); | 719 return s1.substring(start_idx, (start_idx + max_len)); |
| 658 | 720 |
| 659 } | 721 } |
| 660 | 722 |
| 661 function getRegex(_pattern) { | 723 var suggestedRegex = []; |
| 724 | |
| 725 function setSuggestedRegex(_pattern) { | |
| 726 /* | |
| 662 console.log(_pattern[0]); | 727 console.log(_pattern[0]); |
| 663 console.log(_pattern[1]); | 728 console.log(_pattern[1]); |
| 729 */ | |
| 664 var p0 = _pattern[0]; | 730 var p0 = _pattern[0]; |
| 665 var p1 = _pattern[1]; | 731 var p1 = _pattern[1]; |
| 666 | 732 |
| 667 // TODO: find common pattern | 733 // TODO: find common pattern |
| 668 var reg_str = ""; | 734 suggestedRegex = []; // it's a global variable |
| 669 // _p1 = 測試 | 735 |
| 670 // _p2 = 測<tag_name>試</tag_name>一下 | |
| 671 var combined = []; | |
| 672 if (p0.length > p1.length) { | 736 if (p0.length > p1.length) { |
| 673 combined = p0; | 737 suggestedRegex = p0; |
| 674 } else if(p0.length < p1.length) { | 738 } else if(p0.length < p1.length) { |
| 675 combined = p1; | 739 suggestedRegex = p1; |
| 676 } else { // equal length | 740 } else { // equal length |
| 677 // find matching string | 741 // find matching string |
| 678 var cnt = p0.length; | 742 var cnt = p0.length; |
| 679 for (var i = 0; i < cnt; i++) { | 743 for (var i = 0; i < cnt; i++) { |
| 680 if (p1[i].tag != null) { | 744 if (p1[i].tag != null) { |
| 681 combined.push({tag:p1[i].tag, txt:"[^○如即而之有<>〈〉【】]{1,"+p1[i].txt.length+"}"}); | 745 suggestedRegex.push({tag:p1[i].tag, txt:"[^○如即而之有<>〈〉【】]{1,"+p1[i].txt.length+"}"}); |
| 682 } else if (p0[i].tag != null) { | 746 } else if (p0[i].tag != null) { |
| 683 combined.push({tag:p0[i].tag, txt:"[^○如即而之有<>〈〉【】]{1,"+p0[i].txt.length+"}"}); | 747 suggestedRegex.push({tag:p0[i].tag, txt:"[^○如即而之有<>〈〉【】]{1,"+p0[i].txt.length+"}"}); |
| 684 } else { | 748 } else { |
| 685 // find matching for text in each corresponding position | 749 // find matching for text in each corresponding position |
| 686 var texts = [p0[i].txt, p1[i].txt]; | 750 var texts = [p0[i].txt, p1[i].txt]; |
| 687 var common = longestCommonSubstring(p0[i].txt, p1[i].txt); | 751 var common = longestCommonSubstring(p0[i].txt, p1[i].txt); |
| 688 | 752 |
| 692 common[i]; | 756 common[i]; |
| 693 reg_for_common += common[i]+"|"; | 757 reg_for_common += common[i]+"|"; |
| 694 }; | 758 }; |
| 695 reg_for_common += "]"; | 759 reg_for_common += "]"; |
| 696 */ | 760 */ |
| 697 combined.push({tag:null, txt:common}); | 761 suggestedRegex.push({tag:null, txt:common}); |
| 698 } | 762 } |
| 699 }; | 763 }; |
| 700 } | 764 } |
| 701 | 765 |
| 702 for (var i = 0; i < combined.length; i++) { | 766 |
| 703 reg_str += combined[i].txt; | 767 } |
| 704 }; | 768 |
| 705 | 769 function getSuggestedRegex(){ |
| 706 return reg_str; | 770 return suggestedRegex; |
| 707 } | 771 } |
| 708 | 772 |
| 709 var pattern_obj = []; // record pattern array for regex generator. only contain pattern1 and pattern2 | 773 var pattern_obj = []; // record pattern array for regex generator. only contain pattern1 and pattern2 |
| 710 | 774 |
| 711 function genRegexBySelection(tag_item_div, _selection) { | 775 function genRegexBySelection(tag_item_div, _selection) { |
| 712 var add_gen_regex_button = document.createElement("button"); | 776 var add_gen_regex_button = document.createElement("button"); |
| 713 $(add_gen_regex_button).id = "addToGenRegex"; | 777 $(add_gen_regex_button).id = "addToGenRegex"; |
| 714 $(add_gen_regex_button).addClass("btn btn-md"); | 778 $(add_gen_regex_button).addClass("btn btn-md"); |
| 715 $(add_gen_regex_button).click( function(){ | 779 $(add_gen_regex_button).click( function(){ |
| 716 // popup for selected words regex gen | 780 // popup for selected words regex gen |
| 781 /* | |
| 717 console.log("Debug: "); | 782 console.log("Debug: "); |
| 718 console.log(_selection); | 783 console.log(_selection); |
| 719 | 784 */ |
| 785 | |
| 720 if (_selection.type == "Range") { | 786 if (_selection.type == "Range") { |
| 721 // select words, not just click on text | 787 // select words, not just click on text |
| 722 var anchor_node = _selection.anchorNode; | 788 var anchor_node = _selection.anchorNode; |
| 723 var focus_node = _selection.focusNode; | 789 var focus_node = _selection.focusNode; |
| 724 var sibling_node = anchor_node.nextElementSibling; | 790 var sibling_node = anchor_node.nextElementSibling; |
| 749 | 815 |
| 750 $(seleted_div).text(text_before+tagged_text+text_after); | 816 $(seleted_div).text(text_before+tagged_text+text_after); |
| 751 seleted_obj.push({tag:null, txt:text_before}); | 817 seleted_obj.push({tag:null, txt:text_before}); |
| 752 seleted_obj.push({tag:tag_name, txt:tagged_text}); | 818 seleted_obj.push({tag:tag_name, txt:tagged_text}); |
| 753 seleted_obj.push({tag:null, txt:text_after}); | 819 seleted_obj.push({tag:null, txt:text_after}); |
| 754 | 820 /* |
| 755 | |
| 756 console.log(text_before); | 821 console.log(text_before); |
| 757 console.log(tag_name); | 822 console.log(tag_name); |
| 758 console.log(tagged_text); | 823 console.log(tagged_text); |
| 759 console.log(text_after); | 824 console.log(text_after); |
| 825 */ | |
| 760 } | 826 } |
| 761 | 827 |
| 762 | 828 |
| 763 var generated_regex = ""; | 829 |
| 830 var generated_regex_plaintext = ""; | |
| 764 // show generate regex window | 831 // show generate regex window |
| 765 $('#regex_generator').css("display", "block"); | 832 $('#regex_generator').css("display", "block"); |
| 766 $("#gen_regex_window_open_id").text("Close Gen Regex"); | 833 $("#gen_regex_window_open_id").text("Close Gen Regex"); |
| 767 | 834 |
| 768 //var seleted_text = String(_selection).replace(/^\s+|\s+$/g,''); | 835 //var seleted_text = String(_selection).replace(/^\s+|\s+$/g,''); |
| 773 pattern_obj.push(seleted_obj); | 840 pattern_obj.push(seleted_obj); |
| 774 // pattern1.text(seleted_div.text()); | 841 // pattern1.text(seleted_div.text()); |
| 775 } else if (pattern2.children().length == 0) { | 842 } else if (pattern2.children().length == 0) { |
| 776 pattern2.append(seleted_div); | 843 pattern2.append(seleted_div); |
| 777 pattern_obj.push(seleted_obj); | 844 pattern_obj.push(seleted_obj); |
| 778 //pattern2.text(seleted_div.text()); | 845 |
| 779 generated_regex = getRegex(pattern_obj); | 846 setSuggestedRegex(pattern_obj); |
| 847 var generated_regex = getSuggestedRegex(); | |
| 848 | |
| 849 // get plaintext from generated_regex obj | |
| 850 for (var i = 0; i < generated_regex.length; i++) { | |
| 851 generated_regex_plaintext += generated_regex[i].txt; | |
| 852 } | |
| 780 | 853 |
| 781 } else { | 854 } else { |
| 782 // pattern1 and pattern2 are already having text | 855 // pattern1 and pattern2 are already having text |
| 783 pattern1.children().remove(); | 856 pattern1.children().remove(); |
| 784 pattern1.append(pattern2.children()); | 857 pattern1.append(pattern2.children()); |
| 786 pattern2.children().remove(); | 859 pattern2.children().remove(); |
| 787 pattern2.append(seleted_div); | 860 pattern2.append(seleted_div); |
| 788 | 861 |
| 789 pattern_obj.shift(); | 862 pattern_obj.shift(); |
| 790 pattern_obj.push(seleted_obj); | 863 pattern_obj.push(seleted_obj); |
| 791 | 864 |
| 792 //pattern1.text(pattern2.text()); | 865 setSuggestedRegex(pattern_obj); |
| 793 //pattern2.text(seleted_div); | 866 var generated_regex = getSuggestedRegex(); |
| 794 generated_regex = getRegex(pattern_obj); | 867 |
| 868 // get plaintext from generated_regex obj | |
| 869 for (var i = 0; i < generated_regex.length; i++) { | |
| 870 generated_regex_plaintext += generated_regex[i].txt; | |
| 871 } | |
| 872 | |
| 795 | 873 |
| 796 } | 874 } |
| 797 $('#generated_regex').text(generated_regex); | 875 //$('#generated_regex').text(generated_regex); |
| 876 $('#generated_regex').text(generated_regex_plaintext); | |
| 798 // --- | 877 // --- |
| 799 | 878 |
| 800 $('#regex_generator_error_msg').text(""); | 879 $('#regex_generator_error_msg').text(""); |
| 801 } else { | 880 } else { |
| 802 $('#regex_generator_error_msg').text("Note: Not a valid selection for regex generator."); | 881 $('#regex_generator_error_msg').text("Note: Not a valid selection for regex generator."); |
