annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 12:fba5577e49d9

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 19 Apr 2011 16:51:26 +0200
parents 59ff47d1e237
children 5df60f24e997
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.doc;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.util.ArrayList;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import org.apache.lucene.analysis.Token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import org.xml.sax.*;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 public class DictionarizerContentHandler implements ContentHandler {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 private static String MARK = "COMPLEXELEMENTTTTT";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 private static int MARK_SIZE = MARK.length();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 private static int ELEMENT_TYPE_CHARACTERS = 1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 private static int ELEMENT_TYPE_COMPLEX = 2;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
18 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 private String xmlnsString = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 private String language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 private String outputXmlFragment = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 private Element rootElement;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 private Element currentElement;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 private ArrayList<Element> elementQueue;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 public DictionarizerContentHandler(String language) throws ApplicationException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 this.language = language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 public String getXmlFragment() {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 return outputXmlFragment;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 public void startDocument() throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 public void endDocument() throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 String rootElemToStr = rootElement.toXmlString();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 write(rootElemToStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 write("\n");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 public void characters(char[] c, int start, int length) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 char[] cCopy = new char[length];
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 System.arraycopy(c, start, cCopy, 0, length);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 String charactersStr = String.valueOf(cCopy);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 if (charactersStr != null && ! charactersStr.equals("")) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 if (currentElement != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 if (currentElement.composites == null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 currentElement.composites = new ArrayList<Element>();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 currentElement.composites.add(charElement);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 public void processingInstruction(String target, String data) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 public void setDocumentLocator(Locator locator) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 public void startPrefixMapping(String prefix, String uri) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
12
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 10
diff changeset
69 if (prefix != null && prefix.equals(""))
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 10
diff changeset
70 xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" ";
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 public void endPrefixMapping(String prefix) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
74 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 public void skippedEntity(String name) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
77 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 if (elementQueue == null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81 elementQueue = new ArrayList<Element>();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 Element newElement = new Element(name); // element of type: complex
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 if (currentElement != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 if (currentElement.composites == null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 currentElement.composites = new ArrayList<Element>();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 if (currentElement.lang != null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87 newElement.lang = currentElement.lang; // language wird an Kinder vererbt
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 currentElement.composites.add(newElement);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 currentElement = newElement;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 int attrSize = attrs.getLength();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 String attrString = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 for (int i=0; i<attrSize; i++) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 String attrQName = attrs.getQName(i);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 String attrValue = attrs.getValue(i);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 attrValue = StringUtilEscapeChars.forXML(attrValue);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang")))
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 currentElement.lang = attrValue; // wenn xml:lang belegt ist, wird es an das neue Element gesetzt und überschreibt vom Vater geerbte Werte
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 currentElement.attrString = attrString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 if (! xmlnsString.equals("")) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 currentElement.xmlnsString = xmlnsString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
104 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
105 xmlnsString = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 elementQueue.add(currentElement);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 // only the first element is the root element
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 if(rootElement == null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 rootElement = currentElement;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 public void endElement(String uri, String localName, String name) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 if (elementQueue != null && elementQueue.size() > 0) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 int lastIndex = elementQueue.size() - 1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 elementQueue.remove(lastIndex);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 if (elementQueue != null && elementQueue.size() > 0) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 int lastIndex = elementQueue.size() - 1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 currentElement = elementQueue.get(lastIndex);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121 currentElement = null;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
122 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
123 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
124
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
125 public int getCharIndex(String compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
126 if (indexComplexElemCompositesCharsWithMarks == 0)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
127 return -1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
128 int size = compositesCharsDictionarized.length();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
129 if (size == 0)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 return -1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 int index = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 int counter = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 boolean isInTag = false;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 boolean success = false;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 while (!success) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 if (counter > size)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137 return -1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
138 char c = compositesCharsDictionarized.charAt(counter);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 if (c == '<')
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 isInTag = true;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
141 if (! isInTag)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 index++;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143 if (index == indexComplexElemCompositesCharsWithMarks)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 success = true;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
145 if (c == '>')
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 isInTag = false;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 counter++;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
148 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 return counter + 1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152 private void write(String outStr) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 outputXmlFragment += outStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 private class Element {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
157 private int type;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
158 private String name;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
159 private String xmlnsString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
160 private String attrString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
161 private String value;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
162 private String lang; // normalerweise mit dem Wert aus dem Attribut xml:lang belegt bzw. mit dem aus dem Vaterknoten wererbten xml:lang-Wert
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
163 private ArrayList<Element> composites;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
164
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
165 private Element(String name) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
166 this.type = ELEMENT_TYPE_COMPLEX;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
167 this.name = name;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
168 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
169
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
170 private Element(String name, int type) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
171 this.type = type;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
172 this.name = name;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
173 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
174
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
175 private boolean isComplex() {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
176 boolean isComplex = false;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
177 if (type == ELEMENT_TYPE_COMPLEX)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
178 isComplex = true;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
179 return isComplex;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
180 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
181
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
182 /**
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
183 * feel free to add/remove some element names
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
184 * @return true if element is a word delimiter element else false
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
185 */
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
186 private boolean isWordDelimiterElement() {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
187 boolean isWordDelimiterElement = true;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
188 if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor"))
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
189 isWordDelimiterElement = false;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
190 return isWordDelimiterElement;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
191 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
192
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
193 private String toXmlString() throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
194 String retString = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
195 String elemLanguage = language; // default value for the document/page
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
196 if (lang != null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
197 elemLanguage = lang; // der Wert des Elements falls vorhanden
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
198 // write this element
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
199 if (! isComplex()) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
200 retString += value;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
201 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
202 String xmlNsString = this.xmlnsString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
203 if (xmlNsString == null || xmlNsString.equals("")) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
204 retString = retString + "<" + name + attrString + ">";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
205 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
206 retString = retString + "<" + name + " " + xmlNsString + attrString + ">";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
207 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
208 if (composites != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
209 String compositesChars = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
210 String compositesCharsWithMarks = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
211 ArrayList<Element> complexElements = new ArrayList<Element>();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
212 for (int i=0; i<composites.size(); i++) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
213 Element composite = composites.get(i);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
214 if (! composite.isComplex()) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
215 if (composite.value != null && ! composite.value.equals("")) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
216 String compositeValueStr = composite.value;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
217 compositesChars = compositesChars + compositeValueStr;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
218 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
219 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
220 } else {
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
221 if (! composite.isWordDelimiterElement()) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
222 compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL; // add a special symbol at the position of the "not word delimiter element" (e.g. line break)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
223 }
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
224 complexElements.add(composite);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
225 compositesCharsWithMarks += MARK;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
226 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
227 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
228 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage);
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
229 compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
230 compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
231 if (complexElements.size() > 0) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
232 for (int i=0; i<complexElements.size(); i++) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
233 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
234 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
235 Element complexElem = complexElements.get(i);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
236 String complexElementStr = complexElem.toXmlString();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
237 String firstPiece = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
238 if (indexComplexElemCompositesCharsDictionarized > 0) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
239 firstPiece = compositesCharsDictionarized.substring(0, indexComplexElemCompositesCharsDictionarized - 1);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
240 compositesCharsDictionarized = compositesCharsDictionarized.substring(indexComplexElemCompositesCharsDictionarized - 1);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
241 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
242 retString = retString + firstPiece + complexElementStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
243 compositesCharsWithMarks = compositesCharsWithMarks.substring(indexComplexElemCompositesCharsWithMarks + MARK_SIZE);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
244 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
245 retString = retString + compositesCharsDictionarized; // last one must also be added
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
246 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
247 retString = retString + compositesCharsDictionarized; // last one must also be added
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
248 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
249 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
250 retString = retString + "</" + name + ">";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
251 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
252 return retString;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
253 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
254
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
255 private String characters2DictWords(String charactersStrDeresolved, String language) throws SAXException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
256 String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
257 String retStr = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
258 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
259 MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
260 ArrayList<Token> wordTokens = dictionarizerAnalyzer.getToken(charactersStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
261 int endPos = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
262 for (int i=0; i < wordTokens.size(); i++) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
263 Token wordToken = wordTokens.get(i);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
264 int startPos = wordToken.startOffset();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
265 String beforeStr = charactersStr.substring(endPos, startPos);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
266 endPos = wordToken.endOffset();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
267 String wordStr = charactersStr.substring(startPos, endPos);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
268 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
269 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
270 String wordTokenText = wordToken.termText();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
271 LexHandler lexHandler = LexHandler.getInstance();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
272 // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form)
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
273 String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
274 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false);
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
275 if (lexEntryKeys != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
276 String lexForms = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
277 for (int j=0; j<lexEntryKeys.size(); j++) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
278 String lexEntryKey = lexEntryKeys.get(j);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
279 lexForms = lexForms + lexEntryKey + " ";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
280 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
281 lexForms = lexForms.substring(0, lexForms.length() - 1);
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
282 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenTextWithoutSpecialSymbols + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>";
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
283 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
284 retStr = retStr + beforeStrDeresolved + wordStrDeresolved;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
285 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
286 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
287 String lastAfterStr = charactersStr.substring(endPos);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
288 String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
289 retStr = retStr + lastAfterStrDeresolved;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
290 } catch (ApplicationException e) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
291 throw new SAXException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
292 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
293 return retStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
294 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
295
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
296 private String removeSpecialSymbols(String inputStr) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
297 String retStr = inputStr.replaceAll(" ", "");
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
298 retStr = retStr.replaceAll("\n", "");
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
299 retStr = retStr.replaceAll("-", "");
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
300 return retStr;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
301 }
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
302 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
303 }