annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children 7d6d969b10cf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.StringReader;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.util.ArrayList;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import java.util.Collections;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import java.util.Hashtable;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import org.xml.sax.*;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 public class XmlTokenizerContentHandler implements ContentHandler {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 private static int ELEMENT_TYPE_CHARACTERS = 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 private static int ELEMENT_TYPE_COMPLEX = 2;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 private String[] normalizeFunctions = {}; // default: without normalize functions
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 private String[] nwbElements = {}; // non word breaking elements, default: these elements
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 private String[] stopElements = {}; // default: no stop elements
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 private String[] outputOptions = {};
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 private String xmlnsString = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 private String language;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 private String outputXmlFragment = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 private Element rootElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 private Element currentElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 private ArrayList<Element> elementQueue;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 if (normalizeFunctions == null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 String[] emptyFunctions = {};
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 this.normalizeFunctions = emptyFunctions;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 this.normalizeFunctions = normalizeFunctions;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 this.language = language;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 public void setNWBElements(String[] nwbElements) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 this.nwbElements = nwbElements;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 public void setStopElements(String[] stopElements) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 this.stopElements = stopElements;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 public void setOutputOptions(String[] outputOptions) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 this.outputOptions = outputOptions;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 public String getXmlFragment() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 return outputXmlFragment;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 public void startDocument() throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 public void endDocument() throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 try {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 String rootElemToStr = rootElement.toXmlString();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 write(rootElemToStr);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 write("\n");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 } catch (NullPointerException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 throw new SAXException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 public void characters(char[] c, int start, int length) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
74 char[] cCopy = new char[length];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 System.arraycopy(c, start, cCopy, 0, length);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 String charactersStr = String.valueOf(cCopy);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
77 if (charactersStr != null && ! charactersStr.equals("")) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78 if (currentElement != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 charElement.value = StringUtils.deresolveXmlEntities(charactersStr);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81 if (currentElement.composites == null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 currentElement.composites = new ArrayList<Element>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 currentElement.composites.add(charElement);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 public void processingInstruction(String target, String data) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 public void setDocumentLocator(Locator locator) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 public void startPrefixMapping(String prefix, String uri) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 if (prefix != null && prefix.equals(""))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" ";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 public void endPrefixMapping(String prefix) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
104 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
105
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 public void skippedEntity(String name) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 if (elementQueue == null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111 elementQueue = new ArrayList<Element>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 Element newElement = new Element(name); // element of type: complex
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 if (currentElement != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 if (currentElement.composites == null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 currentElement.composites = new ArrayList<Element>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 if (currentElement.lang != null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 newElement.lang = currentElement.lang; // language is inherited to childs
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 currentElement.composites.add(newElement);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 currentElement = newElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121 int attrSize = attrs.getLength();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
122 String attrString = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
123 for (int i=0; i<attrSize; i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
124 String attrQName = attrs.getQName(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
125 String attrValue = attrs.getValue(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
126 attrValue = StringUtils.forXML(attrValue);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
127 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
128 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang")))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
129 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 currentElement.attrString = attrString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 if (! xmlnsString.equals("")) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 currentElement.xmlnsString = xmlnsString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 xmlnsString = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 elementQueue.add(currentElement);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137 // only the first element is the root element
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
138 if(rootElement == null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 rootElement = currentElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
141
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 public void endElement(String uri, String localName, String name) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143 if (elementQueue != null && elementQueue.size() > 0) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 int lastIndex = elementQueue.size() - 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
145 elementQueue.remove(lastIndex);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 if (elementQueue != null && elementQueue.size() > 0) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
148 int lastIndex = elementQueue.size() - 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 currentElement = elementQueue.get(lastIndex);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151 currentElement = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155 private boolean withForms() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 boolean result = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
157 for (int i=0; i< outputOptions.length; i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
158 String function = outputOptions[i];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
159 if (function.equals("withForms"))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
160 return true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
161 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
162 return result;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
163 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
164
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
165 private boolean withLemmas() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
166 boolean result = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
167 for (int i=0; i< outputOptions.length; i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
168 String function = outputOptions[i];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
169 if (function.equals("withLemmas"))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
170 return true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
171 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
172 return result;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
173 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
174
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
175 private void write(String outStr) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
176 outputXmlFragment += outStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
177 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
178
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
179 private class Element {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
180 private int type;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
181 private String name;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
182 private String xmlnsString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
183 private String attrString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
184 private String value;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
185 private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
186 private ArrayList<Element> composites;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
187
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
188 private Element(String name) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
189 this.type = ELEMENT_TYPE_COMPLEX;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
190 this.name = name;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
191 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
192
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
193 private Element(String name, int type) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
194 this.type = type;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
195 this.name = name;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
196 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
197
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
198 private boolean isComplex() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
199 boolean isComplex = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
200 if (type == ELEMENT_TYPE_COMPLEX)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
201 isComplex = true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
202 return isComplex;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
203 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
204
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
205 private boolean isWordDelimiterElement() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
206 boolean isWordDelimiterElement = true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
207 for (int i=0; i<nwbElements.length; i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
208 String nwbElementName = nwbElements[i];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
209 if (name.equals(nwbElementName)) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
210 isWordDelimiterElement = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
211 break;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
212 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
213 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
214 return isWordDelimiterElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
215 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
216
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
217 private boolean isStopElement() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
218 boolean isStopElement = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
219 for (int i=0; i<stopElements.length; i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
220 String stopElementName = stopElements[i];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
221 if (name.equals(stopElementName)) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
222 isStopElement = true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
223 break;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
224 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
225 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
226 return isStopElement;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
227 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
228
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
229 private String toXmlString() throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
230 String retString = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
231 String elemLanguage = language; // default value for the document/page
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
232 if (lang != null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
233 elemLanguage = lang; // value of the element if available
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
234 // write this element
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
235 if (! isComplex()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
236 retString += value;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
237 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
238 String xmlNsString = this.xmlnsString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
239 if (xmlNsString == null || xmlNsString.equals("")) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
240 retString = retString + "<" + name + attrString + ">";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
241 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
242 retString = retString + "<" + name + " " + xmlNsString + attrString + ">";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
243 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
244 if (composites != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
245 String compositesCharsWithMarks = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
246 ArrayList<Element> complexElements = new ArrayList<Element>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
247 for (int i=0; i<composites.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
248 Element composite = composites.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
249 if (! composite.isComplex()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
250 if (composite.value != null && ! composite.value.equals("")) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
251 String compositeValueStr = composite.value;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
252 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words.
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
253 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
254 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
255 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
256 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
257 if (! composite.isWordDelimiterElement()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
258 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
259 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
260 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
261 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
262 complexElements.add(composite);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
263 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
264 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
265 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
266 String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
267 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
268 if (complexElements.size() > 0) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
269 for (int i=0; i<complexElements.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
270 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
271 Element complexElem = complexElements.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
272 String complexElementStr = complexElem.toXmlString();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
273 String firstPiece = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
274 if (indexComplexElemCompositesCharsWithMarks > 0) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
275 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
276 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
277 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
278 retString = retString + firstPiece + complexElementStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
279 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
280 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
281 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
282 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
283 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
284 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
285 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
286 retString = retString + "</" + name + ">";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
287 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
288 return retString;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
289 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
290
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
291 private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
292 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
293 String retStr = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
294 try {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
295 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr));
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
296 tokenizer.setLanguage(language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
297 tokenizer.setNormFunctions(normalizeFunctions);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
298 ArrayList<Token> tokens = tokenizer.getTokens();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
299 int endPos = 0;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
300 for (int i=0; i < tokens.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
301 Token token = tokens.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
302 String wordForm = token.getContent();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
303 int startPos = token.getStart();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
304 String beforeStr = charactersStr.substring(endPos, startPos);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
305 endPos = token.getEnd();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
306 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
307 String origWordForm = charactersStr.substring(startPos, endPos);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
308 String wordTag = insertWordTags(wordForm, language, origWordForm);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
309 retStr = retStr + beforeStrDeresolved + wordTag;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
310 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
311 String lastAfterStr = charactersStr.substring(endPos);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
312 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
313 retStr = retStr + lastAfterStrDeresolved;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
314 } catch (ApplicationException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
315 throw new SAXException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
316 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
317 return retStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
318 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
319
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
320 private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
321 String wordTag = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
322 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
323 return origWordForm;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
324 if (isStopElement())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
325 return origWordForm;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
326 wordForm = removeSpecialSymbols(wordForm);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
327 wordForm = wordForm.toLowerCase();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
328 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
329 ArrayList<Lemma> lemmas = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
330 if (withForms() || withLemmas()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
331 LexHandler lexHandler = LexHandler.getInstance();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
332 lemmas = lexHandler.getLemmas(wordForm, "form", language, "none");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
333 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
334 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
335 return wordTag;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
336 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
337
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
338 /**
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
339 *
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
340 * @param origWordToken could contain nwd marks
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
341 * @param wordForm contains no nwd marks
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
342 * @param language
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
343 * @param origWordFormNormalized
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
344 * @param lemmas
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
345 * @return for each substring between nwd marks create a word tag
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
346 */
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
347 private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
348 if (origWordToken.isEmpty())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
349 return origWordToken;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
350 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
351 return COMPLEX_ELEMENT_NWD_MARK;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
352 String retWordTags = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
353 String origWordTokenTmp = origWordToken;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
354 while (! origWordTokenTmp.isEmpty()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
355 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
356 origWordTokenTmp = origWordTokenTmp.substring(1);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
357 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
358 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
359 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
360 if (indexUpToNWD != -1) { // not end of string reached
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
361 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
362 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
363 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
364 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
365 } else { // end of string reached
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
366 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length());
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
367 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
368 retWordTags = retWordTags + origWordTokenFragmentWithTags;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
369 origWordTokenTmp = ""; // finente
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
370 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
371 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
372 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
373 return retWordTags;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
374 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
375
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
376 private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
377 if (origWordForm == null || origWordForm.isEmpty())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
378 return "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
379 String langISOCode = Language.getInstance().getISO639Code(language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
380 String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\"";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
381 if (origWordFormNormalized != null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
382 retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\"";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
383 if (lemmas != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
384 String lemmasStr = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
385 String formsStr = "";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
386 Collections.sort(lemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
387 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
388 for (int i=0; i < lemmas.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
389 Lemma lemma = lemmas.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
390 ArrayList<Form> lemmaForms = lemma.getFormsList();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
391 for (int j=0; j < lemmaForms.size(); j++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
392 Form form = lemmaForms.get(j);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
393 formsHashtable.put(form.getFormName(), form);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
394 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
395 String lemmaName = lemma.getLemmaName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
396 lemmasStr = lemmasStr + lemmaName + " ";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
397 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
398 ArrayList<Form> forms = new ArrayList<Form>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
399 forms.addAll(formsHashtable.values());
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
400 Collections.sort(forms);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
401 for (int i=0; i < forms.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
402 Form form = forms.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
403 String formName = form.getFormName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
404 formName = StringUtils.forXML(formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
405 formsStr = formsStr + formName + " ";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
406 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
407 if (formsStr.endsWith(" "))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
408 formsStr = formsStr.substring(0, formsStr.length() - 1);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
409 if (lemmasStr.endsWith(" "))
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
410 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
411 if (withForms())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
412 retStr = retStr + " forms=\"" + formsStr + "\"";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
413 if (withLemmas())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
414 retStr = retStr + " lemmas=\"" + lemmasStr + "\"";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
415 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
416 retStr = retStr + ">" + origWordForm + "</w>";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
417 return retStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
418 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
419
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
420 private String removeSpecialSymbols(String inputStr) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
421 String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", "");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
422 return retStr;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
423 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
424
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
425 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
426 }