comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/transform/HighlightContentHandler.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.cms.transform;
2
3 import java.util.ArrayList;
4
5 import org.xml.sax.*;
6
7 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
9 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
10
11 public class HighlightContentHandler implements ContentHandler {
12 private String xmlnsString = "";
13 private String highlightElemName;
14 private int highlightElemPos = 1;
15 private int currentHighlightElemPos = 0;
16 private boolean highlightElemMode = false;
17 private int highlightElemModeOpenTags = 0;
18 private String highlightQueryType = "orig"; // orig, reg, norm or morph
19 private String highlightQuery; // complex Lucene query
20 private String highlightQueryForms; // highlight terms separated by a blank
21 private boolean highlightHitMode = false;
22 private int highlightHitModeOpenTags = 0;
23 private boolean firstPageBreakReachedMode = false; // in a page fragment: if a page break element is surrounded by an element (e.g. "s") then this element should not increment the currentHighlightElemPos
24 private boolean firstPageBreakReached = true;
25 private StringBuilder result = new StringBuilder();
26
27 public HighlightContentHandler() throws ApplicationException {
28 }
29
30 public HighlightContentHandler(String highlightElemName, int highlightElemPos) throws ApplicationException {
31 this.highlightElemName = highlightElemName;
32 this.highlightElemPos = highlightElemPos;
33 }
34
35 public HighlightContentHandler(String highlightElemName, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
36 this.highlightElemName = highlightElemName;
37 this.highlightElemPos = highlightElemPos;
38 this.highlightQueryType = highlightQueryType;
39 this.highlightQuery = highlightQuery;
40 if (highlightQuery != null) {
41 IndexHandler indexHandler = IndexHandler.getInstance();
42 ArrayList<String> queryTerms = indexHandler.fetchTerms(highlightQuery, language); // all query terms in query (also morphological terms)
43 highlightQueryForms = toString(queryTerms);
44 }
45 }
46
47 public void setFirstPageBreakReachedMode(boolean firstPageBreakReachedMode) {
48 this.firstPageBreakReachedMode = firstPageBreakReachedMode;
49 if (firstPageBreakReachedMode)
50 this.firstPageBreakReached = false; // is first set to false and later if a page break is found (by startElement) it is set to true
51 }
52
53 public StringBuilder getResult() {
54 return result;
55 }
56
57 public void startDocument() throws SAXException {
58 }
59
60 public void endDocument() throws SAXException {
61 }
62
63 public void characters(char[] c, int start, int length) throws SAXException {
64 char[] cCopy = new char[length];
65 System.arraycopy(c, start, cCopy, 0, length);
66 String charactersStr = String.valueOf(cCopy);
67 if (charactersStr != null && ! charactersStr.equals("")) {
68 charactersStr = StringUtils.deresolveXmlEntities(charactersStr);
69 write(charactersStr);
70 }
71 }
72
73 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
74 }
75
76 public void processingInstruction(String target, String data) throws SAXException {
77 }
78
79 public void setDocumentLocator(Locator locator) {
80 }
81
82 public void startPrefixMapping(String prefix, String uri) throws SAXException {
83 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
84 if (prefix != null && prefix.equals(""))
85 xmlnsString = "xmlns" + "=\"" + uri + "\" ";
86 }
87
88 public void endPrefixMapping(String prefix) throws SAXException {
89 }
90
91 public void skippedEntity(String name) throws SAXException {
92 }
93
94 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
95 int attrSize = attrs.getLength();
96 String attrString = "";
97 for (int i=0; i<attrSize; i++) {
98 String attrQName = attrs.getQName(i);
99 String attrValue = attrs.getValue(i);
100 attrValue = StringUtils.forXML(attrValue);
101 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
102 }
103 if (attrString != null && ! attrString.isEmpty()) {
104 attrString = attrString.trim();
105 }
106 if (xmlnsString != null && ! xmlnsString.isEmpty()) {
107 xmlnsString = xmlnsString.trim();
108 }
109 if (localName.equals("pb"))
110 firstPageBreakReached = true;
111 // start highlight element at position
112 if (highlightElemName != null && highlightElemName.equals(localName) && firstPageBreakReached) {
113 currentHighlightElemPos++;
114 if (currentHighlightElemPos == highlightElemPos && highlightElemModeOpenTags == 0) {
115 highlightElemMode = true;
116 write("<hi type=\"elem\">");
117 }
118 }
119 if (highlightElemMode) {
120 highlightElemModeOpenTags++;
121 }
122 // start highlight query
123 if (highlightQuery != null && localName.equals("w")) {
124 boolean matched = false;
125 String attrQName = "form";
126 if (highlightQueryType.equals("orig"))
127 attrQName = "form";
128 else if (highlightQueryType.equals("reg"))
129 attrQName = "formRegularized";
130 else if (highlightQueryType.equals("norm"))
131 attrQName = "formNormalized";
132 else if (highlightQueryType.equals("morph"))
133 attrQName = "lemmas";
134 String attrValue = getAttrValue(attrs, attrQName);
135 if (highlightQueryType.equals("reg") && attrValue == null)
136 attrValue = getAttrValue(attrs, "form"); // if no regularized form exist it takes the form
137 if (attrValue != null) {
138 String[] forms = highlightQueryForms.split(" ");
139 for (int i=0; i<forms.length; i++) {
140 if (! matched) {
141 String form = forms[i];
142 if (form.endsWith("*")) { // TODO support middle wildcard queries: bla*bla bla?bla
143 form = form.replace("*", "");
144 matched = attrValue.startsWith(form);
145 } else {
146 matched = attrValue.equals(form);
147 }
148 }
149 }
150 }
151 if ((highlightElemName == null && matched && highlightHitModeOpenTags == 0) || (highlightElemName != null && highlightElemMode && matched && highlightHitModeOpenTags == 0)) {
152 highlightHitMode = true;
153 write("<hi type=\"hit\">");
154 }
155 }
156 if (highlightHitMode) {
157 highlightHitModeOpenTags++;
158 }
159 write("<" + name);
160 if (xmlnsString != null && ! xmlnsString.isEmpty())
161 write(" " + xmlnsString);
162 if (attrString != null && ! attrString.isEmpty())
163 write(" " + attrString);
164 write(">");
165 xmlnsString = "";
166 }
167
168 public void endElement(String uri, String localName, String name) throws SAXException {
169 write("</" + name + ">");
170 // end highlight element at position
171 if (highlightElemMode) {
172 if (highlightElemModeOpenTags == 1) {
173 highlightElemMode = false;
174 write("</hi>");
175 }
176 highlightElemModeOpenTags--;
177 }
178 // end highlight query
179 if (highlightHitMode) {
180 if (highlightHitModeOpenTags == 1) {
181 highlightHitMode = false;
182 write("</hi>");
183 }
184 highlightHitModeOpenTags--;
185 }
186 }
187
188 private String toString(ArrayList<String> queryForms) {
189 String queryFormsStr = "";
190 for (int i=0; i<queryForms.size(); i++) {
191 String form = queryForms.get(i);
192 queryFormsStr = queryFormsStr + form + " ";
193 }
194 if (queryForms == null || queryForms.size() == 0)
195 return null;
196 else
197 return queryFormsStr.substring(0, queryFormsStr.length() -1);
198 }
199
200 private void write(String outStr) throws SAXException {
201 result.append(outStr);
202 }
203
204 private String getAttrValue(Attributes attrs, String attrQName) {
205 String retValue = null;
206 int attrSize = attrs.getLength();
207 for (int i=0; i<attrSize; i++) {
208 String attrQNameTmp = attrs.getQName(i);
209 String attrValue = attrs.getValue(i);
210 if (attrQNameTmp.equals(attrQName))
211 return attrValue;
212 }
213 return retValue;
214 }
215 }