Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.cms.test; | |
2 | |
3 import java.io.BufferedInputStream; | |
4 import java.io.File; | |
5 import java.io.FileReader; | |
6 import java.io.IOException; | |
7 import java.io.InputStream; | |
8 import java.io.PrintWriter; | |
9 import java.io.StringReader; | |
10 import java.io.StringWriter; | |
11 import java.net.URL; | |
12 import java.util.ArrayList; | |
13 import java.util.Date; | |
14 import java.util.Hashtable; | |
15 | |
16 import javax.xml.transform.stream.StreamSource; | |
17 | |
18 import net.sf.saxon.s9api.Processor; | |
19 import net.sf.saxon.s9api.QName; | |
20 import net.sf.saxon.s9api.Serializer; | |
21 import net.sf.saxon.s9api.XdmNode; | |
22 import net.sf.saxon.s9api.XsltCompiler; | |
23 import net.sf.saxon.s9api.XsltExecutable; | |
24 | |
25 import org.apache.commons.io.FileUtils; | |
26 import org.apache.commons.io.IOUtils; | |
27 import org.apache.lucene.document.Document; | |
28 import org.apache.lucene.document.Fieldable; | |
29 import org.apache.lucene.index.Term; | |
30 import org.xml.sax.InputSource; | |
31 import org.xml.sax.SAXException; | |
32 import org.xml.sax.XMLReader; | |
33 | |
34 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
35 | |
36 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; | |
37 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler; | |
38 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; | |
39 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
40 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | |
41 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; | |
42 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; | |
43 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; | |
44 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler; | |
45 import de.mpg.mpiwg.berlin.mpdl.util.Util; | |
46 import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; | |
47 import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler; | |
48 import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; | |
49 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; | |
50 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; | |
51 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler; | |
52 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; | |
53 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; | |
54 import de.mpg.mpiwg.berlin.mpdl.cms.document.Token; | |
55 | |
56 public class TestLocal { | |
57 private IndexHandler indexer; | |
58 | |
59 public static void main(String[] args) throws ApplicationException { | |
60 try { | |
61 TestLocal test = new TestLocal(); | |
62 test.init(); | |
63 // test.importAllDocuments(); | |
64 // test.testTransform(); | |
65 // test.testXml(); | |
66 // test.generateToc(); | |
67 test.testCalls(); | |
68 // test.generatePdf(); | |
69 // test.xquery(); | |
70 // test.createToc(); | |
71 // test.testScheduler(); | |
72 // test.getDocInfo(); | |
73 // test.testChars(); | |
74 test.end(); | |
75 } catch (Exception e) { | |
76 e.printStackTrace(); | |
77 } | |
78 } | |
79 | |
80 private void init() throws ApplicationException { | |
81 indexer = IndexHandler.getInstance(); | |
82 } | |
83 | |
84 private void end() throws ApplicationException { | |
85 indexer.end(); | |
86 } | |
87 | |
88 private void testXml() throws ApplicationException { | |
89 try { | |
90 DocumentHandler docHandler = new DocumentHandler(); | |
91 String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml"); | |
92 String pageXmlFileName = docDirName + "/pages" + "/page-" + "444" + ".xml"; | |
93 File pageXmlFile = new File(pageXmlFileName); | |
94 String pageXmlStr = null; | |
95 if (pageXmlFile.exists()) | |
96 pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8"); | |
97 System.out.println(pageXmlStr); | |
98 String tokStr = tokenizeWithLemmas(pageXmlStr, "lat"); | |
99 System.out.println(tokStr); | |
100 tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr; | |
101 byte[] blablabla = tokStr.getBytes("utf-8"); | |
102 String blablu = new String(blablabla, "utf-8"); | |
103 String bla = enrichWordsOrigRegNorm(blablu); | |
104 System.out.println(bla); | |
105 | |
106 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); | |
107 URL url = new URL("file:/var/yp/Test_1789.xml"); | |
108 XdmNode docNode = xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown | |
109 | |
110 File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml"); | |
111 FileReader docFileReader = new FileReader(srcFile); | |
112 XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader); | |
113 docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml"); | |
114 docXmlTokenizer.tokenize(); | |
115 ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s"); | |
116 String blabla = ""; | |
117 } catch (Exception e) { | |
118 e.printStackTrace(); | |
119 } | |
120 } | |
121 | |
122 private void importAllDocuments() throws ApplicationException { | |
123 DocumentHandler docHandler = new DocumentHandler(); | |
124 CmsDocOperation docOperation = new CmsDocOperation("importDirectory", "file:/Users/jwillenborg/test/documents", null, null); | |
125 docOperation.setCollectionNames("echo"); | |
126 docHandler.doOperation(docOperation); | |
127 } | |
128 | |
129 private void generatePdf() throws ApplicationException { | |
130 long begin = new Date().getTime(); | |
131 DocumentHandler docHandler = new DocumentHandler(); | |
132 CmsDocOperation docOperation = new CmsDocOperation("createPdf", null, null, "/echo/la/Benedetti_1585_163127KK.xml"); | |
133 docHandler.doOperation(docOperation); | |
134 docOperation = new CmsDocOperation("createPdf", null, null, "/tei/de/dt-ptolemaeus-tei-merge2.xml"); | |
135 // docHandler.doOperation(docOperation); | |
136 long end = new Date().getTime(); | |
137 System.out.println("Needed time: " + (end - begin)); | |
138 } | |
139 | |
140 private void testChars() throws ApplicationException { | |
141 String docId = "/test/benedetti/page-444.xml"; | |
142 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docId; | |
143 DocumentHandler docHandler = new DocumentHandler(); | |
144 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docId); | |
145 docHandler.doOperation(docOperation); | |
146 } | |
147 | |
148 private void testCalls() throws ApplicationException { | |
149 Date before = new Date(); | |
150 System.out.println("Indexing start: " + before.getTime()); | |
151 String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml"; | |
152 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdGoerz; | |
153 DocumentHandler docHandler = new DocumentHandler(); | |
154 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz); | |
155 // docHandler.doOperation(docOperation); | |
156 String docIdSchulz = "/tei/de/Schulz_2009.xml"; | |
157 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSchulz; | |
158 docHandler = new DocumentHandler(); | |
159 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSchulz); | |
160 // docHandler.doOperation(docOperation); | |
161 String docIdBenedetti = "/echo/la/Benedetti_1585_163127KK.xml"; | |
162 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdBenedetti; | |
163 docHandler = new DocumentHandler(); | |
164 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti); | |
165 // docHandler.doOperation(docOperation); | |
166 String docIdSongYingxing = "/echo/zh/SongYingxing_1637.xml"; | |
167 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSongYingxing; | |
168 docHandler = new DocumentHandler(); | |
169 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSongYingxing); | |
170 // docHandler.doOperation(docOperation); | |
171 String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml"; | |
172 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMonte; | |
173 docHandler = new DocumentHandler(); | |
174 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte); | |
175 // docHandler.doOperation(docOperation); | |
176 String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml"; | |
177 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstein; | |
178 docHandler = new DocumentHandler(); | |
179 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein); | |
180 // docHandler.doOperation(docOperation); | |
181 String docIdEinsteinUeber = "/diverse/de/Einst_Ueber_de_1907_02.xml"; | |
182 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinsteinUeber; | |
183 docHandler = new DocumentHandler(); | |
184 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinsteinUeber); | |
185 // docHandler.doOperation(docOperation); | |
186 String docIdTest = "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml"; | |
187 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdTest; | |
188 docHandler = new DocumentHandler(); | |
189 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdTest); | |
190 // docHandler.doOperation(docOperation); | |
191 // indexer.deleteDocument(docIdGoerz); | |
192 // indexer.deleteDocument(docIdBenedetti); | |
193 MorphologyCache.getInstance().end(); | |
194 LexHandler.getInstance().end(); | |
195 } | |
196 | |
197 private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException { | |
198 try { | |
199 GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(); | |
200 XMLReader xmlParser = new SAXParser(); | |
201 xmlParser.setContentHandler(getFragmentsContentHandler); | |
202 InputSource inputSource = new InputSource(fileName); | |
203 xmlParser.parse(inputSource); | |
204 Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages(); | |
205 return resultFragments; | |
206 } catch (SAXException e) { | |
207 throw new ApplicationException(e); | |
208 } catch (IOException e) { | |
209 throw new ApplicationException(e); | |
210 } | |
211 } | |
212 | |
213 private void testTransform() throws ApplicationException { | |
214 Date begin = new Date(); | |
215 XslResourceTransformer xslResourceTransformer = new XslResourceTransformer("pageXml.xsl"); | |
216 xslResourceTransformer = new XslResourceTransformer("pageTei.xsl"); | |
217 xslResourceTransformer = new XslResourceTransformer("pageArchimedes.xsl"); | |
218 xslResourceTransformer = new XslResourceTransformer("pageXhtml.xsl"); | |
219 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
220 Date end = new Date(); | |
221 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); | |
222 begin = new Date(); | |
223 String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml"; | |
224 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
225 String result = xslResourceTransformer.transform(docFilePath); | |
226 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
227 result = xslResourceTransformer.transform(docFilePath); | |
228 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
229 result = xslResourceTransformer.transform(docFilePath); | |
230 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
231 result = xslResourceTransformer.transform(docFilePath); | |
232 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
233 result = xslResourceTransformer.transform(docFilePath); | |
234 end = new Date(); | |
235 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); | |
236 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); | |
237 begin = new Date(); | |
238 result = xslResourceTransformer.transform(docFilePath); | |
239 result = xslResourceTransformer.transform(docFilePath); | |
240 result = xslResourceTransformer.transform(docFilePath); | |
241 result = xslResourceTransformer.transform(docFilePath); | |
242 result = xslResourceTransformer.transform(docFilePath); | |
243 end = new Date(); | |
244 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); | |
245 try { | |
246 Processor processor = new Processor(false); | |
247 XsltCompiler xsltCompiler = processor.newXsltCompiler(); | |
248 URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl"); | |
249 StreamSource xslStreamSource = new StreamSource(xslUrl.openStream()); | |
250 XsltExecutable xsltExecutable = xsltCompiler.compile(xslStreamSource); | |
251 net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load(); | |
252 Serializer serializer = new Serializer(); | |
253 serializer.setOutputWriter(new StringWriter()); | |
254 begin = new Date(); | |
255 for (int i=0; i<=5; i++) { | |
256 StreamSource xmlDoc = new StreamSource(docFilePath); | |
257 xsltTransformer.setSource(xmlDoc); // needs some time for bigger documents | |
258 xsltTransformer.setDestination(serializer); | |
259 xsltTransformer.transform(); // needs some time for bigger documents | |
260 result = serializer.getOutputDestination().toString(); | |
261 } | |
262 end = new Date(); | |
263 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); | |
264 } catch (Exception e) { | |
265 | |
266 } | |
267 } | |
268 | |
269 private String tokenizeXmlFragment() throws ApplicationException { | |
270 String result = null; | |
271 try { | |
272 String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); | |
273 String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; | |
274 URL srcUrl = new URL(srcUrlStr); | |
275 InputStream inputStream = srcUrl.openStream(); | |
276 BufferedInputStream in = new BufferedInputStream(inputStream); | |
277 xmlFragment = IOUtils.toString(in, "utf-8"); | |
278 in.close(); | |
279 | |
280 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); | |
281 xmlTokenizer.setLanguage("lat"); | |
282 String[] stopElements = {"var"}; | |
283 // xmlTokenizer.setOutputFormat("string"); | |
284 String[] outputOptions = {"withLemmas"}; | |
285 xmlTokenizer.setOutputOptions(outputOptions); | |
286 xmlTokenizer.setStopElements(stopElements); | |
287 xmlTokenizer.tokenize(); | |
288 result = xmlTokenizer.getXmlResult(); | |
289 System.out.println(result); | |
290 } catch (Exception e) { | |
291 throw new ApplicationException(e); | |
292 } | |
293 return result; | |
294 } | |
295 | |
296 private String normalizeWords(String xmlStr) throws ApplicationException { | |
297 try { | |
298 WordContentHandler wordContentHandler = new WordContentHandler(); | |
299 XMLReader xmlParser = new SAXParser(); | |
300 xmlParser.setContentHandler(wordContentHandler); | |
301 StringReader strReader = new StringReader(xmlStr); | |
302 InputSource inputSource = new InputSource(strReader); | |
303 xmlParser.parse(inputSource); | |
304 String result = wordContentHandler.getResult(); | |
305 return result; | |
306 } catch (SAXException e) { | |
307 throw new ApplicationException(e); | |
308 } catch (IOException e) { | |
309 throw new ApplicationException(e); | |
310 } | |
311 } | |
312 | |
313 private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException { | |
314 StringReader strReader = new StringReader(xmlStr); | |
315 XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader); | |
316 xmlTokenizer.setLanguage(language); | |
317 String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance) | |
318 xmlTokenizer.setOutputOptions(outputOptionsWithLemmas); | |
319 xmlTokenizer.tokenize(); | |
320 String retStr = xmlTokenizer.getXmlResult(); | |
321 return retStr; | |
322 } | |
323 | |
324 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { | |
325 try { | |
326 WordContentHandler wordContentHandler = new WordContentHandler(); | |
327 XMLReader xmlParser = new SAXParser(); | |
328 xmlParser.setContentHandler(wordContentHandler); | |
329 StringReader strReader = new StringReader(xmlStr); | |
330 InputSource inputSource = new InputSource(strReader); | |
331 xmlParser.parse(inputSource); | |
332 String result = wordContentHandler.getResult(); | |
333 return result; | |
334 } catch (SAXException e) { | |
335 throw new ApplicationException(e); | |
336 } catch (IOException e) { | |
337 throw new ApplicationException(e); | |
338 } | |
339 } | |
340 | |
341 private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { | |
342 String result = null; | |
343 try { | |
344 xmlStr = normalizeWords(xmlStr); | |
345 HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language); | |
346 highlightContentHandler.setFirstPageBreakReachedMode(true); | |
347 XMLReader xmlParser = new SAXParser(); | |
348 xmlParser.setContentHandler(highlightContentHandler); | |
349 StringReader stringReader = new StringReader(xmlStr); | |
350 InputSource inputSource = new InputSource(stringReader); | |
351 xmlParser.parse(inputSource); | |
352 result = highlightContentHandler.getResult().toString(); | |
353 } catch (SAXException e) { | |
354 throw new ApplicationException(e); | |
355 } catch (IOException e) { | |
356 throw new ApplicationException(e); | |
357 } | |
358 return result; | |
359 } | |
360 | |
361 private void testScheduler() throws ApplicationException { | |
362 CmsDocOperation docOperation = new CmsDocOperation("update", "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=/echo/zh/SongYingxing_1637.xml", null, "/echo/zh/SongYingxing_1637.xml"); | |
363 String[] elemNames = {"s", "head"}; | |
364 docOperation.setElementNames(elemNames); | |
365 CmsChainScheduler scheduler = CmsChainScheduler.getInstance(); | |
366 docOperation = scheduler.doOperation(docOperation); | |
367 String bla = ""; | |
368 } | |
369 | |
370 private void xquery() throws ApplicationException { | |
371 try { | |
372 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); | |
373 URL srcUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml"); | |
374 String getTocEntries = "let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']"; | |
375 String getFigures = | |
376 "let $allFigures := //*:figure " + | |
377 "let $figures := " + | |
378 " for $figure at $pos in $allFigures "+ | |
379 " let $caption := string-join($figure/*:caption/text(), ' ') " + | |
380 " let $description := string-join($figure/*:description/text(), ' ') " + | |
381 " let $variables := string-join($figure/*:variables/text(), ' ') " + | |
382 " let $retFigure := " + | |
383 " element {'figure'}" + | |
384 " { attribute {'number'} {$pos}, " + | |
385 " element {'caption'} {$caption}, " + | |
386 " element {'description'} {$description}, " + | |
387 " element {'variables'} {$variables} }" + | |
388 " return " + | |
389 " $retFigure " + | |
390 "return $figures"; | |
391 String result = xQueryEvaluator.evaluateAsString(srcUrl, getFigures); | |
392 String bla = result; | |
393 } catch (Exception e) { | |
394 throw new ApplicationException(e); | |
395 } | |
396 } | |
397 | |
398 private void createToc() throws ApplicationException { | |
399 String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637"; | |
400 XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl"); | |
401 File tocFile = new File(docDirName + "/toc.xml"); | |
402 String docDestFileName = docDirName + "/SongYingxing_1637.xml"; | |
403 String tocResult = tocTransformer.transform(docDestFileName); | |
404 String bla = ""; | |
405 } | |
406 | |
407 private void getDocInfo() throws ApplicationException { | |
408 IndexHandler indexHandler = IndexHandler.getInstance(); | |
409 MetadataRecord mdRecord = indexHandler.getDocMetadata("/echo/la/Benedetti_1585_163127KK.xml"); | |
410 } | |
411 | |
412 } |