comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.cms.test;
2
3 import java.io.BufferedInputStream;
4 import java.io.File;
5 import java.io.FileReader;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.PrintWriter;
9 import java.io.StringReader;
10 import java.io.StringWriter;
11 import java.net.URL;
12 import java.util.ArrayList;
13 import java.util.Date;
14 import java.util.Hashtable;
15
16 import javax.xml.transform.stream.StreamSource;
17
18 import net.sf.saxon.s9api.Processor;
19 import net.sf.saxon.s9api.QName;
20 import net.sf.saxon.s9api.Serializer;
21 import net.sf.saxon.s9api.XdmNode;
22 import net.sf.saxon.s9api.XsltCompiler;
23 import net.sf.saxon.s9api.XsltExecutable;
24
25 import org.apache.commons.io.FileUtils;
26 import org.apache.commons.io.IOUtils;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Fieldable;
29 import org.apache.lucene.index.Term;
30 import org.xml.sax.InputSource;
31 import org.xml.sax.SAXException;
32 import org.xml.sax.XMLReader;
33
34 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
35
36 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
37 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler;
38 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
39 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
40 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
41 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
42 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
43 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
44 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
45 import de.mpg.mpiwg.berlin.mpdl.util.Util;
46 import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
47 import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
48 import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
49 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
50 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
51 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
52 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
53 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
54 import de.mpg.mpiwg.berlin.mpdl.cms.document.Token;
55
56 public class TestLocal {
57 private IndexHandler indexer;
58
59 public static void main(String[] args) throws ApplicationException {
60 try {
61 TestLocal test = new TestLocal();
62 test.init();
63 // test.importAllDocuments();
64 // test.createAllPdfInDirectory();
65 // test.testTransform();
66 // test.testXml();
67 // test.generateToc();
68 test.testCalls();
69 // test.generatePdf();
70 // test.xquery();
71 // test.createToc();
72 // test.testScheduler();
73 // test.getDocInfo();
74 // test.testChars();
75 test.end();
76 } catch (Exception e) {
77 e.printStackTrace();
78 }
79 }
80
81 private void init() throws ApplicationException {
82 indexer = IndexHandler.getInstance();
83 }
84
85 private void end() throws ApplicationException {
86 indexer.end();
87 }
88
89 private void testXml() throws ApplicationException {
90 try {
91 DocumentHandler docHandler = new DocumentHandler();
92 String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml");
93 String pageXmlFileName = docDirName + "/pages" + "/page-" + "444" + ".xml";
94 File pageXmlFile = new File(pageXmlFileName);
95 String pageXmlStr = null;
96 if (pageXmlFile.exists())
97 pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8");
98 System.out.println(pageXmlStr);
99 String tokStr = tokenizeWithLemmas(pageXmlStr, "lat");
100 System.out.println(tokStr);
101 tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr;
102 byte[] blablabla = tokStr.getBytes("utf-8");
103 String blablu = new String(blablabla, "utf-8");
104 String bla = enrichWordsOrigRegNorm(blablu);
105 System.out.println(bla);
106
107 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
108 URL url = new URL("file:/var/yp/Test_1789.xml");
109 XdmNode docNode = xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown
110
111 File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml");
112 FileReader docFileReader = new FileReader(srcFile);
113 XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader);
114 docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml");
115 docXmlTokenizer.tokenize();
116 ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s");
117 String blabla = "";
118 } catch (Exception e) {
119 e.printStackTrace();
120 }
121 }
122
123 private void importAllDocuments() throws ApplicationException {
124 DocumentHandler docHandler = new DocumentHandler();
125 CmsDocOperation docOperation = new CmsDocOperation("importDirectory", "file:/Users/jwillenborg/test/documents", null, null);
126 docOperation.setCollectionNames("echo");
127 docHandler.doOperation(docOperation);
128 }
129
130 private void createAllPdfInDirectory() throws ApplicationException {
131 DocumentHandler docHandler = new DocumentHandler();
132 CmsDocOperation docOperation = new CmsDocOperation("createAllPdfInDirectory", "file:/Users/jwillenborg/test/documents", null, null);
133 docOperation.setCollectionNames("echo");
134 docHandler.doOperation(docOperation);
135 }
136
137 private void generatePdf() throws ApplicationException {
138 long begin = new Date().getTime();
139 DocumentHandler docHandler = new DocumentHandler();
140 // String docId = "/echo/la/Benedetti_1585_163127KK.xml";
141 String docId = "/diverse/de/Einst_Ueber_de_1907_02.xml";
142 // String docId = "/archimedes/it/caver_metod_020_it_1891.xml";
143 CmsDocOperation docOperation = new CmsDocOperation("createPdf", null, null, docId);
144 docHandler.doOperation(docOperation);
145 long end = new Date().getTime();
146 System.out.println("Needed time: " + (end - begin));
147 }
148
149 private void testChars() throws ApplicationException {
150 String docId = "/test/benedetti/page-444.xml";
151 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docId;
152 DocumentHandler docHandler = new DocumentHandler();
153 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docId);
154 docHandler.doOperation(docOperation);
155 }
156
157 private void testCalls() throws ApplicationException {
158 Date before = new Date();
159 System.out.println("Indexing start: " + before.getTime());
160 String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml";
161 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdGoerz;
162 DocumentHandler docHandler = new DocumentHandler();
163 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz);
164 // docHandler.doOperation(docOperation);
165 String docIdSchulz = "/tei/de/Schulz_2009.xml";
166 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSchulz;
167 docHandler = new DocumentHandler();
168 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSchulz);
169 // docHandler.doOperation(docOperation);
170 String docIdBenedetti = "/echo/la/Benedetti_1585_163127KK.xml";
171 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdBenedetti;
172 docHandler = new DocumentHandler();
173 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti);
174 // docHandler.doOperation(docOperation);
175 String docIdSongYingxing = "/echo/zh/SongYingxing_1637.xml";
176 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSongYingxing;
177 docHandler = new DocumentHandler();
178 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSongYingxing);
179 // docHandler.doOperation(docOperation);
180 String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml";
181 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMonte;
182 docHandler = new DocumentHandler();
183 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte);
184 // docHandler.doOperation(docOperation);
185 String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml";
186 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstein;
187 docHandler = new DocumentHandler();
188 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein);
189 // docHandler.doOperation(docOperation);
190 String docIdEinsteinUeber = "/diverse/de/Einst_Ueber_de_1907_02.xml";
191 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinsteinUeber;
192 docHandler = new DocumentHandler();
193 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinsteinUeber);
194 // docHandler.doOperation(docOperation);
195 String docIdTest = "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml";
196 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdTest;
197 docHandler = new DocumentHandler();
198 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdTest);
199 // docHandler.doOperation(docOperation);
200 String docIdMega = "/test/mega/MEGA_A2_B013-00_ETX.xml";
201 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMega;
202 docHandler = new DocumentHandler();
203 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMega);
204 // docHandler.doOperation(docOperation);
205 String docIdDiverse = "/diverse/en/078_A_1916.xml";
206 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdDiverse;
207 docHandler = new DocumentHandler();
208 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdDiverse);
209 // docHandler.doOperation(docOperation);
210 String docIdEinstGrossmann = "/diverse/de/EinsteinGrossmann.xml";
211 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrossmann;
212 docHandler = new DocumentHandler();
213 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrossmann);
214 // docHandler.doOperation(docOperation);
215 String docIdEinstGrund = "/diverse/en/078_A_1916.xml";
216 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrund;
217 docHandler = new DocumentHandler();
218 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrund);
219 // docHandler.doOperation(docOperation);
220 String docIdVolta = "/archimedes/it/volta_nuoMemLetTerz_922_it_1795.xml";
221 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVolta;
222 docHandler = new DocumentHandler();
223 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVolta);
224 // docHandler.doOperation(docOperation);
225 String docIdVitruv = "/echo/it/Vitruvius_1747_Y1G1TRCW.xml";
226 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVitruv;
227 docHandler = new DocumentHandler();
228 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVitruv);
229 docHandler.doOperation(docOperation);
230 // indexer.deleteDocument(docIdGoerz);
231 // indexer.deleteDocument(docIdBenedetti);
232 MorphologyCache.getInstance().end();
233 LexHandler.getInstance().end();
234 }
235
236 private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException {
237 try {
238 GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler();
239 XMLReader xmlParser = new SAXParser();
240 xmlParser.setContentHandler(getFragmentsContentHandler);
241 InputSource inputSource = new InputSource(fileName);
242 xmlParser.parse(inputSource);
243 Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages();
244 return resultFragments;
245 } catch (SAXException e) {
246 throw new ApplicationException(e);
247 } catch (IOException e) {
248 throw new ApplicationException(e);
249 }
250 }
251
252 private void testTransform() throws ApplicationException {
253 Date begin = new Date();
254 XslResourceTransformer xslResourceTransformer = new XslResourceTransformer("pageXml.xsl");
255 xslResourceTransformer = new XslResourceTransformer("pageTei.xsl");
256 xslResourceTransformer = new XslResourceTransformer("pageArchimedes.xsl");
257 xslResourceTransformer = new XslResourceTransformer("pageXhtml.xsl");
258 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
259 Date end = new Date();
260 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
261 begin = new Date();
262 String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml";
263 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
264 String result = xslResourceTransformer.transform(docFilePath);
265 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
266 result = xslResourceTransformer.transform(docFilePath);
267 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
268 result = xslResourceTransformer.transform(docFilePath);
269 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
270 result = xslResourceTransformer.transform(docFilePath);
271 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
272 result = xslResourceTransformer.transform(docFilePath);
273 end = new Date();
274 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
275 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
276 begin = new Date();
277 result = xslResourceTransformer.transform(docFilePath);
278 result = xslResourceTransformer.transform(docFilePath);
279 result = xslResourceTransformer.transform(docFilePath);
280 result = xslResourceTransformer.transform(docFilePath);
281 result = xslResourceTransformer.transform(docFilePath);
282 end = new Date();
283 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
284 try {
285 Processor processor = new Processor(false);
286 XsltCompiler xsltCompiler = processor.newXsltCompiler();
287 URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl");
288 StreamSource xslStreamSource = new StreamSource(xslUrl.openStream());
289 XsltExecutable xsltExecutable = xsltCompiler.compile(xslStreamSource);
290 net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load();
291 Serializer serializer = new Serializer();
292 serializer.setOutputWriter(new StringWriter());
293 begin = new Date();
294 for (int i=0; i<=5; i++) {
295 StreamSource xmlDoc = new StreamSource(docFilePath);
296 xsltTransformer.setSource(xmlDoc); // needs some time for bigger documents
297 xsltTransformer.setDestination(serializer);
298 xsltTransformer.transform(); // needs some time for bigger documents
299 result = serializer.getOutputDestination().toString();
300 }
301 end = new Date();
302 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
303 } catch (Exception e) {
304
305 }
306 }
307
308 private String tokenizeXmlFragment() throws ApplicationException {
309 String result = null;
310 try {
311 String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8");
312 String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13";
313 URL srcUrl = new URL(srcUrlStr);
314 InputStream inputStream = srcUrl.openStream();
315 BufferedInputStream in = new BufferedInputStream(inputStream);
316 xmlFragment = IOUtils.toString(in, "utf-8");
317 in.close();
318
319 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
320 xmlTokenizer.setLanguage("lat");
321 String[] stopElements = {"var"};
322 // xmlTokenizer.setOutputFormat("string");
323 String[] outputOptions = {"withLemmas"};
324 xmlTokenizer.setOutputOptions(outputOptions);
325 xmlTokenizer.setStopElements(stopElements);
326 xmlTokenizer.tokenize();
327 result = xmlTokenizer.getXmlResult();
328 System.out.println(result);
329 } catch (Exception e) {
330 throw new ApplicationException(e);
331 }
332 return result;
333 }
334
335 private String normalizeWords(String xmlStr) throws ApplicationException {
336 try {
337 WordContentHandler wordContentHandler = new WordContentHandler();
338 XMLReader xmlParser = new SAXParser();
339 xmlParser.setContentHandler(wordContentHandler);
340 StringReader strReader = new StringReader(xmlStr);
341 InputSource inputSource = new InputSource(strReader);
342 xmlParser.parse(inputSource);
343 String result = wordContentHandler.getResult();
344 return result;
345 } catch (SAXException e) {
346 throw new ApplicationException(e);
347 } catch (IOException e) {
348 throw new ApplicationException(e);
349 }
350 }
351
352 private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
353 StringReader strReader = new StringReader(xmlStr);
354 XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader);
355 xmlTokenizer.setLanguage(language);
356 String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance)
357 xmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
358 xmlTokenizer.tokenize();
359 String retStr = xmlTokenizer.getXmlResult();
360 return retStr;
361 }
362
363 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
364 try {
365 WordContentHandler wordContentHandler = new WordContentHandler();
366 XMLReader xmlParser = new SAXParser();
367 xmlParser.setContentHandler(wordContentHandler);
368 StringReader strReader = new StringReader(xmlStr);
369 InputSource inputSource = new InputSource(strReader);
370 xmlParser.parse(inputSource);
371 String result = wordContentHandler.getResult();
372 return result;
373 } catch (SAXException e) {
374 throw new ApplicationException(e);
375 } catch (IOException e) {
376 throw new ApplicationException(e);
377 }
378 }
379
380 private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
381 String result = null;
382 try {
383 xmlStr = normalizeWords(xmlStr);
384 HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
385 highlightContentHandler.setFirstPageBreakReachedMode(true);
386 XMLReader xmlParser = new SAXParser();
387 xmlParser.setContentHandler(highlightContentHandler);
388 StringReader stringReader = new StringReader(xmlStr);
389 InputSource inputSource = new InputSource(stringReader);
390 xmlParser.parse(inputSource);
391 result = highlightContentHandler.getResult().toString();
392 } catch (SAXException e) {
393 throw new ApplicationException(e);
394 } catch (IOException e) {
395 throw new ApplicationException(e);
396 }
397 return result;
398 }
399
400 private void testScheduler() throws ApplicationException {
401 CmsDocOperation docOperation = new CmsDocOperation("update", "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=/echo/zh/SongYingxing_1637.xml", null, "/echo/zh/SongYingxing_1637.xml");
402 String[] elemNames = {"s", "head"};
403 docOperation.setElementNames(elemNames);
404 CmsChainScheduler scheduler = CmsChainScheduler.getInstance();
405 docOperation = scheduler.doOperation(docOperation);
406 String bla = "";
407 }
408
409 private void xquery() throws ApplicationException {
410 try {
411 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
412 URL srcUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml");
413 String getTocEntries = "let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']";
414 String getFigures =
415 "let $allFigures := //*:figure " +
416 "let $figures := " +
417 " for $figure at $pos in $allFigures "+
418 " let $caption := string-join($figure/*:caption/text(), ' ') " +
419 " let $description := string-join($figure/*:description/text(), ' ') " +
420 " let $variables := string-join($figure/*:variables/text(), ' ') " +
421 " let $retFigure := " +
422 " element {'figure'}" +
423 " { attribute {'number'} {$pos}, " +
424 " element {'caption'} {$caption}, " +
425 " element {'description'} {$description}, " +
426 " element {'variables'} {$variables} }" +
427 " return " +
428 " $retFigure " +
429 "return $figures";
430 String result = xQueryEvaluator.evaluateAsString(srcUrl, getFigures);
431 String bla = result;
432 } catch (Exception e) {
433 throw new ApplicationException(e);
434 }
435 }
436
437 private void createToc() throws ApplicationException {
438 String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637";
439 XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
440 File tocFile = new File(docDirName + "/toc.xml");
441 String docDestFileName = docDirName + "/SongYingxing_1637.xml";
442 String tocResult = tocTransformer.transform(docDestFileName);
443 String bla = "";
444 }
445
446 private void getDocInfo() throws ApplicationException {
447 IndexHandler indexHandler = IndexHandler.getInstance();
448 MetadataRecord mdRecord = indexHandler.getDocMetadata("/echo/la/Benedetti_1585_163127KK.xml");
449 }
450
451 }