comparison software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.cms.test;
2
3 import java.io.BufferedInputStream;
4 import java.io.File;
5 import java.io.FileReader;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.PrintWriter;
9 import java.io.StringReader;
10 import java.io.StringWriter;
11 import java.net.URL;
12 import java.util.ArrayList;
13 import java.util.Date;
14 import java.util.Hashtable;
15
16 import javax.xml.transform.stream.StreamSource;
17
18 import net.sf.saxon.s9api.Processor;
19 import net.sf.saxon.s9api.QName;
20 import net.sf.saxon.s9api.Serializer;
21 import net.sf.saxon.s9api.XdmNode;
22 import net.sf.saxon.s9api.XsltCompiler;
23 import net.sf.saxon.s9api.XsltExecutable;
24
25 import org.apache.commons.io.FileUtils;
26 import org.apache.commons.io.IOUtils;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Fieldable;
29 import org.apache.lucene.index.Term;
30 import org.xml.sax.InputSource;
31 import org.xml.sax.SAXException;
32 import org.xml.sax.XMLReader;
33
34 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
35
36 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
37 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler;
38 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
39 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
40 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
41 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
42 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
43 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
44 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
45 import de.mpg.mpiwg.berlin.mpdl.util.Util;
46 import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
47 import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
48 import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
49 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
50 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
51 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
52 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
53 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
54 import de.mpg.mpiwg.berlin.mpdl.cms.document.Token;
55
56 public class TestLocal {
57 private IndexHandler indexer;
58
59 public static void main(String[] args) throws ApplicationException {
60 try {
61 TestLocal test = new TestLocal();
62 test.init();
63 // test.importAllDocuments();
64 // test.testTransform();
65 // test.testXml();
66 // test.generateToc();
67 test.testCalls();
68 // test.generatePdf();
69 // test.xquery();
70 // test.createToc();
71 // test.testScheduler();
72 // test.getDocInfo();
73 // test.testChars();
74 test.end();
75 } catch (Exception e) {
76 e.printStackTrace();
77 }
78 }
79
80 private void init() throws ApplicationException {
81 indexer = IndexHandler.getInstance();
82 }
83
84 private void end() throws ApplicationException {
85 indexer.end();
86 }
87
88 private void testXml() throws ApplicationException {
89 try {
90 DocumentHandler docHandler = new DocumentHandler();
91 String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml");
92 String pageXmlFileName = docDirName + "/pages" + "/page-" + "444" + ".xml";
93 File pageXmlFile = new File(pageXmlFileName);
94 String pageXmlStr = null;
95 if (pageXmlFile.exists())
96 pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8");
97 System.out.println(pageXmlStr);
98 String tokStr = tokenizeWithLemmas(pageXmlStr, "lat");
99 System.out.println(tokStr);
100 tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr;
101 byte[] blablabla = tokStr.getBytes("utf-8");
102 String blablu = new String(blablabla, "utf-8");
103 String bla = enrichWordsOrigRegNorm(blablu);
104 System.out.println(bla);
105
106 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
107 URL url = new URL("file:/var/yp/Test_1789.xml");
108 XdmNode docNode = xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown
109
110 File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml");
111 FileReader docFileReader = new FileReader(srcFile);
112 XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader);
113 docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml");
114 docXmlTokenizer.tokenize();
115 ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s");
116 String blabla = "";
117 } catch (Exception e) {
118 e.printStackTrace();
119 }
120 }
121
122 private void importAllDocuments() throws ApplicationException {
123 DocumentHandler docHandler = new DocumentHandler();
124 CmsDocOperation docOperation = new CmsDocOperation("importDirectory", "file:/Users/jwillenborg/test/documents", null, null);
125 docOperation.setCollectionNames("echo");
126 docHandler.doOperation(docOperation);
127 }
128
129 private void generatePdf() throws ApplicationException {
130 long begin = new Date().getTime();
131 DocumentHandler docHandler = new DocumentHandler();
132 CmsDocOperation docOperation = new CmsDocOperation("createPdf", null, null, "/echo/la/Benedetti_1585_163127KK.xml");
133 docHandler.doOperation(docOperation);
134 docOperation = new CmsDocOperation("createPdf", null, null, "/tei/de/dt-ptolemaeus-tei-merge2.xml");
135 // docHandler.doOperation(docOperation);
136 long end = new Date().getTime();
137 System.out.println("Needed time: " + (end - begin));
138 }
139
140 private void testChars() throws ApplicationException {
141 String docId = "/test/benedetti/page-444.xml";
142 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docId;
143 DocumentHandler docHandler = new DocumentHandler();
144 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docId);
145 docHandler.doOperation(docOperation);
146 }
147
148 private void testCalls() throws ApplicationException {
149 Date before = new Date();
150 System.out.println("Indexing start: " + before.getTime());
151 String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml";
152 String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdGoerz;
153 DocumentHandler docHandler = new DocumentHandler();
154 CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz);
155 // docHandler.doOperation(docOperation);
156 String docIdSchulz = "/tei/de/Schulz_2009.xml";
157 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSchulz;
158 docHandler = new DocumentHandler();
159 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSchulz);
160 // docHandler.doOperation(docOperation);
161 String docIdBenedetti = "/echo/la/Benedetti_1585_163127KK.xml";
162 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdBenedetti;
163 docHandler = new DocumentHandler();
164 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti);
165 // docHandler.doOperation(docOperation);
166 String docIdSongYingxing = "/echo/zh/SongYingxing_1637.xml";
167 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSongYingxing;
168 docHandler = new DocumentHandler();
169 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSongYingxing);
170 // docHandler.doOperation(docOperation);
171 String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml";
172 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMonte;
173 docHandler = new DocumentHandler();
174 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte);
175 // docHandler.doOperation(docOperation);
176 String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml";
177 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstein;
178 docHandler = new DocumentHandler();
179 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein);
180 // docHandler.doOperation(docOperation);
181 String docIdEinsteinUeber = "/diverse/de/Einst_Ueber_de_1907_02.xml";
182 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinsteinUeber;
183 docHandler = new DocumentHandler();
184 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinsteinUeber);
185 // docHandler.doOperation(docOperation);
186 String docIdTest = "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml";
187 docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdTest;
188 docHandler = new DocumentHandler();
189 docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdTest);
190 // docHandler.doOperation(docOperation);
191 // indexer.deleteDocument(docIdGoerz);
192 // indexer.deleteDocument(docIdBenedetti);
193 MorphologyCache.getInstance().end();
194 LexHandler.getInstance().end();
195 }
196
197 private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException {
198 try {
199 GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler();
200 XMLReader xmlParser = new SAXParser();
201 xmlParser.setContentHandler(getFragmentsContentHandler);
202 InputSource inputSource = new InputSource(fileName);
203 xmlParser.parse(inputSource);
204 Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages();
205 return resultFragments;
206 } catch (SAXException e) {
207 throw new ApplicationException(e);
208 } catch (IOException e) {
209 throw new ApplicationException(e);
210 }
211 }
212
213 private void testTransform() throws ApplicationException {
214 Date begin = new Date();
215 XslResourceTransformer xslResourceTransformer = new XslResourceTransformer("pageXml.xsl");
216 xslResourceTransformer = new XslResourceTransformer("pageTei.xsl");
217 xslResourceTransformer = new XslResourceTransformer("pageArchimedes.xsl");
218 xslResourceTransformer = new XslResourceTransformer("pageXhtml.xsl");
219 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
220 Date end = new Date();
221 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
222 begin = new Date();
223 String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml";
224 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
225 String result = xslResourceTransformer.transform(docFilePath);
226 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
227 result = xslResourceTransformer.transform(docFilePath);
228 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
229 result = xslResourceTransformer.transform(docFilePath);
230 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
231 result = xslResourceTransformer.transform(docFilePath);
232 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
233 result = xslResourceTransformer.transform(docFilePath);
234 end = new Date();
235 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
236 xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl");
237 begin = new Date();
238 result = xslResourceTransformer.transform(docFilePath);
239 result = xslResourceTransformer.transform(docFilePath);
240 result = xslResourceTransformer.transform(docFilePath);
241 result = xslResourceTransformer.transform(docFilePath);
242 result = xslResourceTransformer.transform(docFilePath);
243 end = new Date();
244 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
245 try {
246 Processor processor = new Processor(false);
247 XsltCompiler xsltCompiler = processor.newXsltCompiler();
248 URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl");
249 StreamSource xslStreamSource = new StreamSource(xslUrl.openStream());
250 XsltExecutable xsltExecutable = xsltCompiler.compile(xslStreamSource);
251 net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load();
252 Serializer serializer = new Serializer();
253 serializer.setOutputWriter(new StringWriter());
254 begin = new Date();
255 for (int i=0; i<=5; i++) {
256 StreamSource xmlDoc = new StreamSource(docFilePath);
257 xsltTransformer.setSource(xmlDoc); // needs some time for bigger documents
258 xsltTransformer.setDestination(serializer);
259 xsltTransformer.transform(); // needs some time for bigger documents
260 result = serializer.getOutputDestination().toString();
261 }
262 end = new Date();
263 System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms");
264 } catch (Exception e) {
265
266 }
267 }
268
269 private String tokenizeXmlFragment() throws ApplicationException {
270 String result = null;
271 try {
272 String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8");
273 String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13";
274 URL srcUrl = new URL(srcUrlStr);
275 InputStream inputStream = srcUrl.openStream();
276 BufferedInputStream in = new BufferedInputStream(inputStream);
277 xmlFragment = IOUtils.toString(in, "utf-8");
278 in.close();
279
280 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
281 xmlTokenizer.setLanguage("lat");
282 String[] stopElements = {"var"};
283 // xmlTokenizer.setOutputFormat("string");
284 String[] outputOptions = {"withLemmas"};
285 xmlTokenizer.setOutputOptions(outputOptions);
286 xmlTokenizer.setStopElements(stopElements);
287 xmlTokenizer.tokenize();
288 result = xmlTokenizer.getXmlResult();
289 System.out.println(result);
290 } catch (Exception e) {
291 throw new ApplicationException(e);
292 }
293 return result;
294 }
295
296 private String normalizeWords(String xmlStr) throws ApplicationException {
297 try {
298 WordContentHandler wordContentHandler = new WordContentHandler();
299 XMLReader xmlParser = new SAXParser();
300 xmlParser.setContentHandler(wordContentHandler);
301 StringReader strReader = new StringReader(xmlStr);
302 InputSource inputSource = new InputSource(strReader);
303 xmlParser.parse(inputSource);
304 String result = wordContentHandler.getResult();
305 return result;
306 } catch (SAXException e) {
307 throw new ApplicationException(e);
308 } catch (IOException e) {
309 throw new ApplicationException(e);
310 }
311 }
312
313 private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
314 StringReader strReader = new StringReader(xmlStr);
315 XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader);
316 xmlTokenizer.setLanguage(language);
317 String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance)
318 xmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
319 xmlTokenizer.tokenize();
320 String retStr = xmlTokenizer.getXmlResult();
321 return retStr;
322 }
323
324 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
325 try {
326 WordContentHandler wordContentHandler = new WordContentHandler();
327 XMLReader xmlParser = new SAXParser();
328 xmlParser.setContentHandler(wordContentHandler);
329 StringReader strReader = new StringReader(xmlStr);
330 InputSource inputSource = new InputSource(strReader);
331 xmlParser.parse(inputSource);
332 String result = wordContentHandler.getResult();
333 return result;
334 } catch (SAXException e) {
335 throw new ApplicationException(e);
336 } catch (IOException e) {
337 throw new ApplicationException(e);
338 }
339 }
340
341 private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
342 String result = null;
343 try {
344 xmlStr = normalizeWords(xmlStr);
345 HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
346 highlightContentHandler.setFirstPageBreakReachedMode(true);
347 XMLReader xmlParser = new SAXParser();
348 xmlParser.setContentHandler(highlightContentHandler);
349 StringReader stringReader = new StringReader(xmlStr);
350 InputSource inputSource = new InputSource(stringReader);
351 xmlParser.parse(inputSource);
352 result = highlightContentHandler.getResult().toString();
353 } catch (SAXException e) {
354 throw new ApplicationException(e);
355 } catch (IOException e) {
356 throw new ApplicationException(e);
357 }
358 return result;
359 }
360
361 private void testScheduler() throws ApplicationException {
362 CmsDocOperation docOperation = new CmsDocOperation("update", "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=/echo/zh/SongYingxing_1637.xml", null, "/echo/zh/SongYingxing_1637.xml");
363 String[] elemNames = {"s", "head"};
364 docOperation.setElementNames(elemNames);
365 CmsChainScheduler scheduler = CmsChainScheduler.getInstance();
366 docOperation = scheduler.doOperation(docOperation);
367 String bla = "";
368 }
369
370 private void xquery() throws ApplicationException {
371 try {
372 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
373 URL srcUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml");
374 String getTocEntries = "let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']";
375 String getFigures =
376 "let $allFigures := //*:figure " +
377 "let $figures := " +
378 " for $figure at $pos in $allFigures "+
379 " let $caption := string-join($figure/*:caption/text(), ' ') " +
380 " let $description := string-join($figure/*:description/text(), ' ') " +
381 " let $variables := string-join($figure/*:variables/text(), ' ') " +
382 " let $retFigure := " +
383 " element {'figure'}" +
384 " { attribute {'number'} {$pos}, " +
385 " element {'caption'} {$caption}, " +
386 " element {'description'} {$description}, " +
387 " element {'variables'} {$variables} }" +
388 " return " +
389 " $retFigure " +
390 "return $figures";
391 String result = xQueryEvaluator.evaluateAsString(srcUrl, getFigures);
392 String bla = result;
393 } catch (Exception e) {
394 throw new ApplicationException(e);
395 }
396 }
397
398 private void createToc() throws ApplicationException {
399 String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637";
400 XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
401 File tocFile = new File(docDirName + "/toc.xml");
402 String docDestFileName = docDirName + "/SongYingxing_1637.xml";
403 String tocResult = tocTransformer.transform(docDestFileName);
404 String bla = "";
405 }
406
407 private void getDocInfo() throws ApplicationException {
408 IndexHandler indexHandler = IndexHandler.getInstance();
409 MetadataRecord mdRecord = indexHandler.getDocMetadata("/echo/la/Benedetti_1585_163127KK.xml");
410 }
411
412 }