Mercurial > hg > mpdl-group
view software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.cms.test; import java.io.BufferedInputStream; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.io.StringReader; import java.io.StringWriter; import java.net.URL; import java.util.ArrayList; import java.util.Date; import java.util.Hashtable; import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.Processor; import net.sf.saxon.s9api.QName; import net.sf.saxon.s9api.Serializer; import net.sf.saxon.s9api.XdmNode; import net.sf.saxon.s9api.XsltCompiler; import net.sf.saxon.s9api.XsltExecutable; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.Term; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler; import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler; import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler; import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; import de.mpg.mpiwg.berlin.mpdl.cms.document.Token; public class TestLocal { private IndexHandler indexer; public static void main(String[] args) throws ApplicationException { try { TestLocal test = new TestLocal(); test.init(); // test.importAllDocuments(); // test.createAllPdfInDirectory(); // test.testTransform(); // test.testXml(); // test.generateToc(); test.testCalls(); // test.generatePdf(); // test.xquery(); // test.createToc(); // test.testScheduler(); // test.getDocInfo(); // test.testChars(); test.end(); } catch (Exception e) { e.printStackTrace(); } } private void init() throws ApplicationException { indexer = IndexHandler.getInstance(); } private void end() throws ApplicationException { indexer.end(); } private void testXml() throws ApplicationException { try { DocumentHandler docHandler = new DocumentHandler(); String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml"); String pageXmlFileName = docDirName + "/pages" + "/page-" + "444" + ".xml"; File pageXmlFile = new File(pageXmlFileName); String pageXmlStr = null; if (pageXmlFile.exists()) pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8"); System.out.println(pageXmlStr); String tokStr = tokenizeWithLemmas(pageXmlStr, "lat"); System.out.println(tokStr); tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr; byte[] blablabla = tokStr.getBytes("utf-8"); String blablu = new String(blablabla, "utf-8"); String bla = enrichWordsOrigRegNorm(blablu); System.out.println(bla); XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); URL url = new URL("file:/var/yp/Test_1789.xml"); XdmNode docNode = xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml"); FileReader docFileReader = new FileReader(srcFile); XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader); docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml"); docXmlTokenizer.tokenize(); ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s"); String blabla = ""; } catch (Exception e) { e.printStackTrace(); } } private void importAllDocuments() throws ApplicationException { DocumentHandler docHandler = new DocumentHandler(); CmsDocOperation docOperation = new CmsDocOperation("importDirectory", "file:/Users/jwillenborg/test/documents", null, null); docOperation.setCollectionNames("echo"); docHandler.doOperation(docOperation); } private void createAllPdfInDirectory() throws ApplicationException { DocumentHandler docHandler = new DocumentHandler(); CmsDocOperation docOperation = new CmsDocOperation("createAllPdfInDirectory", "file:/Users/jwillenborg/test/documents", null, null); docOperation.setCollectionNames("echo"); docHandler.doOperation(docOperation); } private void generatePdf() throws ApplicationException { long begin = new Date().getTime(); DocumentHandler docHandler = new DocumentHandler(); // String docId = "/echo/la/Benedetti_1585_163127KK.xml"; String docId = "/diverse/de/Einst_Ueber_de_1907_02.xml"; // String docId = "/archimedes/it/caver_metod_020_it_1891.xml"; CmsDocOperation docOperation = new CmsDocOperation("createPdf", null, null, docId); docHandler.doOperation(docOperation); long end = new Date().getTime(); System.out.println("Needed time: " + (end - begin)); } private void testChars() throws ApplicationException { String docId = "/test/benedetti/page-444.xml"; String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docId; DocumentHandler docHandler = new DocumentHandler(); CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docId); docHandler.doOperation(docOperation); } private void testCalls() throws ApplicationException { Date before = new Date(); System.out.println("Indexing start: " + before.getTime()); String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml"; String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdGoerz; DocumentHandler docHandler = new DocumentHandler(); CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz); // docHandler.doOperation(docOperation); String docIdSchulz = "/tei/de/Schulz_2009.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSchulz; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSchulz); // docHandler.doOperation(docOperation); String docIdBenedetti = "/echo/la/Benedetti_1585_163127KK.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdBenedetti; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti); // docHandler.doOperation(docOperation); String docIdSongYingxing = "/echo/zh/SongYingxing_1637.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSongYingxing; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSongYingxing); // docHandler.doOperation(docOperation); String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMonte; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte); // docHandler.doOperation(docOperation); String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstein; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein); // docHandler.doOperation(docOperation); String docIdEinsteinUeber = "/diverse/de/Einst_Ueber_de_1907_02.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinsteinUeber; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinsteinUeber); // docHandler.doOperation(docOperation); String docIdTest = "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdTest; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdTest); // docHandler.doOperation(docOperation); String docIdMega = "/test/mega/MEGA_A2_B013-00_ETX.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMega; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMega); // docHandler.doOperation(docOperation); String docIdDiverse = "/diverse/en/078_A_1916.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdDiverse; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdDiverse); // docHandler.doOperation(docOperation); String docIdEinstGrossmann = "/diverse/de/EinsteinGrossmann.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrossmann; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrossmann); // docHandler.doOperation(docOperation); String docIdEinstGrund = "/diverse/en/078_A_1916.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrund; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrund); // docHandler.doOperation(docOperation); String docIdVolta = "/archimedes/it/volta_nuoMemLetTerz_922_it_1795.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVolta; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVolta); // docHandler.doOperation(docOperation); String docIdVitruv = "/echo/it/Vitruvius_1747_Y1G1TRCW.xml"; docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVitruv; docHandler = new DocumentHandler(); docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVitruv); docHandler.doOperation(docOperation); // indexer.deleteDocument(docIdGoerz); // indexer.deleteDocument(docIdBenedetti); MorphologyCache.getInstance().end(); LexHandler.getInstance().end(); } private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException { try { GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(getFragmentsContentHandler); InputSource inputSource = new InputSource(fileName); xmlParser.parse(inputSource); Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages(); return resultFragments; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private void testTransform() throws ApplicationException { Date begin = new Date(); XslResourceTransformer xslResourceTransformer = new XslResourceTransformer("pageXml.xsl"); xslResourceTransformer = new XslResourceTransformer("pageTei.xsl"); xslResourceTransformer = new XslResourceTransformer("pageArchimedes.xsl"); xslResourceTransformer = new XslResourceTransformer("pageXhtml.xsl"); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); Date end = new Date(); System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); begin = new Date(); String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml"; xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); String result = xslResourceTransformer.transform(docFilePath); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); result = xslResourceTransformer.transform(docFilePath); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); result = xslResourceTransformer.transform(docFilePath); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); result = xslResourceTransformer.transform(docFilePath); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); result = xslResourceTransformer.transform(docFilePath); end = new Date(); System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); begin = new Date(); result = xslResourceTransformer.transform(docFilePath); result = xslResourceTransformer.transform(docFilePath); result = xslResourceTransformer.transform(docFilePath); result = xslResourceTransformer.transform(docFilePath); result = xslResourceTransformer.transform(docFilePath); end = new Date(); System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); try { Processor processor = new Processor(false); XsltCompiler xsltCompiler = processor.newXsltCompiler(); URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl"); StreamSource xslStreamSource = new StreamSource(xslUrl.openStream()); XsltExecutable xsltExecutable = xsltCompiler.compile(xslStreamSource); net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load(); Serializer serializer = new Serializer(); serializer.setOutputWriter(new StringWriter()); begin = new Date(); for (int i=0; i<=5; i++) { StreamSource xmlDoc = new StreamSource(docFilePath); xsltTransformer.setSource(xmlDoc); // needs some time for bigger documents xsltTransformer.setDestination(serializer); xsltTransformer.transform(); // needs some time for bigger documents result = serializer.getOutputDestination().toString(); } end = new Date(); System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); } catch (Exception e) { } } private String tokenizeXmlFragment() throws ApplicationException { String result = null; try { String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; URL srcUrl = new URL(srcUrlStr); InputStream inputStream = srcUrl.openStream(); BufferedInputStream in = new BufferedInputStream(inputStream); xmlFragment = IOUtils.toString(in, "utf-8"); in.close(); XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); xmlTokenizer.setLanguage("lat"); String[] stopElements = {"var"}; // xmlTokenizer.setOutputFormat("string"); String[] outputOptions = {"withLemmas"}; xmlTokenizer.setOutputOptions(outputOptions); xmlTokenizer.setStopElements(stopElements); xmlTokenizer.tokenize(); result = xmlTokenizer.getXmlResult(); System.out.println(result); } catch (Exception e) { throw new ApplicationException(e); } return result; } private String normalizeWords(String xmlStr) throws ApplicationException { try { WordContentHandler wordContentHandler = new WordContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(wordContentHandler); StringReader strReader = new StringReader(xmlStr); InputSource inputSource = new InputSource(strReader); xmlParser.parse(inputSource); String result = wordContentHandler.getResult(); return result; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException { StringReader strReader = new StringReader(xmlStr); XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader); xmlTokenizer.setLanguage(language); String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance) xmlTokenizer.setOutputOptions(outputOptionsWithLemmas); xmlTokenizer.tokenize(); String retStr = xmlTokenizer.getXmlResult(); return retStr; } private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { try { WordContentHandler wordContentHandler = new WordContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(wordContentHandler); StringReader strReader = new StringReader(xmlStr); InputSource inputSource = new InputSource(strReader); xmlParser.parse(inputSource); String result = wordContentHandler.getResult(); return result; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { String result = null; try { xmlStr = normalizeWords(xmlStr); HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language); highlightContentHandler.setFirstPageBreakReachedMode(true); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(highlightContentHandler); StringReader stringReader = new StringReader(xmlStr); InputSource inputSource = new InputSource(stringReader); xmlParser.parse(inputSource); result = highlightContentHandler.getResult().toString(); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } return result; } private void testScheduler() throws ApplicationException { CmsDocOperation docOperation = new CmsDocOperation("update", "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=/echo/zh/SongYingxing_1637.xml", null, "/echo/zh/SongYingxing_1637.xml"); String[] elemNames = {"s", "head"}; docOperation.setElementNames(elemNames); CmsChainScheduler scheduler = CmsChainScheduler.getInstance(); docOperation = scheduler.doOperation(docOperation); String bla = ""; } private void xquery() throws ApplicationException { try { XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); URL srcUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml"); String getTocEntries = "let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']"; String getFigures = "let $allFigures := //*:figure " + "let $figures := " + " for $figure at $pos in $allFigures "+ " let $caption := string-join($figure/*:caption/text(), ' ') " + " let $description := string-join($figure/*:description/text(), ' ') " + " let $variables := string-join($figure/*:variables/text(), ' ') " + " let $retFigure := " + " element {'figure'}" + " { attribute {'number'} {$pos}, " + " element {'caption'} {$caption}, " + " element {'description'} {$description}, " + " element {'variables'} {$variables} }" + " return " + " $retFigure " + "return $figures"; String result = xQueryEvaluator.evaluateAsString(srcUrl, getFigures); String bla = result; } catch (Exception e) { throw new ApplicationException(e); } } private void createToc() throws ApplicationException { String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637"; XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl"); File tocFile = new File(docDirName + "/toc.xml"); String docDestFileName = docDirName + "/SongYingxing_1637.xml"; String tocResult = tocTransformer.transform(docDestFileName); String bla = ""; } private void getDocInfo() throws ApplicationException { IndexHandler indexHandler = IndexHandler.getInstance(); MetadataRecord mdRecord = indexHandler.getDocMetadata("/echo/la/Benedetti_1585_163127KK.xml"); } }