Mercurial > hg > mpdl-group
diff software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java Tue May 21 10:19:32 2013 +0200 @@ -0,0 +1,451 @@ +package de.mpg.mpiwg.berlin.mpdl.cms.test; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringReader; +import java.io.StringWriter; +import java.net.URL; +import java.util.ArrayList; +import java.util.Date; +import java.util.Hashtable; + +import javax.xml.transform.stream.StreamSource; + +import net.sf.saxon.s9api.Processor; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.Serializer; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.s9api.XsltCompiler; +import net.sf.saxon.s9api.XsltExecutable; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.Term; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler; +import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; +import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; +import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; +import de.mpg.mpiwg.berlin.mpdl.cms.document.Token; + +public class TestLocal { + private IndexHandler indexer; + + public static void main(String[] args) throws ApplicationException { + try { + TestLocal test = new TestLocal(); + test.init(); + // test.importAllDocuments(); + // test.createAllPdfInDirectory(); + // test.testTransform(); + // test.testXml(); + // test.generateToc(); + test.testCalls(); + // test.generatePdf(); + // test.xquery(); + // test.createToc(); + // test.testScheduler(); + // test.getDocInfo(); + // test.testChars(); + test.end(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void init() throws ApplicationException { + indexer = IndexHandler.getInstance(); + } + + private void end() throws ApplicationException { + indexer.end(); + } + + private void testXml() throws ApplicationException { + try { + DocumentHandler docHandler = new DocumentHandler(); + String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml"); + String pageXmlFileName = docDirName + "/pages" + "/page-" + "444" + ".xml"; + File pageXmlFile = new File(pageXmlFileName); + String pageXmlStr = null; + if (pageXmlFile.exists()) + pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8"); + System.out.println(pageXmlStr); + String tokStr = tokenizeWithLemmas(pageXmlStr, "lat"); + System.out.println(tokStr); + tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr; + byte[] blablabla = tokStr.getBytes("utf-8"); + String blablu = new String(blablabla, "utf-8"); + String bla = enrichWordsOrigRegNorm(blablu); + System.out.println(bla); + + XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); + URL url = new URL("file:/var/yp/Test_1789.xml"); + XdmNode docNode = xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown + + File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml"); + FileReader docFileReader = new FileReader(srcFile); + XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader); + docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml"); + docXmlTokenizer.tokenize(); + ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s"); + String blabla = ""; + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void importAllDocuments() throws ApplicationException { + DocumentHandler docHandler = new DocumentHandler(); + CmsDocOperation docOperation = new CmsDocOperation("importDirectory", "file:/Users/jwillenborg/test/documents", null, null); + docOperation.setCollectionNames("echo"); + docHandler.doOperation(docOperation); + } + + private void createAllPdfInDirectory() throws ApplicationException { + DocumentHandler docHandler = new DocumentHandler(); + CmsDocOperation docOperation = new CmsDocOperation("createAllPdfInDirectory", "file:/Users/jwillenborg/test/documents", null, null); + docOperation.setCollectionNames("echo"); + docHandler.doOperation(docOperation); + } + + private void generatePdf() throws ApplicationException { + long begin = new Date().getTime(); + DocumentHandler docHandler = new DocumentHandler(); + // String docId = "/echo/la/Benedetti_1585_163127KK.xml"; + String docId = "/diverse/de/Einst_Ueber_de_1907_02.xml"; + // String docId = "/archimedes/it/caver_metod_020_it_1891.xml"; + CmsDocOperation docOperation = new CmsDocOperation("createPdf", null, null, docId); + docHandler.doOperation(docOperation); + long end = new Date().getTime(); + System.out.println("Needed time: " + (end - begin)); + } + + private void testChars() throws ApplicationException { + String docId = "/test/benedetti/page-444.xml"; + String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docId; + DocumentHandler docHandler = new DocumentHandler(); + CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docId); + docHandler.doOperation(docOperation); + } + + private void testCalls() throws ApplicationException { + Date before = new Date(); + System.out.println("Indexing start: " + before.getTime()); + String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml"; + String docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdGoerz; + DocumentHandler docHandler = new DocumentHandler(); + CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz); + // docHandler.doOperation(docOperation); + String docIdSchulz = "/tei/de/Schulz_2009.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSchulz; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSchulz); + // docHandler.doOperation(docOperation); + String docIdBenedetti = "/echo/la/Benedetti_1585_163127KK.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdBenedetti; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti); + // docHandler.doOperation(docOperation); + String docIdSongYingxing = "/echo/zh/SongYingxing_1637.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdSongYingxing; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdSongYingxing); + // docHandler.doOperation(docOperation); + String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMonte; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte); + // docHandler.doOperation(docOperation); + String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstein; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein); + // docHandler.doOperation(docOperation); + String docIdEinsteinUeber = "/diverse/de/Einst_Ueber_de_1907_02.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinsteinUeber; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinsteinUeber); + // docHandler.doOperation(docOperation); + String docIdTest = "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdTest; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdTest); + // docHandler.doOperation(docOperation); + String docIdMega = "/test/mega/MEGA_A2_B013-00_ETX.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdMega; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMega); + // docHandler.doOperation(docOperation); + String docIdDiverse = "/diverse/en/078_A_1916.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdDiverse; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdDiverse); + // docHandler.doOperation(docOperation); + String docIdEinstGrossmann = "/diverse/de/EinsteinGrossmann.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrossmann; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrossmann); + // docHandler.doOperation(docOperation); + String docIdEinstGrund = "/diverse/en/078_A_1916.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdEinstGrund; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstGrund); + // docHandler.doOperation(docOperation); + String docIdVolta = "/archimedes/it/volta_nuoMemLetTerz_922_it_1795.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVolta; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVolta); + // docHandler.doOperation(docOperation); + String docIdVitruv = "/echo/it/Vitruvius_1747_Y1G1TRCW.xml"; + docSrcUrlStr = "file:/Users/jwillenborg/texts/mpdl/documents" + docIdVitruv; + docHandler = new DocumentHandler(); + docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdVitruv); + docHandler.doOperation(docOperation); + // indexer.deleteDocument(docIdGoerz); + // indexer.deleteDocument(docIdBenedetti); + MorphologyCache.getInstance().end(); + LexHandler.getInstance().end(); + } + + private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException { + try { + GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(getFragmentsContentHandler); + InputSource inputSource = new InputSource(fileName); + xmlParser.parse(inputSource); + Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages(); + return resultFragments; + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void testTransform() throws ApplicationException { + Date begin = new Date(); + XslResourceTransformer xslResourceTransformer = new XslResourceTransformer("pageXml.xsl"); + xslResourceTransformer = new XslResourceTransformer("pageTei.xsl"); + xslResourceTransformer = new XslResourceTransformer("pageArchimedes.xsl"); + xslResourceTransformer = new XslResourceTransformer("pageXhtml.xsl"); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + Date end = new Date(); + System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); + begin = new Date(); + String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml"; + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + String result = xslResourceTransformer.transform(docFilePath); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + result = xslResourceTransformer.transform(docFilePath); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + result = xslResourceTransformer.transform(docFilePath); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + result = xslResourceTransformer.transform(docFilePath); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + result = xslResourceTransformer.transform(docFilePath); + end = new Date(); + System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); + xslResourceTransformer = new XslResourceTransformer("pageEcho.xsl"); + begin = new Date(); + result = xslResourceTransformer.transform(docFilePath); + result = xslResourceTransformer.transform(docFilePath); + result = xslResourceTransformer.transform(docFilePath); + result = xslResourceTransformer.transform(docFilePath); + result = xslResourceTransformer.transform(docFilePath); + end = new Date(); + System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); + try { + Processor processor = new Processor(false); + XsltCompiler xsltCompiler = processor.newXsltCompiler(); + URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl"); + StreamSource xslStreamSource = new StreamSource(xslUrl.openStream()); + XsltExecutable xsltExecutable = xsltCompiler.compile(xslStreamSource); + net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load(); + Serializer serializer = new Serializer(); + serializer.setOutputWriter(new StringWriter()); + begin = new Date(); + for (int i=0; i<=5; i++) { + StreamSource xmlDoc = new StreamSource(docFilePath); + xsltTransformer.setSource(xmlDoc); // needs some time for bigger documents + xsltTransformer.setDestination(serializer); + xsltTransformer.transform(); // needs some time for bigger documents + result = serializer.getOutputDestination().toString(); + } + end = new Date(); + System.out.println("Needed time: " + (end.getTime() - begin.getTime()) + " ms"); + } catch (Exception e) { + + } + } + + private String tokenizeXmlFragment() throws ApplicationException { + String result = null; + try { + String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); + String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; + URL srcUrl = new URL(srcUrlStr); + InputStream inputStream = srcUrl.openStream(); + BufferedInputStream in = new BufferedInputStream(inputStream); + xmlFragment = IOUtils.toString(in, "utf-8"); + in.close(); + + XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); + xmlTokenizer.setLanguage("lat"); + String[] stopElements = {"var"}; + // xmlTokenizer.setOutputFormat("string"); + String[] outputOptions = {"withLemmas"}; + xmlTokenizer.setOutputOptions(outputOptions); + xmlTokenizer.setStopElements(stopElements); + xmlTokenizer.tokenize(); + result = xmlTokenizer.getXmlResult(); + System.out.println(result); + } catch (Exception e) { + throw new ApplicationException(e); + } + return result; + } + + private String normalizeWords(String xmlStr) throws ApplicationException { + try { + WordContentHandler wordContentHandler = new WordContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(wordContentHandler); + StringReader strReader = new StringReader(xmlStr); + InputSource inputSource = new InputSource(strReader); + xmlParser.parse(inputSource); + String result = wordContentHandler.getResult(); + return result; + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException { + StringReader strReader = new StringReader(xmlStr); + XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader); + xmlTokenizer.setLanguage(language); + String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance) + xmlTokenizer.setOutputOptions(outputOptionsWithLemmas); + xmlTokenizer.tokenize(); + String retStr = xmlTokenizer.getXmlResult(); + return retStr; + } + + private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { + try { + WordContentHandler wordContentHandler = new WordContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(wordContentHandler); + StringReader strReader = new StringReader(xmlStr); + InputSource inputSource = new InputSource(strReader); + xmlParser.parse(inputSource); + String result = wordContentHandler.getResult(); + return result; + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { + String result = null; + try { + xmlStr = normalizeWords(xmlStr); + HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language); + highlightContentHandler.setFirstPageBreakReachedMode(true); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(highlightContentHandler); + StringReader stringReader = new StringReader(xmlStr); + InputSource inputSource = new InputSource(stringReader); + xmlParser.parse(inputSource); + result = highlightContentHandler.getResult().toString(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return result; + } + + private void testScheduler() throws ApplicationException { + CmsDocOperation docOperation = new CmsDocOperation("update", "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=/echo/zh/SongYingxing_1637.xml", null, "/echo/zh/SongYingxing_1637.xml"); + String[] elemNames = {"s", "head"}; + docOperation.setElementNames(elemNames); + CmsChainScheduler scheduler = CmsChainScheduler.getInstance(); + docOperation = scheduler.doOperation(docOperation); + String bla = ""; + } + + private void xquery() throws ApplicationException { + try { + XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); + URL srcUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml"); + String getTocEntries = "let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']"; + String getFigures = + "let $allFigures := //*:figure " + + "let $figures := " + + " for $figure at $pos in $allFigures "+ + " let $caption := string-join($figure/*:caption/text(), ' ') " + + " let $description := string-join($figure/*:description/text(), ' ') " + + " let $variables := string-join($figure/*:variables/text(), ' ') " + + " let $retFigure := " + + " element {'figure'}" + + " { attribute {'number'} {$pos}, " + + " element {'caption'} {$caption}, " + + " element {'description'} {$description}, " + + " element {'variables'} {$variables} }" + + " return " + + " $retFigure " + + "return $figures"; + String result = xQueryEvaluator.evaluateAsString(srcUrl, getFigures); + String bla = result; + } catch (Exception e) { + throw new ApplicationException(e); + } + } + + private void createToc() throws ApplicationException { + String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637"; + XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl"); + File tocFile = new File(docDirName + "/toc.xml"); + String docDestFileName = docDirName + "/SongYingxing_1637.xml"; + String tocResult = tocTransformer.transform(docDestFileName); + String bla = ""; + } + + private void getDocInfo() throws ApplicationException { + IndexHandler indexHandler = IndexHandler.getInstance(); + MetadataRecord mdRecord = indexHandler.getDocMetadata("/echo/la/Benedetti_1585_163127KK.xml"); + } + +}