view software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/test/TestLocal.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.cms.test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;

import javax.xml.transform.stream.StreamSource;

import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.Serializer;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XsltCompiler;
import net.sf.saxon.s9api.XsltExecutable;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.Term;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsChainScheduler;
import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
import de.mpg.mpiwg.berlin.mpdl.cms.document.Token;

public class TestLocal {
  private IndexHandler indexer;

  /**
   * Entry point of this manual test driver.
   *
   * Initializes the index handler, runs the currently enabled test routine(s) and
   * finally shuts the index handler down again. Exceptions raised by a routine are
   * printed to standard error and not propagated.
   *
   * @param args command line arguments (not evaluated)
   * @throws ApplicationException declared for callers; failures of the routines are caught and printed here
   */
  public static void main(String[] args) throws ApplicationException {
    TestLocal test = new TestLocal();
    try {
      test.init();
      // toggle the routines to run by (un)commenting the corresponding line
      // test.importAllDocuments();
      // test.createAllPdfInDirectory();
      // test.testTransform();
      // test.testXml();
      // test.generateToc();
      test.testCalls();
      // test.generatePdf();
      // test.xquery();
      // test.createToc();
      // test.testScheduler();
      // test.getDocInfo();
      // test.testChars();
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // shut the index handler down even if a routine failed; skip when init() never got that far
      if (test.indexer != null) {
        try {
          test.end();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Obtains the index handler via {@code IndexHandler.getInstance()} and keeps it in
   * the {@code indexer} field so that {@code end()} can shut it down later.
   *
   * @throws ApplicationException if the index handler cannot be obtained
   */
  private void init() throws ApplicationException {
    this.indexer = IndexHandler.getInstance();
  }
  
  /**
   * Calls {@code end()} on the index handler stored by {@code init()}.
   * Must only be called after {@code init()}; otherwise the field is still null.
   *
   * @throws ApplicationException if the index handler reports a failure while ending
   */
  private void end() throws ApplicationException {
    final IndexHandler handler = this.indexer;
    handler.end();
  }

  /**
   * Scratch routine exercising three independent XML steps:
   * <ol>
   *   <li>tokenize one stored page of a document with lemmas and enrich its words,</li>
   *   <li>parse a local XML file with the XQuery evaluator (fails with a detail message if not parseable),</li>
   *   <li>tokenize a complete TEI document and fetch its "s" elements.</li>
   * </ol>
   * All paths are hard-coded developer paths. Any exception is printed to standard
   * error and ends the routine.
   *
   * @throws ApplicationException declared for consistency with the other routines; failures are printed here
   */
  private void testXml() throws ApplicationException {
    try {
      // step 1: page fragment -> tokens with lemmas -> enriched words
      DocumentHandler docHandler = new DocumentHandler();
      String docDirName = docHandler.getDocDir("/echo/la/Benedetti_1585_163127KK.xml");
      File pageXmlFile = new File(docDirName + "/pages" + "/page-" + "444" + ".xml");
      if (pageXmlFile.exists()) {
        String pageXmlStr = FileUtils.readFileToString(pageXmlFile, "utf-8");
        System.out.println(pageXmlStr);
        String tokStr = tokenizeWithLemmas(pageXmlStr, "lat");
        System.out.println(tokStr);
        tokStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokStr;
        // round trip through UTF-8 bytes, kept from the original experiment
        byte[] tokBytes = tokStr.getBytes("utf-8");
        String tokStrUtf8 = new String(tokBytes, "utf-8");
        String enrichedStr = enrichWordsOrigRegNorm(tokStrUtf8);
        System.out.println(enrichedStr);
      } else {
        // without the page file there is nothing to tokenize; the remaining steps do not depend on it
        System.out.println("Page file does not exist: " + pageXmlFile.getAbsolutePath());
      }

      // step 2: parse check
      XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
      URL url = new URL("file:/var/yp/Test_1789.xml");
      xQueryEvaluator.parse(url); // if it is not parseable an exception with a detail message is thrown

      // step 3: tokenize a whole document
      File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml");
      // NOTE(review): FileReader decodes with the platform default charset - confirm this matches the document encoding
      FileReader docFileReader = new FileReader(srcFile);
      try {
        XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader);
        docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml");
        docXmlTokenizer.tokenize();
        // result is not inspected; the call only exercises the element extraction
        docXmlTokenizer.getElements("s");
      } finally {
        // release the file handle regardless of the tokenizer outcome
        docFileReader.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  
  /**
   * Runs an "importDirectory" operation for the hard-coded local documents
   * directory, with the collection names set to "echo".
   *
   * @throws ApplicationException if the document handler fails
   */
  private void importAllDocuments() throws ApplicationException {
    final String sourceDirUrl = "file:/Users/jwillenborg/test/documents";
    final DocumentHandler handler = new DocumentHandler();
    final CmsDocOperation importOperation = new CmsDocOperation("importDirectory", sourceDirUrl, null, null);
    importOperation.setCollectionNames("echo");
    handler.doOperation(importOperation);
  }
  
  /**
   * Runs a "createAllPdfInDirectory" operation for the hard-coded local documents
   * directory, with the collection names set to "echo".
   *
   * @throws ApplicationException if the document handler fails
   */
  private void createAllPdfInDirectory() throws ApplicationException {
    final String sourceDirUrl = "file:/Users/jwillenborg/test/documents";
    final DocumentHandler handler = new DocumentHandler();
    final CmsDocOperation pdfOperation = new CmsDocOperation("createAllPdfInDirectory", sourceDirUrl, null, null);
    pdfOperation.setCollectionNames("echo");
    handler.doOperation(pdfOperation);
  }
  
  /**
   * Runs a "createPdf" operation for one hard-coded document id and prints the
   * elapsed wall-clock time in milliseconds.
   *
   * @throws ApplicationException if the document handler fails
   */
  private void generatePdf() throws ApplicationException {
    final long startMillis = System.currentTimeMillis();
    final DocumentHandler handler = new DocumentHandler();
    // other document ids used with this routine:
    //   "/echo/la/Benedetti_1585_163127KK.xml"
    //   "/archimedes/it/caver_metod_020_it_1891.xml"
    final String docId = "/diverse/de/Einst_Ueber_de_1907_02.xml";
    final CmsDocOperation pdfOperation = new CmsDocOperation("createPdf", null, null, docId);
    handler.doOperation(pdfOperation);
    final long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println("Needed time: " + elapsedMillis);
  }

  /**
   * Runs a "create" operation for a single test page whose source URL is built
   * from the hard-coded local documents directory plus the document id.
   *
   * @throws ApplicationException if the document handler fails
   */
  private void testChars() throws ApplicationException {
    final String pageDocId = "/test/benedetti/page-444.xml";
    final String pageSrcUrl = "file:/Users/jwillenborg/texts/mpdl/documents" + pageDocId;
    final DocumentHandler handler = new DocumentHandler();
    final CmsDocOperation createOperation = new CmsDocOperation("create", pageSrcUrl, null, pageDocId);
    handler.doOperation(createOperation);
  }
  
  /**
   * Runs a "create" operation for one document taken from a list of known test
   * documents and afterwards ends the morphology cache and the lex handler.
   *
   * The source URL of a document is the hard-coded local documents directory plus
   * the document id. Only the last entry of the list is processed; to process a
   * different document, move it to the end of the list (or change the index below).
   * The cache and the lex handler are ended even when the operation fails.
   *
   * @throws ApplicationException if the operation or the shutdown of the cache / lex handler fails
   */
  private void testCalls() throws ApplicationException {
    Date before = new Date();
    System.out.println("Indexing start: " + before.getTime());
    final String docBaseUrl = "file:/Users/jwillenborg/texts/mpdl/documents";
    // documents this routine has been used with
    final String[] knownDocIds = {
      "/tei/de/dt-ptolemaeus-tei-merge2.xml",
      "/tei/de/Schulz_2009.xml",
      "/echo/la/Benedetti_1585_163127KK.xml",
      "/echo/zh/SongYingxing_1637.xml",
      "/archimedes/la/monte_mecha_036_la_1577.xml",
      "/diverse/de/Einst_Antwo_de_1912.xml",
      "/diverse/de/Einst_Ueber_de_1907_02.xml",
      "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml",
      "/test/mega/MEGA_A2_B013-00_ETX.xml",
      "/diverse/en/078_A_1916.xml",
      "/diverse/de/EinsteinGrossmann.xml",
      "/archimedes/it/volta_nuoMemLetTerz_922_it_1795.xml",
      "/echo/it/Vitruvius_1747_Y1G1TRCW.xml"
    };
    // the document actually processed in this run
    final String docId = knownDocIds[knownDocIds.length - 1];
    DocumentHandler docHandler = new DocumentHandler();
    CmsDocOperation docOperation = new CmsDocOperation("create", docBaseUrl + docId, null, docId);
    try {
      docHandler.doOperation(docOperation);
    } finally {
      // always end the language technology singletons, also after a failed operation
      MorphologyCache.getInstance().end();
      LexHandler.getInstance().end();
    }
  }

  /**
   * Parses the given XML resource with a SAX parser that feeds a
   * {@code GetFragmentsContentHandler} and returns the pages collected by that handler.
   *
   * @param fileName system identifier of the XML input, passed to {@code InputSource}
   * @return the handler's result pages
   * @throws ApplicationException wrapping any SAX or I/O failure during parsing
   */
  private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException {
    try {
      final GetFragmentsContentHandler fragmentCollector = new GetFragmentsContentHandler();
      final XMLReader saxReader = new SAXParser();
      saxReader.setContentHandler(fragmentCollector);
      saxReader.parse(new InputSource(fileName));
      return fragmentCollector.getResultPages();
    } catch (SAXException saxProblem) {
      throw new ApplicationException(saxProblem);
    } catch (IOException ioProblem) {
      throw new ApplicationException(ioProblem);
    }
  }

  /**
   * Small timing comparison of XSL transformations; each measurement is printed as
   * "Needed time: N ms":
   * <ol>
   *   <li>constructing an {@code XslResourceTransformer} for each of five stylesheets,</li>
   *   <li>transforming one page with a freshly constructed transformer per run,</li>
   *   <li>transforming the same page with one reused transformer,</li>
   *   <li>transforming the same page directly through the Saxon s9api with one reused transformer.</li>
   * </ol>
   * Measurements 2 to 4 all use the same number of runs so that they are comparable.
   *
   * @throws ApplicationException if a transformer fails or the direct Saxon run cannot be set up or executed
   */
  private void testTransform() throws ApplicationException {
    final int runs = 5;
    final String docFilePath = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml";

    // 1: construction of the transformers only
    final String[] xslNames = {"pageXml.xsl", "pageTei.xsl", "pageArchimedes.xsl", "pageXhtml.xsl", "pageEcho.xsl"};
    long begin = System.currentTimeMillis();
    for (int i = 0; i < xslNames.length; i++) {
      new XslResourceTransformer(xslNames[i]);
    }
    long end = System.currentTimeMillis();
    System.out.println("Needed time: " + (end - begin) + " ms");

    // 2: new transformer for every transformation
    begin = System.currentTimeMillis();
    for (int i = 0; i < runs; i++) {
      XslResourceTransformer freshTransformer = new XslResourceTransformer("pageEcho.xsl");
      freshTransformer.transform(docFilePath);
    }
    end = System.currentTimeMillis();
    System.out.println("Needed time: " + (end - begin) + " ms");

    // 3: one transformer reused for all transformations (construction not measured)
    XslResourceTransformer reusedTransformer = new XslResourceTransformer("pageEcho.xsl");
    begin = System.currentTimeMillis();
    for (int i = 0; i < runs; i++) {
      reusedTransformer.transform(docFilePath);
    }
    end = System.currentTimeMillis();
    System.out.println("Needed time: " + (end - begin) + " ms");

    // 4: Saxon s9api used directly
    try {
      Processor processor = new Processor(false);
      XsltCompiler xsltCompiler = processor.newXsltCompiler();
      URL xslUrl = XslResourceTransformer.class.getResource("pageEcho.xsl");
      InputStream xslStream = xslUrl.openStream();
      XsltExecutable xsltExecutable;
      try {
        xsltExecutable = xsltCompiler.compile(new StreamSource(xslStream));
      } finally {
        // the stylesheet stream is only needed for compilation
        xslStream.close();
      }
      net.sf.saxon.s9api.XsltTransformer xsltTransformer = xsltExecutable.load();
      Serializer serializer = new Serializer();
      serializer.setOutputWriter(new StringWriter());
      begin = System.currentTimeMillis();
      for (int i = 0; i < runs; i++) {
        StreamSource xmlDoc = new StreamSource(docFilePath);
        xsltTransformer.setSource(xmlDoc);  // needs some time for bigger documents
        xsltTransformer.setDestination(serializer);
        xsltTransformer.transform();  // needs some time for bigger documents
        // materialize the output as a string, as the transformer variants above return a string too
        serializer.getOutputDestination().toString();
      }
      end = System.currentTimeMillis();
      System.out.println("Needed time: " + (end - begin) + " ms");
    } catch (Exception e) {
      // do not hide setup or transformation failures of the direct Saxon run
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Fetches one page of a document as XML from the hard-coded MPDL URL, tokenizes
   * it as Latin text with lemmas (the element "var" is configured as stop element),
   * prints the tokenizer's XML result and returns it.
   *
   * @return the XML result of the tokenizer
   * @throws ApplicationException wrapping any failure while fetching or tokenizing
   */
  private String tokenizeXmlFragment() throws ApplicationException {
    String result = null;
    try {
      String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13";
      URL srcUrl = new URL(srcUrlStr);
      InputStream in = new BufferedInputStream(srcUrl.openStream());
      String xmlFragment;
      try {
        xmlFragment = IOUtils.toString(in, "utf-8");
      } finally {
        // close the connection stream also when reading fails
        in.close();
      }

      XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
      xmlTokenizer.setLanguage("lat");
      String[] stopElements = {"var"};
      // xmlTokenizer.setOutputFormat("string");
      String[] outputOptions = {"withLemmas"};
      xmlTokenizer.setOutputOptions(outputOptions);
      xmlTokenizer.setStopElements(stopElements);
      xmlTokenizer.tokenize();
      result = xmlTokenizer.getXmlResult();
      System.out.println(result);
    } catch (Exception e) {
      throw new ApplicationException(e);
    }
    return result;
  }
  
  /**
   * Parses the given XML string with a SAX parser that feeds a
   * {@code WordContentHandler} and returns the result produced by that handler.
   *
   * @param xmlStr XML content to process
   * @return the handler's result string
   * @throws ApplicationException wrapping any SAX or I/O failure during parsing
   */
  private String normalizeWords(String xmlStr) throws ApplicationException {
    try {
      final WordContentHandler wordHandler = new WordContentHandler();
      final XMLReader saxReader = new SAXParser();
      saxReader.setContentHandler(wordHandler);
      saxReader.parse(new InputSource(new StringReader(xmlStr)));
      return wordHandler.getResult();
    } catch (SAXException saxProblem) {
      throw new ApplicationException(saxProblem);
    } catch (IOException ioProblem) {
      throw new ApplicationException(ioProblem);
    }
  }
  
  /**
   * Tokenizes the given XML string in the given language with the output option
   * "withLemmas" and returns the tokenizer's XML result.
   *
   * @param xmlStr XML content to tokenize; must not be null
   * @param language language passed to the tokenizer (e.g. "lat")
   * @return the XML result of the tokenizer
   * @throws ApplicationException if the tokenizer fails
   */
  private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
    final XmlTokenizer tokenizer = new XmlTokenizer(new StringReader(xmlStr));
    tokenizer.setLanguage(language);
    // so all tokens are fetched with lemmas (costs performance)
    tokenizer.setOutputOptions(new String[] {"withLemmas"});
    tokenizer.tokenize();
    return tokenizer.getXmlResult();
  }
  
  /**
   * Runs the given XML string through a SAX parse driven by a
   * {@code WordContentHandler} and hands back what the handler produced.
   *
   * @param xmlStr XML content to process
   * @return the handler's result string
   * @throws ApplicationException wrapping any SAX or I/O failure during parsing
   */
  private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
    try {
      WordContentHandler enrichingHandler = new WordContentHandler();
      XMLReader parser = new SAXParser();
      parser.setContentHandler(enrichingHandler);
      InputSource xmlSource = new InputSource(new StringReader(xmlStr));
      parser.parse(xmlSource);
      return enrichingHandler.getResult();
    } catch (SAXException saxFailure) {
      throw new ApplicationException(saxFailure);
    } catch (IOException ioFailure) {
      throw new ApplicationException(ioFailure);
    }
  }

  /**
   * Normalizes the words of the given XML string via {@code normalizeWords} and
   * then parses the outcome with a SAX parser that feeds a
   * {@code HighlightContentHandler} configured with the given highlight settings
   * (first-page-break-reached mode switched on).
   *
   * @param xmlStr XML content to highlight
   * @param highlightElem passed to the highlight handler
   * @param highlightElemPos passed to the highlight handler
   * @param highlightQueryType passed to the highlight handler
   * @param highlightQuery passed to the highlight handler
   * @param language passed to the highlight handler
   * @return the string form of the highlight handler's result
   * @throws ApplicationException if normalization fails, or wrapping any SAX or I/O failure during parsing
   */
  private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
    try {
      final String normalizedXml = normalizeWords(xmlStr);
      final HighlightContentHandler highlighter = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
      highlighter.setFirstPageBreakReachedMode(true);
      final XMLReader saxReader = new SAXParser();
      saxReader.setContentHandler(highlighter);
      saxReader.parse(new InputSource(new StringReader(normalizedXml)));
      return highlighter.getResult().toString();
    } catch (SAXException saxProblem) {
      throw new ApplicationException(saxProblem);
    } catch (IOException ioProblem) {
      throw new ApplicationException(ioProblem);
    }
  }
  
  /**
   * Hands an "update" operation for one hard-coded document, restricted to the
   * element names "s" and "head", to the chain scheduler.
   *
   * @throws ApplicationException if the scheduler fails
   */
  private void testScheduler() throws ApplicationException {
    final String docId = "/echo/zh/SongYingxing_1637.xml";
    final String srcUrl = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=" + docId;
    final CmsDocOperation updateOperation = new CmsDocOperation("update", srcUrl, null, docId);
    updateOperation.setElementNames(new String[] {"s", "head"});
    // the operation returned by the scheduler is not inspected here
    CmsChainScheduler.getInstance().doOperation(updateOperation);
  }
  
  /**
   * Evaluates an XQuery against a hard-coded local document. The query builds one
   * "figure" element per figure of the document, carrying its position as the
   * "number" attribute and its caption, description and variables texts as children.
   * The result string is not inspected.
   *
   * @throws ApplicationException wrapping any failure during evaluation
   */
  private void xquery() throws ApplicationException {
    try {
      final XQueryEvaluator evaluator = new XQueryEvaluator();
      final URL docUrl = new URL("file:/Users/jwillenborg/tmp/blablabla/Benedetti_1585.xml");
      // query fragment kept for reference (selects the toc entries):
      //   let $tocEntries := //echo:div[@type = 'section' or @type = 'chapter']
      final StringBuilder figuresQuery = new StringBuilder();
      figuresQuery.append("let $allFigures := //*:figure ");
      figuresQuery.append("let $figures := ");
      figuresQuery.append("  for $figure at $pos in $allFigures ");
      figuresQuery.append("    let $caption := string-join($figure/*:caption/text(), ' ') ");
      figuresQuery.append("    let $description := string-join($figure/*:description/text(), ' ') ");
      figuresQuery.append("    let $variables := string-join($figure/*:variables/text(), ' ') ");
      figuresQuery.append("    let $retFigure := ");
      figuresQuery.append("      element {'figure'}");
      figuresQuery.append("      { attribute {'number'} {$pos}, ");
      figuresQuery.append("       element {'caption'} {$caption}, ");
      figuresQuery.append("       element {'description'} {$description}, ");
      figuresQuery.append("       element {'variables'} {$variables} }");
      figuresQuery.append("    return ");
      figuresQuery.append("      $retFigure ");
      figuresQuery.append("return $figures");
      evaluator.evaluateAsString(docUrl, figuresQuery.toString());
    } catch (Exception failure) {
      throw new ApplicationException(failure);
    }
  }
  
  /**
   * Applies the "toc.xsl" stylesheet to one hard-coded document file.
   * The transformation result is neither inspected nor written anywhere.
   *
   * @throws ApplicationException if the transformer fails
   */
  private void createToc() throws ApplicationException {
    final String docDirName = "/Users/jwillenborg/mpdl/data/xml/documents/echo/zh/SongYingxing_1637";
    final XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
    final String docFileName = docDirName + "/SongYingxing_1637.xml";
    tocTransformer.transform(docFileName);
  }
  
  /**
   * Requests the metadata record of one hard-coded document from the index handler.
   * The returned record is not inspected.
   *
   * @throws ApplicationException if the index handler fails
   */
  private void getDocInfo() throws ApplicationException {
    final String docId = "/echo/la/Benedetti_1585_163127KK.xml";
    IndexHandler.getInstance().getDocMetadata(docId);
  }
  
}