mpdl-group: software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java comparison

comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt

author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 21 May 2013 10:19:32 +0200
parents
children

comparison

equal deleted inserted replaced

-:e845310098ba
+:e9fe3186670c
+package de.mpg.mpiwg.berlin.mpdl.cms.document;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Logger;
+import net.sf.saxon.s9api.Axis;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.XdmNode;
+import net.sf.saxon.s9api.XdmNodeKind;
+import net.sf.saxon.s9api.XdmSequenceIterator;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.io.FileUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpConnectionParams;
+import org.apache.http.params.HttpParams;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
+import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
+/**
+* Handler for documents (singleton).
+*/
+public class DocumentHandler {
+private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName());  // logger shared by all operations of this handler
+private static List<String> EXCLUDED_PROJECT_DOCS =  // document ids for which isProjectDoc() returns false, so create() and createPdf() skip them
+Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transformer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16)
+"/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml",
+"/echo/zh/Min_chan_luyi_3_2FP9M172.xml",
+"/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml",
+"/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml",
+"/echo/zh/Xifa_shenji.xml",
+"/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml",
+"/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml");
+private long beginOfOperation;  // read when logging the elapsed time; presumably set by beginOperation() (not visible here) - TODO confirm
+private long endOfOperation;  // read when logging the elapsed time; presumably set by endOperation() (not visible here) - TODO confirm
+/**
+ * Dispatches a document operation to the handler method that matches its name.
+ * Names other than the five handled below are ignored without any action.
+ *
+ * @param docOperation the operation to perform; its name selects the handler
+ * @throws ApplicationException if the selected handler fails
+ */
+public void doOperation(CmsDocOperation docOperation) throws ApplicationException {
+  final String name = docOperation.getName();
+  if (name.equals("create")) {
+    create(docOperation);
+    return;
+  }
+  if (name.equals("delete")) {
+    delete(docOperation);
+    return;
+  }
+  if (name.equals("importDirectory")) {
+    importDirectory(docOperation);
+    return;
+  }
+  if (name.equals("createPdf")) {
+    createPdf(docOperation);
+    return;
+  }
+  if (name.equals("createAllPdfInDirectory")) {
+    createAllPdfInDirectory(docOperation);
+  }
+}
+/**
+ * Imports every ".xml" file found (recursively) below a local directory by running a
+ * "create" operation for each of them.
+ * <p>
+ * The document id of each file is its path relative to the start directory.
+ * A failure while importing a single document is logged and the import continues with
+ * the next file.
+ *
+ * @param docOperation supplies the start directory as a file: URL (source URL) and the
+ *                     collection names that are passed on to every created document
+ * @throws ApplicationException if the start directory does not exist or the directory
+ *                              walk itself fails
+ */
+private void importDirectory(CmsDocOperation docOperation) throws ApplicationException {
+  try {
+    LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)");
+    beginOperation();
+    String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
+    String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
+    File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
+    if (! localDocumentsDir.exists())
+      throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
+    String[] fileExtensions = {"xml"};
+    // length of the start directory path: everything after it is the relative document id
+    int relativePos = localDocumentsDir.getPath().length();
+    Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
+    int i = 0;
+    while (iterFiles.hasNext()) {
+      i++;
+      File xmlFile = iterFiles.next();
+      String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
+      String xmlFileUrlStr = xmlFile.toURI().toURL().toString();
+      CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId);
+      createDocOperation.setCollectionNames(collectionNames);
+      try {
+        doOperation(createDocOperation);
+        Date now = new Date();
+        LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + now.toString() + ")");
+      } catch (Exception e) {
+        // one broken document must not abort the whole import: log it (with stack trace) and go on
+        LOGGER.log(java.util.logging.Level.WARNING, "Document " + i + ": " + docId + " has problems:", e);
+      }
+    }
+    endOperation();
+    LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" );
+  } catch (ApplicationException e) {
+    // already the exception type callers expect: do not wrap it a second time
+    throw e;
+  } catch (Exception e) {
+    throw new ApplicationException(e);
+  }
+}
+/**
+ * Runs a "createPdf" operation for every ".xml" file found (recursively) below a local
+ * directory.
+ * <p>
+ * The document id of each file is its path relative to the start directory.
+ * A failure for a single document is logged and processing continues with the next file.
+ *
+ * @param docOperation supplies the start directory as a file: URL (source URL) and the
+ *                     collection names that are passed on to every "createPdf" operation
+ * @throws ApplicationException if the start directory does not exist or the directory
+ *                              walk itself fails
+ */
+private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException {
+  try {
+    LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time.");
+    beginOperation();
+    String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
+    String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
+    File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
+    if (! localDocumentsDir.exists())
+      throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
+    String[] fileExtensions = {"xml"};
+    // length of the start directory path: everything after it is the relative document id
+    int relativePos = localDocumentsDir.getPath().length();
+    Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
+    int i = 0;
+    while (iterFiles.hasNext()) {
+      i++;
+      File xmlFile = iterFiles.next();
+      String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
+      CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId);
+      createPdfOperation.setCollectionNames(collectionNames);
+      try {
+        doOperation(createPdfOperation);
+        Date now = new Date();
+        LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + now.toString() + ")");
+      } catch (Exception e) {
+        // one broken document must not abort the whole run: log it (with stack trace) and go on
+        LOGGER.log(java.util.logging.Level.WARNING, "Pdf document " + i + ": " + docId + " has problems:", e);
+      }
+    }
+    endOperation();
+    LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" );
+  } catch (ApplicationException e) {
+    // already the exception type callers expect: do not wrap it a second time
+    throw e;
+  } catch (Exception e) {
+    throw new ApplicationException(e);
+  }
+}
+/**
+ * Tells whether a document takes part in the project operations.
+ *
+ * @param docId document id, e.g. /echo/zh/Xifa_shenji.xml
+ * @return false if the id is listed in EXCLUDED_PROJECT_DOCS, true otherwise
+ */
+private boolean isProjectDoc(String docId) {
+  return ! EXCLUDED_PROJECT_DOCS.contains(docId);
+}
+/**
+ * Creates a single document in the CMS from the source URL of the operation.
+ * <p>
+ * Steps, in this order: the source is parsed (an unparseable source raises an exception),
+ * copied to the document directory, transformed with replaceAnchor.xsl into a ".upgrade"
+ * copy, a toc.xml is generated, metadata is extracted, the document is split at its "pb"
+ * elements into page files (plain xml, tokenized "-morph" xml and html) and finally the
+ * document is handed to the IndexHandler for indexing.
+ * <p>
+ * Documents for which isProjectDoc() returns false are skipped with a log message. If the
+ * root element of the source cannot be determined, an error message is set on the
+ * operation and nothing is written.
+ *
+ * @param docOperation the "create" operation: source URL, document id and optionally
+ *                     main language, element names and collection names
+ * @throws ApplicationException if no source URL is given, if an I/O error occurs or if
+ *                              one of the processing steps fails
+ */
+private void create(CmsDocOperation docOperation) throws ApplicationException {
+  try {
+    String operationName = docOperation.getName();
+    String srcUrlStr = docOperation.getSrcUrl();
+    String docId = docOperation.getDocIdentifier();
+    if (! isProjectDoc(docId)) {
+      LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
+      return;
+    }
+    String mainLanguage = docOperation.getMainLanguage();
+    if (mainLanguage == null) {
+      mainLanguage = getMainLanguage(docId);
+    }
+    if (docOperation.getElementNames() == null) {
+      String[] defaultElementNames = {"s", "head", "caption", "variables", "description"};
+      docOperation.setElementNames(defaultElementNames); // default
+    }
+    String docDirName = getDocDir(docId);
+    String docDestFileName = getDocFullFileName(docId);
+    // without a source there is nothing to parse or copy: fail with a clear message
+    // instead of a NullPointerException further down
+    if (srcUrlStr == null || srcUrlStr.equals("empty"))
+      throw new ApplicationException("No source URL is specified for document: " + docId + ". Please specify a source URL and perform the operation again.");
+    URL srcUrl = new URL(srcUrlStr);
+    String protocol = srcUrl.getProtocol();
+    File docDestFile = new File(docDestFileName);
+    // parse validation on file
+    XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
+    XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown
+    String docType = getNodeType(docNode);  // archimedes, echo, TEI, html ...
+    // the null check has to come before trim(), otherwise it can never be reached
+    if (docType == null) {
+      docOperation.setErrorMessage("file type of: " + srcUrlStr + " is not supported");
+      return;
+    }
+    docType = docType.trim();
+    // perform operation on file system
+    if (protocol.equals("file")) {
+      docOperation.setStatus("upload file: " + srcUrlStr + " to CMS");
+    } else {
+      docOperation.setStatus("download file from: " + srcUrlStr + " to CMS");
+    }
+    FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000);
+    // replace anchor in echo documents and also add the number attribute to figures
+    String docDestFileNameUpgrade = docDestFileName + ".upgrade";
+    File docDestFileUpgrade = new File(docDestFileNameUpgrade);
+    XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl");
+    String docDestFileUrlStr = docDestFile.getPath();
+    String result = replaceAnchorTransformer.transform(docDestFileUrlStr);
+    FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8");
+    MetadataRecord mdRecord = new MetadataRecord();
+    mdRecord.setDocId(docId);
+    mdRecord.setCollectionNames(docOperation.getCollectionNames());
+    mdRecord.setType("text/xml");
+    // generate toc file (toc, figure, handwritten)
+    XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
+    File tocFile = new File(docDirName + "/toc.xml");
+    String tocResult = tocTransformer.transform(docDestFileNameUpgrade);
+    FileUtils.writeStringToFile(tocFile, tocResult, "utf-8");
+    // Get metadata info of the xml document
+    docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS");
+    XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator();
+    mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2);
+    String mdRecordLanguage = mdRecord.getLanguage();
+    // the language found in the document wins; the main language is only a fallback
+    if (mdRecordLanguage == null && mainLanguage != null)
+      mdRecord.setLanguage(mainLanguage);
+    // save all pages as single xml files (untokenized and tokenized)
+    docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS");
+    File docDir = new File(docDirName + "/pages");
+    FileUtils.deleteQuietly(docDir);  // first delete pages directory
+    Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb");
+    int pageCount = pageFragments.size();
+    if (pageCount == 0) {
+      // no pb element is found: then the whole document is the first page
+      String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8");
+      docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", "");  // remove the xml declaration if it exists
+      pageFragments = new Hashtable<Integer, StringBuilder>();
+      pageFragments.put(Integer.valueOf(1), new StringBuilder(docXmlStr));
+      pageCount = 1;
+    }
+    PageTransformer pageTransformer = new PageTransformer();
+    for (int page = 1; page <= pageCount; page++) {
+      String fragment = pageFragments.get(Integer.valueOf(page)).toString();
+      fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment;
+      String docPageFileName = docDirName + "/pages/page-" + page + ".xml";
+      File docPageFile = new File(docPageFileName);
+      FileUtils.writeStringToFile(docPageFile, fragment, "utf-8");
+      String language = mdRecord.getLanguage();
+      String tokenizedXmlStr = tokenizeWithLemmas(fragment, language);  // xml fragment enriched with <w> elements
+      tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr;
+      tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr);  // xml string: enrich <w> elements with normalization info (orig, reg, norm)
+      String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml";
+      File docPageTokenizedFile = new File(docPageTokenizedFileName);
+      FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8");
+      String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html";
+      File docPageHtmlFile = new File(docPageHtmlFileName);
+      String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html");
+      FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8");
+    }
+    // perform operation on Lucene
+    docOperation.setStatus(operationName + " document: " + docId + " in CMS");
+    docOperation.setMdRecord(mdRecord);
+    IndexHandler indexHandler = IndexHandler.getInstance();
+    indexHandler.indexDocument(docOperation);
+  } catch (IOException e) {
+    throw new ApplicationException(e);
+  }
+}
+private void delete(CmsDocOperation docOperation) throws ApplicationException {
+String operationName = docOperation.getName();
+String docIdentifier = docOperation.getDocIdentifier();
+if (docIdentifier == null || docIdentifier.trim().equals(""))
+throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
+String docDirStr = getDocDir(docIdentifier);
+File docDir = new File(docDirStr);
+boolean docExists = docDir.exists();
+if (! docExists) {
+throw new ApplicationException("Document:" + docIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again.");
+}
+// perform operation on file system
+docOperation.setStatus(operationName + " document: " + docIdentifier + " in CMS");
+FileUtils.deleteQuietly(docDir);
+// perform operation on Lucene
+IndexHandler indexHandler = IndexHandler.getInstance();
+indexHandler.deleteDocument(docOperation);
+}
+private void createPdf(CmsDocOperation docOperation) throws ApplicationException {
+String docId = docOperation.getDocIdentifier();
+String operationName = docOperation.getName();
+if (docId == null || docId.trim().equals(""))
+throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
+if (! isProjectDoc(docId)) {
+LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
+return;
+}
+IndexHandler indexHandler = IndexHandler.getInstance();
+MetadataRecord mdRecord = indexHandler.getDocMetadata(docId);
+docOperation.setStatus("create PDF and HTML versions of the document: " + docId);
+PdfHandler pdfHandler = PdfHandler.getInstance();
+pdfHandler.createFile(true, true, mdRecord);  // generate Pdf + Html document
+}
+private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
+if (schemaName == null)
+return mdRecord;
+try {
+URL srcUrl = xmlFile.toURI().toURL();
+if (schemaName.equals("archimedes"))
+mdRecord = getMetadataRecordArch(xQueryEvaluator, srcUrl, mdRecord);
+else if (schemaName.equals("echo"))
+mdRecord = getMetadataRecordEcho(xQueryEvaluator, srcUrl, mdRecord);
+else if (schemaName.equals("TEI"))
+mdRecord = getMetadataRecordTei(xQueryEvaluator, srcUrl, mdRecord);
+else if (schemaName.equals("html"))
+mdRecord = getMetadataRecordHtml(xQueryEvaluator, srcUrl, mdRecord);
+else
+mdRecord.setSchemaName("diverse"); // all other cases: set docType to schemaName
+} catch (MalformedURLException e) {
+throw new ApplicationException(e);
+}
+mdRecord.setLastModified(new Date());
+return mdRecord;
+}
+/**
+ * Fills the record with the metadata of an archimedes document (its "info" element).
+ * <p>
+ * Rights, license and access rights are fixed values. The echo id is the "echodir"
+ * value of the document if present, otherwise "/permanent/archimedes/" plus the file
+ * name of the document id without its extension. Page count (number of "pb" elements)
+ * and schema name are set even if the document has no "info" element.
+ *
+ * @param xQueryEvaluator evaluator used for all queries
+ * @param srcUrl          URL of the document
+ * @param mdRecord        the record to fill
+ * @return the record as returned by getEchoMetadata(), or the given record if the
+ *         document has no "info" element
+ * @throws ApplicationException if a query fails
+ */
+private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+  String infoXml = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info");
+  if (infoXml != null) {
+    String locator = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/locator");
+    locator = (locator == null) ? null : StringUtils.deresolveXmlEntities(locator);
+    String author = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/author");
+    author = (author == null) ? null : StringUtils.deresolveXmlEntities(author);
+    String docTitle = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/title");
+    docTitle = (docTitle == null) ? null : StringUtils.deresolveXmlEntities(docTitle);
+    String lang = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/lang[1]");
+    lang = (lang == null) ? null : StringUtils.deresolveXmlEntities(lang);
+    String pubPlace = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/place");
+    pubPlace = (pubPlace == null) ? null : StringUtils.deresolveXmlEntities(pubPlace);
+    String year = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/date");
+    Date pubDate = null;
+    if (year != null && ! year.isEmpty()) {
+      year = StringUtils.deresolveXmlEntities(year);
+      year = new Util().toYearStr(year);  // test if possible etc
+      if (year != null) {
+        try {
+          pubDate = new Util().toDate(year + "-01-01T00:00:00.000Z");
+        } catch (Exception e) {
+          // the date stays null if the year cannot be converted
+        }
+      }
+    }
+    mdRecord.setIdentifier(locator);
+    mdRecord.setLanguage(lang);
+    mdRecord.setCreator(author);
+    mdRecord.setTitle(docTitle);
+    mdRecord.setPublisher(pubPlace);
+    mdRecord.setRights("open access");
+    mdRecord.setDate(pubDate);
+    mdRecord.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
+    mdRecord.setAccessRights("free");
+    // get echo metadata
+    String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(infoXml, "/info/echodir");
+    String docId = mdRecord.getDocId();
+    String baseName = docId;
+    if (docId != null && ! docId.isEmpty()) {
+      int from = docId.lastIndexOf("/") + 1;  // 0 if the id contains no slash
+      int to = docId.lastIndexOf(".");
+      if (to == -1) {
+        to = docId.length();
+      }
+      baseName = docId.substring(from, to);
+    }
+    String echoId = (baseName == null || baseName.isEmpty()) ? null : "/permanent/archimedes/" + baseName;
+    if (echoDir != null && ! echoDir.isEmpty()) {
+      echoId = echoDir;  // an explicit echodir wins over the derived id
+    }
+    mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
+  }
+  String pbCount = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
+  int pages = Integer.valueOf(pbCount);
+  mdRecord.setPageCount(pages);
+  mdRecord.setSchemaName("archimedes");
+  return mdRecord;
+}
+/**
+ * Fills the record with the metadata of an echo document (its "metadata" element).
+ * <p>
+ * The echo id is the "echodir" value of the document if present, otherwise
+ * "/permanent/library/" plus the identifier without a leading "ECHO:" part and without
+ * the text after its last dot. Page count (number of "pb" elements) and schema name are
+ * set even if the document has no "metadata" element.
+ *
+ * @param xQueryEvaluator evaluator used for all queries
+ * @param srcUrl          URL of the document
+ * @param mdRecord        the record to fill
+ * @return the record as returned by getEchoMetadata(), or the given record if the
+ *         document has no "metadata" element
+ * @throws ApplicationException if a query fails
+ */
+private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+  String metaXml = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata");
+  if (metaXml != null) {
+    String id = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:identifier");
+    id = (id == null) ? null : StringUtils.deresolveXmlEntities(id);
+    String author = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:creator");
+    author = (author == null) ? null : StringUtils.deresolveXmlEntities(author);
+    String docTitle = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:title");
+    docTitle = (docTitle == null) ? null : StringUtils.deresolveXmlEntities(docTitle);
+    String lang = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:language[1]");
+    lang = (lang == null) ? null : StringUtils.deresolveXmlEntities(lang);
+    String year = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:date");
+    Date pubDate = null;
+    if (year != null && ! year.isEmpty()) {
+      year = StringUtils.deresolveXmlEntities(year);
+      year = new Util().toYearStr(year);  // test if possible etc
+      if (year != null) {
+        try {
+          pubDate = new Util().toDate(year + "-01-01T00:00:00.000Z");
+        } catch (Exception e) {
+          // the date stays null if the year cannot be converted
+        }
+      }
+    }
+    String docRights = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:rights");
+    docRights = (docRights == null) ? null : StringUtils.deresolveXmlEntities(docRights);
+    String docLicense = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:license");
+    docLicense = (docLicense == null) ? null : StringUtils.deresolveXmlEntities(docLicense);
+    String access = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:accessRights");
+    access = (access == null) ? null : StringUtils.deresolveXmlEntities(access);
+    mdRecord.setIdentifier(id);
+    mdRecord.setLanguage(lang);
+    mdRecord.setCreator(author);
+    mdRecord.setTitle(docTitle);
+    mdRecord.setRights(docRights);
+    mdRecord.setDate(pubDate);
+    mdRecord.setLicense(docLicense);
+    mdRecord.setAccessRights(access);
+    // get echo metadata
+    String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metaXml, "/*:metadata/*:echodir");
+    String shortId = id;
+    if (id != null && ! id.isEmpty()) {
+      int prefixPos = id.indexOf("ECHO:");
+      int from = (prefixPos == -1) ? 0 : prefixPos + 5;  // skip the "ECHO:" prefix if there is one
+      int to = id.lastIndexOf(".");
+      if (to == -1) {
+        to = id.length();
+      }
+      shortId = id.substring(from, to);
+    }
+    String echoId = (shortId == null || shortId.isEmpty()) ? null : "/permanent/library/" + shortId;
+    if (echoDir != null && ! echoDir.isEmpty()) {
+      echoId = echoDir;  // an explicit echodir wins over the derived id
+    }
+    mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
+  }
+  String pbCount = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
+  int pages = Integer.valueOf(pbCount);
+  mdRecord.setPageCount(pages);
+  mdRecord.setSchemaName("echo");
+  return mdRecord;
+}
+/**
+ * Fills the record with the metadata of a TEI document (its "teiHeader" element).
+ * <p>
+ * The identifier is cleaned with deleteSpecialChars() and also used as echo directory.
+ * A five-character language such as "de-DE" is lower-cased and its first two characters
+ * are mapped through Language.getISO639Code(). Rights default to "open access" and
+ * access rights to "free" when the query delivers null; the license is a fixed value.
+ * Page count (number of "pb" elements) and schema name are set even if the document has
+ * no "teiHeader" element.
+ *
+ * @param xQueryEvaluator evaluator used for all queries
+ * @param srcUrl          URL of the document
+ * @param mdRecord        the record to fill
+ * @return the record as returned by getEchoMetadata(), or the given record if the
+ *         document has no "teiHeader" element
+ * @throws ApplicationException if a query fails
+ */
+private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+  String headerXml = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader");
+  if (headerXml != null) {
+    String idno = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno");
+    if (idno != null) {
+      idno = deleteSpecialChars(StringUtils.deresolveXmlEntities(idno));
+    }
+    String author = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author");
+    author = (author == null) ? null : StringUtils.deresolveXmlEntities(author);
+    String docTitle = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title");
+    docTitle = (docTitle == null) ? null : StringUtils.deresolveXmlEntities(docTitle);
+    String lang = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)");
+    if (lang != null && lang.isEmpty()) {
+      lang = null;
+    }
+    if (lang != null) {
+      lang = lang.toLowerCase();
+      // e.g. "de-DE or en-US": keep only the language part and map it
+      if (lang.length() == 5 && lang.charAt(2) == '-') {
+        lang = Language.getInstance().getISO639Code(lang.substring(0, 2));
+      }
+    }
+    String pubPlace = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace");
+    pubPlace = (pubPlace == null) ? null : StringUtils.deresolveXmlEntities(pubPlace);
+    String year = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date");
+    Date pubDate = null;
+    if (year != null && ! year.isEmpty()) {
+      year = StringUtils.deresolveXmlEntities(year);
+      year = new Util().toYearStr(year);  // test if possible etc
+      if (year != null) {
+        try {
+          pubDate = new Util().toDate(year + "-01-01T00:00:00.000Z");
+        } catch (Exception e) {
+          // the date stays null if the year cannot be converted
+        }
+      }
+    }
+    String keywords = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)");
+    keywords = (keywords == null) ? null : StringUtils.deresolveXmlEntities(keywords);
+    String docRights = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability");
+    docRights = StringUtils.deresolveXmlEntities((docRights == null) ? "open access" : docRights);
+    String access = xQueryEvaluator.evaluateAsStringValueJoined(headerXml, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)");
+    access = StringUtils.deresolveXmlEntities((access == null) ? "free" : access);
+    mdRecord.setIdentifier(idno);
+    mdRecord.setLanguage(lang);
+    mdRecord.setCreator(author);
+    mdRecord.setTitle(docTitle);
+    mdRecord.setPublisher(pubPlace);
+    mdRecord.setRights(docRights);
+    mdRecord.setDate(pubDate);
+    mdRecord.setSubject(keywords);
+    mdRecord.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
+    mdRecord.setAccessRights(access);
+    // get echo metadata
+    mdRecord = getEchoMetadata(xQueryEvaluator, idno, mdRecord);  // identifier is echoDir
+  }
+  String pbCount = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
+  int pages = Integer.valueOf(pbCount);
+  mdRecord.setPageCount(pages);
+  mdRecord.setSchemaName("TEI");
+  return mdRecord;
+}
+/**
+ * Fills the record with the Dublin Core "meta" entries of an html document's head.
+ * <p>
+ * The identifier is also used as echo directory. An empty language is stored as null.
+ * Page count (number of "pb" elements) and schema name are set even if the document has
+ * no head.
+ * <p>
+ * NOTE(review): the queries below address "/meta[...]" although the queried string is
+ * the result of "/html/head"; the other extractors address the extracted root element
+ * by name (e.g. "/info/...", "/*:teiHeader/..."). Verify that these paths really match.
+ *
+ * @param xQueryEvaluator evaluator used for all queries
+ * @param srcUrl          URL of the document
+ * @param mdRecord        the record to fill
+ * @return the record as returned by getEchoMetadata(), or the given record if the
+ *         document has no head
+ * @throws ApplicationException if a query fails
+ */
+private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+  String headXml = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head");
+  if (headXml != null) {
+    String id = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.identifier']/@content)");
+    if (id != null && ! id.isEmpty()) {
+      id = StringUtils.deresolveXmlEntities(id);
+    }
+    String author = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.creator']/@content)");
+    if (author != null && ! author.isEmpty()) {
+      author = StringUtils.deresolveXmlEntities(author);
+    }
+    String docTitle = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.title']/@content)");
+    if (docTitle != null && ! docTitle.isEmpty()) {
+      docTitle = StringUtils.deresolveXmlEntities(docTitle);
+    }
+    String lang = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.language']/@content)");
+    if (lang != null) {
+      // an empty language counts as "not given"
+      lang = lang.isEmpty() ? null : StringUtils.deresolveXmlEntities(lang);
+    }
+    String docPublisher = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.publisher']/@content)");
+    docPublisher = (docPublisher == null) ? null : StringUtils.deresolveXmlEntities(docPublisher);
+    String year = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.date']/@content)");
+    Date pubDate = null;
+    if (year != null && ! year.isEmpty()) {
+      year = StringUtils.deresolveXmlEntities(year);
+      year = new Util().toYearStr(year);  // test if possible etc
+      if (year != null) {
+        try {
+          pubDate = new Util().toDate(year + "-01-01T00:00:00.000Z");
+        } catch (Exception e) {
+          // the date stays null if the year cannot be converted
+        }
+      }
+    }
+    String keywords = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.subject']/@content)");
+    keywords = (keywords == null) ? null : StringUtils.deresolveXmlEntities(keywords);
+    String docRights = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.rights']/@content)");
+    if (docRights != null && ! docRights.isEmpty()) {
+      docRights = StringUtils.deresolveXmlEntities(docRights);
+    }
+    String docLicense = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.license']/@content)");
+    if (docLicense != null && ! docLicense.isEmpty()) {
+      docLicense = StringUtils.deresolveXmlEntities(docLicense);
+    }
+    String access = xQueryEvaluator.evaluateAsStringValueJoined(headXml, "string(/meta[@name = 'DC.accessRights']/@content)");
+    if (access != null && ! access.isEmpty()) {
+      access = StringUtils.deresolveXmlEntities(access);
+    }
+    mdRecord.setIdentifier(id);
+    mdRecord.setLanguage(lang);
+    mdRecord.setCreator(author);
+    mdRecord.setTitle(docTitle);
+    mdRecord.setPublisher(docPublisher);
+    mdRecord.setRights(docRights);
+    mdRecord.setDate(pubDate);
+    mdRecord.setSubject(keywords);
+    mdRecord.setLicense(docLicense);
+    mdRecord.setAccessRights(access);
+    // get echo metadata
+    mdRecord = getEchoMetadata(xQueryEvaluator, id, mdRecord);  // identifier is echoDir
+  }
+  String pbCount = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
+  int pages = Integer.valueOf(pbCount);
+  mdRecord.setPageCount(pages);
+  mdRecord.setSchemaName("html");
+  return mdRecord;
+}
+/**
+ * Adds the echo related fields (echo id, page image directory, figures directory and
+ * mpiwg document id) to the record, reading them from the "index.meta" of the echo
+ * directory that is fetched with a GET request.
+ * <p>
+ * If no echo directory is given it is looked up by document id with getEchoDir(); if
+ * that delivers nothing either, the record is returned unchanged. The record is also
+ * returned unchanged if the request reports a URL error. Page image and figures
+ * directory default to "pageimg" and "figures" below the echo directory.
+ *
+ * @param xQueryEvaluator evaluator used for all queries
+ * @param echoDir         echo directory of the document, may be null or empty
+ * @param mdRecord        the record to enrich
+ * @return the given record, never null
+ * @throws ApplicationException if the request for the index.meta times out or a query fails
+ */
+private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException {
+  if (echoDir == null || echoDir.isEmpty()) {
+    String docId = mdRecord.getDocId();
+    echoDir = getEchoDir(xQueryEvaluator, docId);
+    if (echoDir == null)
+      return mdRecord;
+  }
+  String urLTexter = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + echoDir + "/index.meta";
+  String echoIndexMetaStr = performGetRequest(urLTexter);
+  String echoPageImageDir = null;
+  String echoFiguresDir = null;
+  String mpiwgDocId = null;
+  if (echoIndexMetaStr != null) {
+    // performGetRequest() reports failures through these marker strings
+    if (echoIndexMetaStr.equals("XXXXTimeoutXXXX")) {
+      // Returning null here made every caller fail with a NullPointerException on its next
+      // access to the record; fail with a message that names the real cause instead.
+      throw new ApplicationException("Timeout while fetching echo metadata from: " + urLTexter);
+    }
+    if (echoIndexMetaStr.equals("XXXXUrlErrorXXXX"))
+      return mdRecord;
+    echoPageImageDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/image");
+    if (echoPageImageDir != null)
+      echoPageImageDir = echoDir + "/" + echoPageImageDir;
+    else
+      echoPageImageDir = echoDir + "/" + "pageimg"; // default
+    echoFiguresDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/figures");
+    if (echoFiguresDir != null)
+      echoFiguresDir = echoDir + "/" + echoFiguresDir;
+    else
+      echoFiguresDir = echoDir + "/" + "figures"; // default
+    mpiwgDocId = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/dri[@type = 'mpiwg']");
+  }
+  mdRecord.setEchoId(echoDir);
+  mdRecord.setEchoPageImageDir(echoPageImageDir);
+  mdRecord.setEchoFiguresDir(echoFiguresDir);
+  mdRecord.setMpiwgDocId(mpiwgDocId);
+  return mdRecord;
+}
+private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException {
+String echoDir = null;
+String urLTextUrlPath = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short";
+String resultXmlStr = performGetRequest(urLTextUrlPath);
+if (resultXmlStr != null) {
+if (resultXmlStr.equals("XXXXTimeoutXXXX"))
+return null;
+else if (resultXmlStr.equals("XXXXUrlErrorXXXX"))
+return null;
+String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(resultXmlStr, "//archive-path");
+if (archivePath != null) {
+archivePath = archivePath.replaceAll("/mpiwg/online", "");
+if (archivePath.isEmpty())
+echoDir = null;
+else
+echoDir = archivePath;
+}
+}
+return echoDir;
+}
+/**
+ * Determines the type of a node as the local name of its element child.
+ * If the node has several element children, the name of the last one is returned.
+ *
+ * @param node the node whose children are inspected
+ * @return local name of the (last) element child, or null if there is none
+ */
+private String getNodeType(XdmNode node) {
+  String localName = null;
+  XdmSequenceIterator children = node.axisIterator(Axis.CHILD);
+  if (children == null) {
+    return localName;
+  }
+  while (children.hasNext()) {
+    XdmNode child = (XdmNode) children.next();
+    if (child == null) {
+      continue;
+    }
+    // only elements count: comments, processing instructions etc. are skipped
+    if (child.getNodeKind() == XdmNodeKind.ELEMENT) {
+      localName = child.getNodeName().getLocalName();
+    }
+  }
+  return localName;
+}
+/**
+ * Builds the full name of the document file inside its document directory.
+ *
+ * @param docId document id, e.g. /tei/de/Test_1789.xml
+ * @return document directory + "/" + file name of the document
+ */
+public String getDocFullFileName(String docId) {
+  final String directory = getDocDir(docId);
+  final String fileName = getDocFileName(docId);
+  return directory + "/" + fileName;
+}
+/**
+ * Builds the full name of a file which lives in the document directory of the given document.
+ * <ul>
+ * <li>type "toc": document directory + "/toc.xml"</li>
+ * <li>type null: document directory + "/" + file name without extension + ".xml"</li>
+ * <li>any other type: document directory + "/" + file name without extension + "." + type</li>
+ * </ul>
+ * A docId without extension is supported (as in getDocDir): its whole file name is
+ * used as the name without extension.
+ * @param docId document identifier
+ * @param type file type (used as file extension), "toc" or null
+ * @return the full file name
+ */
+public String getFullFileName(String docId, String type) {
+  String docDir = getDocDir(docId);
+  if ("toc".equals(type)) {
+    return docDir + "/toc.xml";
+  }
+  String docFileName = getDocFileName(docId);
+  int lastDot = docFileName.lastIndexOf('.');
+  // without this check substring(0, -1) would throw a StringIndexOutOfBoundsException
+  String docFileNameWithoutExtension = (lastDot == -1) ? docFileName : docFileName.substring(0, lastDot);
+  String extension = (type == null) ? "xml" : type;
+  return docDir + "/" + docFileNameWithoutExtension + "." + extension;
+}
+/**
+ * Delivers the directory of a document: the documents directory of the Constants
+ * instance followed by the docId without everything from its last dot on.
+ * A leading slash is inserted between both parts if the docId does not start with one.
+ * @param docId document identifier
+ * @return the document directory
+ */
+public String getDocDir(String docId) {
+  String baseDir = Constants.getInstance().getDocumentsDir();
+  int lastDot = docId.lastIndexOf(".");
+  String relativeDir = (lastDot == -1) ? docId : docId.substring(0, lastDot);
+  if (! relativeDir.startsWith("/")) {
+    relativeDir = "/" + relativeDir;
+  }
+  return baseDir + relativeDir;
+}
+/**
+ * Delivers the file name part of a docId: everything behind its last slash.
+ * @param docId document identifier
+ * @return the part behind the last slash or the docId itself if it contains no slash
+ */
+public String getDocFileName(String docId) {
+  int lastSlash = docId.lastIndexOf("/");
+  if (lastSlash == -1) {
+    return docId;
+  }
+  return docId.substring(lastSlash + 1);
+}
+/**
+ * Delivers the path segment directly in front of the file name of a docId,
+ * e.g. "la" for "/echo/la/Cataneo_1600.xml".
+ * @param docId document identifier
+ * @return the text between the last two slashes of the docId; null if the docId
+ *         contains less than two slashes
+ */
+private String getMainLanguage(String docId) {
+  int fileSlash = docId.lastIndexOf("/");
+  if (fileSlash == -1) {
+    return null;
+  }
+  String dirPath = docId.substring(0, fileSlash);
+  int langSlash = dirPath.lastIndexOf("/");
+  if (langSlash == -1) {
+    return null;
+  }
+  // dirPath ends where the file name begins, so the rest of it is the wanted segment
+  return dirPath.substring(langSlash + 1);
+}
+/**
+ * Removes every '@', blank and ';' from the given string; all other characters
+ * are kept in their order.
+ * @param inputStr string to clean
+ * @return the input string without the characters '@', ' ' and ';'
+ */
+private String deleteSpecialChars(String inputStr) {
+  StringBuilder kept = new StringBuilder();
+  for (char c : inputStr.toCharArray()) {
+    boolean isSpecial = c == '@' || c == ' ' || c == ';';
+    if (! isSpecial) {
+      kept.append(c);
+    }
+  }
+  return kept.toString();
+}
+/**
+ * Reads the given file as utf-8 and parses it with a SAX parser which feeds a
+ * GetFragmentsContentHandler created for the given milestone element name.
+ * @param fileName name of the XML file to parse
+ * @param milestoneElementName element name handed over to the GetFragmentsContentHandler
+ * @return the result pages collected by the content handler
+ *         (presumably fragment number to fragment content - verify against GetFragmentsContentHandler)
+ * @throws ApplicationException if the file could not be read or parsed
+ */
+private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException {
+  try {
+    GetFragmentsContentHandler fragmentsHandler = new GetFragmentsContentHandler(milestoneElementName);
+    XMLReader saxReader = new SAXParser();
+    saxReader.setContentHandler(fragmentsHandler);
+    String fileContent = FileUtils.readFileToString(new File(fileName), "utf-8");
+    saxReader.parse(new InputSource(new StringReader(fileContent)));
+    return fragmentsHandler.getResultPages();
+  } catch (SAXException e) {
+    throw new ApplicationException(e);
+  } catch (IOException e) {
+    throw new ApplicationException(e);
+  }
+}
+/**
+ * Tokenizes the given XML string with an XmlTokenizer which is configured with the
+ * given language, the output option "withLemmas" and the non word breaking
+ * elements lb, br and cb.
+ * @param xmlStr XML string to tokenize
+ * @param language language handed over to the tokenizer
+ * @return the XML result of the tokenizer
+ * @throws ApplicationException
+ */
+private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
+  XmlTokenizer tokenizer = new XmlTokenizer(new StringReader(xmlStr));
+  tokenizer.setLanguage(language);
+  // non word breaking elements;
+  // TODO examine bugs with emph, figure, hi :
+  // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..."
+  // e.g. page 30 in /echo/la/Cataneo_1600.xml
+  tokenizer.setNWBElements(new String[] {"lb", "br", "cb"});
+  // so all tokens are fetched with lemmas (costs performance)
+  tokenizer.setOutputOptions(new String[] {"withLemmas"});
+  tokenizer.tokenize();
+  return tokenizer.getXmlResult();
+}
+/**
+ * Parses the given XML string with a SAX parser which feeds a WordContentHandler
+ * and delivers the result built by that handler.
+ * NOTE(review): judging from the method name the handler adds orig/reg/norm
+ * information to the words - confirm against WordContentHandler.
+ * @param xmlStr XML string to parse
+ * @return the result of the WordContentHandler
+ * @throws ApplicationException if the string could not be parsed
+ */
+private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
+  try {
+    WordContentHandler wordHandler = new WordContentHandler();
+    XMLReader saxReader = new SAXParser();
+    saxReader.setContentHandler(wordHandler);
+    saxReader.parse(new InputSource(new StringReader(xmlStr)));
+    return wordHandler.getResult();
+  } catch (SAXException e) {
+    throw new ApplicationException(e);
+  } catch (IOException e) {
+    throw new ApplicationException(e);
+  }
+}
+/**
+ * Performs a HTTP GET request and delivers the response body as string (decoded as utf-8).
+ * Before the request is performed the url is probed by checkUri with a timeout of 2 seconds.
+ * The connection is released in every case (also on an error status or an exception).
+ * @param url url to request
+ * @return the response body; "XXXXTimeoutXXXX" if the probe by checkUri failed;
+ *         "XXXXUrlErrorXXXX" if the status code is 400 or higher;
+ *         null if the response has no body
+ * @throws ApplicationException if the request fails
+ */
+private String performGetRequest(String url) throws ApplicationException {
+  boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds
+  if (! urlIsOk) {
+    return "XXXXTimeoutXXXX";
+  }
+  GetMethod method = new GetMethod(url);
+  try {
+    HttpClient httpClient = new HttpClient();
+    httpClient.executeMethod(method);
+    int statusCode = method.getStatusCode();
+    if (statusCode >= 400) {
+      return "XXXXUrlErrorXXXX";
+    }
+    byte[] resultBytes = method.getResponseBody();
+    if (resultBytes == null) {
+      return null;  // no response body available
+    }
+    return new String(resultBytes, "utf-8");
+  } catch (HttpException e) {
+    throw new ApplicationException(e);
+  } catch (IOException e) {
+    throw new ApplicationException(e);
+  } finally {
+    // always give the connection back, also on the early return and on exceptions
+    method.releaseConnection();
+  }
+}
+/**
+ * Checks if the given uri answers to a HTTP GET request within the given timeout.
+ * The status code of the response is not inspected: every response counts as answer.
+ * @param uriStr uri to check
+ * @param timeoutMilliseconds timeout in milliseconds, used both for establishing the
+ *        connection and for waiting for data; zero means that no timeout is used
+ * @return true if the request was answered; false if an IOException
+ *         (e.g. a timeout) occurred
+ * @throws ApplicationException if uriStr is not a valid uri
+ */
+private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException {
+  DefaultHttpClient httpClient = null;
+  try {
+    URI uri = new URI(uriStr);
+    HttpGet httpGet = new HttpGet(uri);
+    HttpParams httpParameters = new BasicHttpParams();
+    // Set the timeout in milliseconds until a connection is established.
+    HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutMilliseconds);
+    // Set the default socket timeout (SO_TIMEOUT)
+    // in milliseconds which is the timeout for waiting for data.
+    HttpConnectionParams.setSoTimeout(httpParameters, timeoutMilliseconds);
+    httpClient = new DefaultHttpClient(httpParameters);
+    httpClient.execute(httpGet);
+    return true;
+  } catch (IOException e) {
+    return false;  // if timeout exception is thrown
+  } catch (URISyntaxException e) {
+    throw new ApplicationException(e);
+  } finally {
+    // the response is not read, so the connection is closed explicitly
+    if (httpClient != null) {
+      httpClient.getConnectionManager().shutdown();
+    }
+  }
+}
+/**
+ * Write string (encoded as utf-8) into destFile. If directory for that destFile does
+ * not exist it creates this directory including parent directories. A destFileName
+ * without directory part is written as it is (no directory is created).
+ * If str is null nothing is done.
+ * @param str string to write
+ * @param destFileName destination file name
+ * @throws ApplicationException if the file could not be opened or written
+ */
+private void saveFile(String str, String destFileName) throws ApplicationException {
+  if (str == null) {
+    return;  // do nothing
+  }
+  OutputStreamWriter out = null;
+  try {
+    File destFile = new File(destFileName);
+    // getParentFile delivers null if destFileName has no directory part
+    File destDir = destFile.getParentFile();
+    if (destDir != null && ! destDir.exists()) {
+      destDir.mkdirs();  // create the directory including parent directories which do not exist
+    }
+    out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8");
+    out.write(str);
+    out.flush();
+  } catch (FileNotFoundException e) {
+    throw new ApplicationException(e);
+  } catch (IOException e) {
+    throw new ApplicationException(e);
+  } finally {
+    try {
+      if (out != null) {
+        out.close();
+      }
+    } catch (IOException ignored) {
+      // nothing: the content is already flushed; always close the stream at the end of the method
+    }
+  }
+}
+/**
+ * Stores the current time (milliseconds since 1970-01-01 UTC) in beginOfOperation.
+ */
+private void beginOperation() {
+  beginOfOperation = System.currentTimeMillis();
+}
+/**
+ * Stores the current time (milliseconds since 1970-01-01 UTC) in endOfOperation.
+ */
+private void endOperation() {
+  endOfOperation = System.currentTimeMillis();
+}
+}

Mercurial > hg > mpdl-group

comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip