Mercurial > hg > mpdl-group
view software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.cms.document; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.StringReader; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Arrays; import java.util.Date; import java.util.Hashtable; import java.util.Iterator; import java.util.List; import java.util.logging.Logger; import net.sf.saxon.s9api.Axis; import net.sf.saxon.s9api.QName; import net.sf.saxon.s9api.XdmNode; import net.sf.saxon.s9api.XdmNodeKind; import net.sf.saxon.s9api.XdmSequenceIterator; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.io.FileUtils; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants; import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler; import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; /** * Handler for documents (singleton). */ public class DocumentHandler { private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName()); private static List<String> EXCLUDED_PROJECT_DOCS = Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transfomer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16) "/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml", "/echo/zh/Min_chan_luyi_3_2FP9M172.xml", "/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml", "/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml", "/echo/zh/Xifa_shenji.xml", "/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml", "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml"); private long beginOfOperation; private long endOfOperation; public void doOperation(CmsDocOperation docOperation) throws ApplicationException { String operationName = docOperation.getName(); if (operationName.equals("create")) { create(docOperation); } else if (operationName.equals("delete")) { delete(docOperation); } else if (operationName.equals("importDirectory")) { importDirectory(docOperation); } else if (operationName.equals("createPdf")) { createPdf(docOperation); } else if (operationName.equals("createAllPdfInDirectory")) { createAllPdfInDirectory(docOperation); } } private void importDirectory(CmsDocOperation docOperation) throws ApplicationException { try { LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)"); beginOperation(); String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory String collectionNames = docOperation.getCollectionNames(); // e.g. "echo" File localDocumentsDir = new File(new URI(localDocumentsUrlStr)); boolean docDirExists = localDocumentsDir.exists(); if (! docDirExists) throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again."); String[] fileExtensions = {"xml"}; Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true); int i = 0; while(iterFiles.hasNext()) { i++; File xmlFile = iterFiles.next(); String xmlFileStr = xmlFile.getPath(); int relativePos = (int) localDocumentsDir.getPath().length(); String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml String xmlFileUrlStr = xmlFile.toURI().toURL().toString(); CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId); createDocOperation.setCollectionNames(collectionNames); try { doOperation(createDocOperation); Date now = new Date(); LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + now.toString() + ")"); } catch (Exception e) { LOGGER.info("Document " + i + ": " + docId + " has problems:"); e.printStackTrace(); } } endOperation(); LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" ); } catch (Exception e) { throw new ApplicationException(e); } } private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException { try { LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time."); beginOperation(); String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory String collectionNames = docOperation.getCollectionNames(); // e.g. "echo" File localDocumentsDir = new File(new URI(localDocumentsUrlStr)); boolean docDirExists = localDocumentsDir.exists(); if (! docDirExists) throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again."); String[] fileExtensions = {"xml"}; Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true); int i = 0; while(iterFiles.hasNext()) { i++; File xmlFile = iterFiles.next(); String xmlFileStr = xmlFile.getPath(); int relativePos = (int) localDocumentsDir.getPath().length(); String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId); createPdfOperation.setCollectionNames(collectionNames); try { doOperation(createPdfOperation); Date now = new Date(); LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + now.toString() + ")"); } catch (Exception e) { LOGGER.info("Pdf document " + i + ": " + docId + " has problems:"); e.printStackTrace(); } } endOperation(); LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" ); } catch (Exception e) { throw new ApplicationException(e); } } private boolean isProjectDoc(String docId) { boolean isProjectDoc = true; if (EXCLUDED_PROJECT_DOCS.contains(docId)) return false; return isProjectDoc; } private void create(CmsDocOperation docOperation) throws ApplicationException { try { String operationName = docOperation.getName(); String srcUrlStr = docOperation.getSrcUrl(); String docId = docOperation.getDocIdentifier(); if (! isProjectDoc(docId)) { LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc"); return; } String mainLanguage = docOperation.getMainLanguage(); if (mainLanguage == null) { mainLanguage = getMainLanguage(docId); } String[] elementNames = docOperation.getElementNames(); if (elementNames == null) { String[] defaultElementNames = {"s", "head", "caption", "variables", "description"}; docOperation.setElementNames(defaultElementNames); // default } String docDirName = getDocDir(docId); String docDestFileName = getDocFullFileName(docId); URL srcUrl = null; String protocol = null; if (srcUrlStr != null && ! srcUrlStr.equals("empty")) { srcUrl = new URL(srcUrlStr); protocol = srcUrl.getProtocol(); } File docDestFile = new File(docDestFileName); // parse validation on file XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown String docType = getNodeType(docNode); // archimedes, echo, TEI, html ... docType = docType.trim(); if (docType == null) { docOperation.setErrorMessage("file type of: " + srcUrlStr + "is not supported"); return; } // perform operation on file system if (protocol.equals("file")) { docOperation.setStatus("upload file: " + srcUrlStr + " to CMS"); } else { docOperation.setStatus("download file from: " + srcUrlStr + " to CMS"); } FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000); // replace anchor in echo documents and also add the number attribute to figures String docDestFileNameUpgrade = docDestFileName + ".upgrade"; File docDestFileUpgrade = new File(docDestFileNameUpgrade); XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl"); String docDestFileUrlStr = docDestFile.getPath(); String result = replaceAnchorTransformer.transform(docDestFileUrlStr); FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8"); MetadataRecord mdRecord = new MetadataRecord(); mdRecord.setDocId(docId); mdRecord.setCollectionNames(docOperation.getCollectionNames()); mdRecord.setType("text/xml"); // generate toc file (toc, figure, handwritten) XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl"); File tocFile = new File(docDirName + "/toc.xml"); String tocResult = tocTransformer.transform(docDestFileNameUpgrade); FileUtils.writeStringToFile(tocFile, tocResult, "utf-8"); // Get metadata info of the xml document docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS"); XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator(); mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2); String mdRecordLanguage = mdRecord.getLanguage(); if (mdRecordLanguage == null && mainLanguage != null) mdRecord.setLanguage(mainLanguage); // save all pages as single xml files (untokenized and tokenized) docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS"); File docDir = new File(docDirName + "/pages"); FileUtils.deleteQuietly(docDir); // first delete pages directory Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb"); int pageCount = pageFragments.size(); if (pageCount == 0) { // no pb element is found: then the whole document is the first page String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8"); docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", ""); // remove the xml declaration if it exists pageFragments = new Hashtable<Integer, StringBuilder>(); pageFragments.put(new Integer(1), new StringBuilder(docXmlStr)); pageCount = 1; } PageTransformer pageTransformer = new PageTransformer(); for (int page=1; page<=pageCount; page++) { String fragment = pageFragments.get(new Integer(page)).toString(); fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment; String docPageFileName = docDirName + "/pages/page-" + page + ".xml"; File docPageFile = new File(docPageFileName); FileUtils.writeStringToFile(docPageFile, fragment, "utf-8"); String language = mdRecord.getLanguage(); String tokenizedXmlStr = tokenizeWithLemmas(fragment, language); // xml fragment enriched with <w> elements tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr; tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr); // xml string: enrich <w> elements with normalization info (orig, reg, norm) String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml"; File docPageTokenizedFile = new File(docPageTokenizedFileName); FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8"); String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html"; File docPageHtmlFile = new File(docPageHtmlFileName); String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html"); FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8"); } // perform operation on Lucene docOperation.setStatus(operationName + " document: " + docId + " in CMS"); docOperation.setMdRecord(mdRecord); IndexHandler indexHandler = IndexHandler.getInstance(); indexHandler.indexDocument(docOperation); } catch (IOException e) { throw new ApplicationException(e); } } private void delete(CmsDocOperation docOperation) throws ApplicationException { String operationName = docOperation.getName(); String docIdentifier = docOperation.getDocIdentifier(); if (docIdentifier == null || docIdentifier.trim().equals("")) throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document."); String docDirStr = getDocDir(docIdentifier); File docDir = new File(docDirStr); boolean docExists = docDir.exists(); if (! docExists) { throw new ApplicationException("Document:" + docIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again."); } // perform operation on file system docOperation.setStatus(operationName + " document: " + docIdentifier + " in CMS"); FileUtils.deleteQuietly(docDir); // perform operation on Lucene IndexHandler indexHandler = IndexHandler.getInstance(); indexHandler.deleteDocument(docOperation); } private void createPdf(CmsDocOperation docOperation) throws ApplicationException { String docId = docOperation.getDocIdentifier(); String operationName = docOperation.getName(); if (docId == null || docId.trim().equals("")) throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document."); if (! isProjectDoc(docId)) { LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc"); return; } IndexHandler indexHandler = IndexHandler.getInstance(); MetadataRecord mdRecord = indexHandler.getDocMetadata(docId); docOperation.setStatus("create PDF and HTML versions of the document: " + docId); PdfHandler pdfHandler = PdfHandler.getInstance(); pdfHandler.createFile(true, true, mdRecord); // generate Pdf + Html document } private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException { if (schemaName == null) return mdRecord; try { URL srcUrl = xmlFile.toURI().toURL(); if (schemaName.equals("archimedes")) mdRecord = getMetadataRecordArch(xQueryEvaluator, srcUrl, mdRecord); else if (schemaName.equals("echo")) mdRecord = getMetadataRecordEcho(xQueryEvaluator, srcUrl, mdRecord); else if (schemaName.equals("TEI")) mdRecord = getMetadataRecordTei(xQueryEvaluator, srcUrl, mdRecord); else if (schemaName.equals("html")) mdRecord = getMetadataRecordHtml(xQueryEvaluator, srcUrl, mdRecord); else mdRecord.setSchemaName("diverse"); // all other cases: set docType to schemaName } catch (MalformedURLException e) { throw new ApplicationException(e); } mdRecord.setLastModified(new Date()); return mdRecord; } private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info"); if (metadataXmlStr != null) { String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/locator"); if (identifier != null) identifier = StringUtils.deresolveXmlEntities(identifier); String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/author"); if (creator != null) creator = StringUtils.deresolveXmlEntities(creator); String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/title"); if (title != null) title = StringUtils.deresolveXmlEntities(title); String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/lang[1]"); if (language != null) language = StringUtils.deresolveXmlEntities(language); String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/place"); if (place != null) place = StringUtils.deresolveXmlEntities(place); String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/date"); Date date = null; if (yearStr != null && ! yearStr.equals("")) { yearStr = StringUtils.deresolveXmlEntities(yearStr); yearStr = new Util().toYearStr(yearStr); // test if possible etc if (yearStr != null) { try { date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); } catch (Exception e) { // nothing } } } String rights = "open access"; String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; String accessRights = "free"; mdRecord.setIdentifier(identifier); mdRecord.setLanguage(language); mdRecord.setCreator(creator); mdRecord.setTitle(title); mdRecord.setPublisher(place); mdRecord.setRights(rights); mdRecord.setDate(date); mdRecord.setLicense(license); mdRecord.setAccessRights(accessRights); // get echo metadata String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/echodir"); String docId = mdRecord.getDocId(); String echoIdTmp = docId; if (docId != null && ! docId.isEmpty()) { int start = docId.lastIndexOf("/"); if (start != -1) start = start + 1; else start = 0; int end = docId.lastIndexOf("."); if (end == -1) end = docId.length(); echoIdTmp = docId.substring(start, end); } String echoId = "/permanent/archimedes/" + echoIdTmp; if (echoIdTmp == null || echoIdTmp.isEmpty()) echoId = null; if (echoDir != null && ! echoDir.isEmpty()) { echoId = echoDir; } mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord); } String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)"); int pageCount = Integer.valueOf(pageCountStr); mdRecord.setPageCount(pageCount); mdRecord.setSchemaName("archimedes"); return mdRecord; } private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata"); if (metadataXmlStr != null) { String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:identifier"); if (identifier != null) { identifier = StringUtils.deresolveXmlEntities(identifier); } String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:creator"); if (creator != null) creator = StringUtils.deresolveXmlEntities(creator); String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:title"); if (title != null) title = StringUtils.deresolveXmlEntities(title); String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:language[1]"); if (language != null) language = StringUtils.deresolveXmlEntities(language); String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:date"); Date date = null; if (yearStr != null && ! yearStr.equals("")) { yearStr = StringUtils.deresolveXmlEntities(yearStr); yearStr = new Util().toYearStr(yearStr); // test if possible etc if (yearStr != null) { try { date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); } catch (Exception e) { // nothing } } } String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:rights"); if (rights != null) rights = StringUtils.deresolveXmlEntities(rights); String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:license"); if (license != null) license = StringUtils.deresolveXmlEntities(license); String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:accessRights"); if (accessRights != null) accessRights = StringUtils.deresolveXmlEntities(accessRights); mdRecord.setIdentifier(identifier); mdRecord.setLanguage(language); mdRecord.setCreator(creator); mdRecord.setTitle(title); mdRecord.setRights(rights); mdRecord.setDate(date); mdRecord.setLicense(license); mdRecord.setAccessRights(accessRights); // get echo metadata String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:echodir"); String echoIdTmp = identifier; if (identifier != null && ! identifier.isEmpty()) { int start = identifier.indexOf("ECHO:"); if (start != -1) start = start + 5; else start = 0; int end = identifier.lastIndexOf("."); if (end == -1) end = identifier.length(); echoIdTmp = identifier.substring(start, end); } String echoId = "/permanent/library/" + echoIdTmp; if (echoIdTmp == null || echoIdTmp.isEmpty()) echoId = null; if (echoDir != null && ! echoDir.isEmpty()) { echoId = echoDir; } mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord); } String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)"); int pageCount = Integer.valueOf(pageCountStr); mdRecord.setPageCount(pageCount); mdRecord.setSchemaName("echo"); return mdRecord; } private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader"); if (metadataXmlStr != null) { String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno"); if (identifier != null) { identifier = StringUtils.deresolveXmlEntities(identifier); identifier = deleteSpecialChars(identifier); } String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author"); if (creator != null) creator = StringUtils.deresolveXmlEntities(creator); String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title"); if (title != null) title = StringUtils.deresolveXmlEntities(title); String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)"); if (language != null && language.isEmpty()) language = null; if (language != null) { language = language.toLowerCase(); if (language.length() == 5) { // e.g. "de-DE or en-US" if (language.substring(2, 3).equals("-")) { String lang = language.substring(0, 2); language = Language.getInstance().getISO639Code(lang); } } } String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace"); if (place != null) place = StringUtils.deresolveXmlEntities(place); String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date"); Date date = null; if (yearStr != null && ! yearStr.equals("")) { yearStr = StringUtils.deresolveXmlEntities(yearStr); yearStr = new Util().toYearStr(yearStr); // test if possible etc if (yearStr != null) { try { date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); } catch (Exception e) { // nothing } } } String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)"); if (subject != null) subject = StringUtils.deresolveXmlEntities(subject); String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability"); if (rights == null) rights = "open access"; rights = StringUtils.deresolveXmlEntities(rights); String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)"); if (accessRights == null) accessRights = "free"; accessRights = StringUtils.deresolveXmlEntities(accessRights); mdRecord.setIdentifier(identifier); mdRecord.setLanguage(language); mdRecord.setCreator(creator); mdRecord.setTitle(title); mdRecord.setPublisher(place); mdRecord.setRights(rights); mdRecord.setDate(date); mdRecord.setSubject(subject); mdRecord.setLicense(license); mdRecord.setAccessRights(accessRights); // get echo metadata mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir } String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)"); int pageCount = Integer.valueOf(pageCountStr); mdRecord.setPageCount(pageCount); mdRecord.setSchemaName("TEI"); return mdRecord; } private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head"); if (metadataXmlStr != null) { String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)"); if (identifier != null && ! identifier.isEmpty()) identifier = StringUtils.deresolveXmlEntities(identifier); String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)"); if (creator != null && ! creator.isEmpty()) creator = StringUtils.deresolveXmlEntities(creator); String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)"); if (title != null && ! title.isEmpty()) title = StringUtils.deresolveXmlEntities(title); String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)"); if (language != null && language.isEmpty()) language = null; if (language != null && ! language.isEmpty()) language = StringUtils.deresolveXmlEntities(language); String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)"); if (publisher != null) publisher = StringUtils.deresolveXmlEntities(publisher); String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)"); Date date = null; if (yearStr != null && ! yearStr.equals("")) { yearStr = StringUtils.deresolveXmlEntities(yearStr); yearStr = new Util().toYearStr(yearStr); // test if possible etc if (yearStr != null) { try { date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); } catch (Exception e) { // nothing } } } String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)"); if (subject != null) subject = StringUtils.deresolveXmlEntities(subject); String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)"); if (rights != null && ! rights.isEmpty()) rights = StringUtils.deresolveXmlEntities(rights); String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)"); if (license != null && ! license.isEmpty()) license = StringUtils.deresolveXmlEntities(license); String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)"); if (accessRights != null && ! accessRights.isEmpty()) accessRights = StringUtils.deresolveXmlEntities(accessRights); mdRecord.setIdentifier(identifier); mdRecord.setLanguage(language); mdRecord.setCreator(creator); mdRecord.setTitle(title); mdRecord.setPublisher(publisher); mdRecord.setRights(rights); mdRecord.setDate(date); mdRecord.setSubject(subject); mdRecord.setLicense(license); mdRecord.setAccessRights(accessRights); // get echo metadata mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir } String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)"); int pageCount = Integer.valueOf(pageCountStr); mdRecord.setPageCount(pageCount); mdRecord.setSchemaName("html"); return mdRecord; } private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException { if (echoDir == null || echoDir.isEmpty()) { String docId = mdRecord.getDocId(); echoDir = getEchoDir(xQueryEvaluator, docId); if (echoDir == null) return mdRecord; } String urLTexter = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + echoDir + "/index.meta"; String echoIndexMetaStr = performGetRequest(urLTexter); String echoPageImageDir = null; String echoFiguresDir = null; String mpiwgDocId = null; if (echoIndexMetaStr != null) { if (echoIndexMetaStr.equals("XXXXTimeoutXXXX")) return null; else if (echoIndexMetaStr.equals("XXXXUrlErrorXXXX")) return mdRecord; echoPageImageDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/image"); if (echoPageImageDir != null) echoPageImageDir = echoDir + "/" + echoPageImageDir; else echoPageImageDir = echoDir + "/" + "pageimg"; // default echoFiguresDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/figures"); if (echoFiguresDir != null) echoFiguresDir = echoDir + "/" + echoFiguresDir; else echoFiguresDir = echoDir + "/" + "figures"; // default mpiwgDocId = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/dri[@type = 'mpiwg']"); } mdRecord.setEchoId(echoDir); mdRecord.setEchoPageImageDir(echoPageImageDir); mdRecord.setEchoFiguresDir(echoFiguresDir); mdRecord.setMpiwgDocId(mpiwgDocId); return mdRecord; } private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException { String echoDir = null; String urLTextUrlPath = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short"; String resultXmlStr = performGetRequest(urLTextUrlPath); if (resultXmlStr != null) { if (resultXmlStr.equals("XXXXTimeoutXXXX")) return null; else if (resultXmlStr.equals("XXXXUrlErrorXXXX")) return null; String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(resultXmlStr, "//archive-path"); if (archivePath != null) { archivePath = archivePath.replaceAll("/mpiwg/online", ""); if (archivePath.isEmpty()) echoDir = null; else echoDir = archivePath; } } return echoDir; } private String getNodeType(XdmNode node) { String nodeType = null; XdmSequenceIterator iter = node.axisIterator(Axis.CHILD); if (iter != null) { while (iter.hasNext()) { XdmNode firstChild = (XdmNode) iter.next(); if (firstChild != null) { XdmNodeKind nodeKind = firstChild.getNodeKind(); if (nodeKind.ordinal() == XdmNodeKind.ELEMENT.ordinal()) { QName nodeQName = firstChild.getNodeName(); nodeType = nodeQName.getLocalName(); } } } } return nodeType; } public String getDocFullFileName(String docId) { String docDir = getDocDir(docId); String docFileName = getDocFileName(docId); String docFullFileName = docDir + "/" + docFileName; return docFullFileName; } public String getFullFileName(String docId, String type) { String docDir = getDocDir(docId); String docFileName = getDocFileName(docId); int lastDot = docFileName.lastIndexOf("."); String docFileNameWithoutExtension = docFileName.substring(0, lastDot); String fullFileName = docDir + "/" + docFileNameWithoutExtension + ".xml"; if (type != null && ! type.equals("toc")) { fullFileName = docDir + "/" + docFileNameWithoutExtension + "." + type; } else if (type != null && type.equals("toc")) { fullFileName = docDir + "/toc.xml"; } return fullFileName; } public String getDocDir(String docId) { String documentsDirectory = Constants.getInstance().getDocumentsDir(); String subDir = docId; if (docId.contains(".")) { int index = docId.lastIndexOf("."); subDir = docId.substring(0, index); } if (! subDir.startsWith("/")) subDir = "/" + subDir; String docDir = documentsDirectory + subDir; return docDir; } public String getDocFileName(String docId) { String docFileName = docId; int index = docId.lastIndexOf("/"); if (index != -1) { docFileName = docId.substring(index + 1); } return docFileName; } private String getMainLanguage(String docId) { String mainLang = null; int to = docId.lastIndexOf("/"); if (to != -1) { String preStr = docId.substring(0, to); int from = preStr.lastIndexOf("/"); if (from != -1) mainLang = preStr.substring(from + 1, to); } return mainLang; } private String deleteSpecialChars(String inputStr) { StringBuilder buf = new StringBuilder(); for (int i = 0; i < inputStr.length(); i++) { char c = inputStr.charAt(i); String replace = new String(); switch (c) { case '@': replace = ""; break; case ' ': replace = ""; break; case ';': replace = ""; break; default: replace += c; break; } buf.append(replace); } return buf.toString(); } private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException { try { GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(milestoneElementName); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(getFragmentsContentHandler); StringReader bla = new StringReader(FileUtils.readFileToString(new File(fileName), "utf-8")); InputSource inputSource = new InputSource(bla); xmlParser.parse(inputSource); Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages(); return resultFragments; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException { StringReader strReader = new StringReader(xmlStr); XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader); xmlTokenizer.setLanguage(language); String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance) // non word breaking elements; // TODO examine bugs with emph, figure, hi : // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..." // e.g. page 30 in /echo/la/Cataneo_1600.xml String[] nwbElements = {"lb", "br", "cb"}; xmlTokenizer.setNWBElements(nwbElements); xmlTokenizer.setOutputOptions(outputOptionsWithLemmas); xmlTokenizer.tokenize(); String retStr = xmlTokenizer.getXmlResult(); return retStr; } private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { try { WordContentHandler wordContentHandler = new WordContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(wordContentHandler); StringReader strReader = new StringReader(xmlStr); InputSource inputSource = new InputSource(strReader); xmlParser.parse(inputSource); String result = wordContentHandler.getResult(); return result; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private String performGetRequest(String url) throws ApplicationException { String resultStr = null; try { boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds if (! urlIsOk) return "XXXXTimeoutXXXX"; HttpClient httpClient = new HttpClient(); GetMethod method = new GetMethod(url); httpClient.executeMethod(method); int statusCode = method.getStatusCode(); if (statusCode >= 400) return "XXXXUrlErrorXXXX"; byte[] resultBytes = method.getResponseBody(); resultStr = new String(resultBytes, "utf-8"); method.releaseConnection(); } catch (HttpException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } return resultStr; } private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException { boolean isOk = true; try { URI uri = new URI(uriStr); HttpGet httpGet = new HttpGet(uri); HttpParams httpParameters = new BasicHttpParams(); // Set the timeout in milliseconds until a connection is established. // The default value is zero, that means the timeout is not used. int timeoutConnection = 2000; HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection); // Set the default socket timeout (SO_TIMEOUT) // in milliseconds which is the timeout for waiting for data. int timeoutSocket = 2000; HttpConnectionParams.setSoTimeout(httpParameters, timeoutSocket); DefaultHttpClient httpClient = new DefaultHttpClient(httpParameters); HttpResponse response = httpClient.execute(httpGet); } catch (IOException e) { isOk = false; // if timeout exception is thrown } catch (URISyntaxException e) { throw new ApplicationException(e); } return isOk; } /** * Write string into destFile. If directory for that destFile does not exist * it creates this directory including parent directories. * @param str string to write * @param destFileName destination file name * @throws ApplicationException */ private void saveFile(String str, String destFileName) throws ApplicationException { OutputStreamWriter out = null; try { if (str == null) return; // do nothing File destFile = new File(destFileName); File destDir = new File(destFile.getParent()); if (! destDir.exists()) { destDir.mkdirs(); // create the directory including parent directories which do not exist } out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8"); out.write(str); out.flush(); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } finally { try { if (out != null) out.close(); } catch (Exception e) { // nothing: always close the stream at the end of the method } } } private void beginOperation() { beginOfOperation = new Date().getTime(); } private void endOperation() { endOfOperation = new Date().getTime(); } }