Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.Hashtable; import javax.xml.namespace.NamespaceContext; import org.w3c.dom.Node; import org.xml.sax.InputSource; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil; import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; import de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler; public class RegularizationManager { private static RegularizationManager instance; private static String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR; private static String MPDL_EXIST_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; private static String ECHO_DOC_DIR = MPDL_DOC_DIR + "/documents/echo"; private static String REGULARIZATION_DATA_DIR = MPDL_EXIST_DATA_DIR + "/dataFiles/regularization"; private static String REGULARIZATION_DB_DIR = MPDL_EXIST_DATA_DIR + "/dataBerkeleyDB/regularization"; private static String[] LANGUAGES = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"}; private DBRegularizationHandler dbRegHandler; private Hashtable<String, ArrayList<Regularization>> regsOrig; private Hashtable<String, ArrayList<Regularization>> regsNorm; private Date beginOfOperation; private Date endOfOperation; public static RegularizationManager getInstance() throws ApplicationException { if (instance == null) { instance = new RegularizationManager(); instance.init(); } return instance; } public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation(); System.out.print("Start ..."); // instance.writeAllRegs(); ArrayList<Regularization> regs = instance.findRegsByNorm("la", "Illiusque"); ArrayList<Regularization> regs2 = instance.findRegsByNorm("la", "Itaque"); Regularization bla = regs.get(0); Regularization bla2 = regs2.get(0); instance.end(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); System.out.println("End."); System.out.println("Needed time: " + elapsedTime + " seconds"); } private void init() throws ApplicationException { regsOrig = new Hashtable<String, ArrayList<Regularization>>(); regsNorm = new Hashtable<String, ArrayList<Regularization>>(); dbRegHandler = new DBRegularizationHandler(REGULARIZATION_DB_DIR); dbRegHandler.start(); dbRegHandler.openDatabases(); } public ArrayList<Regularization> findRegsByOrig(String language, String orig) throws ApplicationException { orig = orig.toLowerCase(); String hashKey = language + "###" + orig; ArrayList<Regularization> regs = regsOrig.get(hashKey); if (regs == null) { regs = dbRegHandler.readRegsByOrig(language, orig); if (regs == null || regs.isEmpty()) regsOrig.put(hashKey, new ArrayList<Regularization>()); else regsOrig.put(hashKey, regs); } return regs; } public ArrayList<Regularization> findRegsByNorm(String language, String norm) throws ApplicationException { norm = norm.toLowerCase(); String hashKey = language + "###" + norm; ArrayList<Regularization> regs = regsNorm.get(hashKey); if (regs == null || regs.isEmpty()) { regs = dbRegHandler.readRegsByNorm(language, norm); if (regs == null) regsNorm.put(hashKey, new ArrayList<Regularization>()); else regsNorm.put(hashKey, regs); } return regs; } public ArrayList<String> getRegOrigsByNormLuceneQueryString(String language, String luceneQueryString) throws ApplicationException { ArrayList<String> regForms = new ArrayList<String>(); LuceneUtil luceneUtil = LuceneUtil.getInstance(); ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryString); if (variants != null) { for (int i=0; i<variants.size(); i++) { String variant = variants.get(i); ArrayList<Regularization> regs = findRegsByNorm(language, variant); if (regs != null) { for (int j=0; j<regs.size(); j++) { Regularization reg = regs.get(j); String orig = reg.getOrig(); regForms.add(orig); } } } } return regForms; } public void saveRegularizations(String language, String docFileName) throws ApplicationException { Hashtable<String, Regularization> hashTableRegOrig = new Hashtable<String, Regularization>(); Hashtable<String, Regularization> hashTableRegNorm = new Hashtable<String, Regularization>(); File docFile = new File(docFileName); // hack: in the two hashTables all regs are hold getRegs(language, docFile, hashTableRegOrig, hashTableRegNorm); // write all regs to DB writeRegsOrigToDb(hashTableRegOrig); writeRegsNormToDb(hashTableRegNorm); } private void getRegs(String language, File docFile, Hashtable<String, Regularization> hashTableRegOrig, Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException { XmlUtil xmlUtil = XmlUtil.getInstance(); InputSource docFileInputSource = new InputSource(docFile.toURI().getPath()); SchemaHandler echoSchemaHandler = new SchemaHandler(); NamespaceContext echoNS = echoSchemaHandler.getEchoNsContext(); ArrayList<Node> regsArray = xmlUtil.evaluateToNodeArray(docFileInputSource, "//echo:reg", echoNS); if (regsArray != null) { String docFileName = docFile.getName(); for (int i=0; i<regsArray.size(); i++) { Node regNode = regsArray.get(i); String regOrigStr = xmlUtil.getNodeValue(regNode); String regNormStr = xmlUtil.getNodeAttributeValue(regNode, "norm"); if (regOrigStr != null && (! regOrigStr.equals("")) && regNormStr != null && (! regNormStr.equals(""))) { regOrigStr = regOrigStr.toLowerCase(); regNormStr = regNormStr.toLowerCase(); Regularization reg = new Regularization(language, regOrigStr, regNormStr, docFileName); reg.setSourcePosition(i); Regularization regByOrigStr = hashTableRegOrig.get(regOrigStr); if (regByOrigStr == null) hashTableRegOrig.put(regOrigStr, reg); Regularization regByNormStr = hashTableRegNorm.get(regNormStr); if (regByNormStr == null) hashTableRegNorm.put(regNormStr, reg); } } } } private void writeRegsOrigToDb(Hashtable<String, Regularization> hashTableRegOrig) throws ApplicationException { Enumeration<Regularization> regElements = hashTableRegOrig.elements(); while (regElements.hasMoreElements()) { Regularization reg = regElements.nextElement(); boolean regAlreadyExists = false; String language = reg.getLanguage(); String orig = reg.getOrig(); String norm = reg.getNorm(); ArrayList<Regularization> existingRegs = dbRegHandler.readRegsByOrig(language, orig); if (existingRegs != null && existingRegs.size() > 0) { for (int i=0; i<existingRegs.size(); i++) { Regularization r = existingRegs.get(i); String rNorm = r.getNorm(); if (rNorm.equals(norm)) regAlreadyExists = true; } } if (! regAlreadyExists) dbRegHandler.writeOrigReg(reg); } } private void writeRegsNormToDb(Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException { Enumeration<Regularization> regElements = hashTableRegNorm.elements(); while (regElements.hasMoreElements()) { Regularization reg = regElements.nextElement(); boolean regAlreadyExists = false; String language = reg.getLanguage(); String orig = reg.getOrig(); String norm = reg.getNorm(); ArrayList<Regularization> existingRegs = dbRegHandler.readRegsByNorm(language, norm); if (existingRegs != null && existingRegs.size() > 0) { for (int i=0; i<existingRegs.size(); i++) { Regularization r = existingRegs.get(i); String rOrig = r.getOrig(); if (rOrig.equals(orig)) regAlreadyExists = true; } } if (! regAlreadyExists) dbRegHandler.writeNormReg(reg); } } private void writeAllRegs() throws ApplicationException { BufferedOutputStream regOut = null; try { for (int i=0; i<LANGUAGES.length; i++) { String language = LANGUAGES[i]; String docDir = ECHO_DOC_DIR + "/" + language; FileUtil fileUtil = FileUtil.getInstance(); FilenameFilter filter = new FilenameFilterExtension("xml"); File[] docFiles = fileUtil.getFiles(docDir, filter); if (docFiles != null && docFiles.length > 0) { Hashtable<String, Regularization> hashTableRegOrig = new Hashtable<String, Regularization>(); Hashtable<String, Regularization> hashTableRegNorm = new Hashtable<String, Regularization>(); for (int j=0; j<docFiles.length; j++) { File docFile = docFiles[j]; getRegs(language, docFile, hashTableRegOrig, hashTableRegNorm); } String regOutFileName = REGULARIZATION_DATA_DIR + "/" + "regularization-" + language + ".xml"; File regOutFile = new File(regOutFileName); regOut = new BufferedOutputStream(new FileOutputStream(regOutFile)); write("<regularizations>\n", regOut); writeRegs(hashTableRegOrig, regOut); writeRegsToDb(hashTableRegOrig, hashTableRegNorm); write("</regularizations>", regOut); } } } catch (FileNotFoundException e) { throw new ApplicationException(e); } finally { // always close the stream if (regOut != null) try { regOut.close(); } catch (Exception e) { } } } private void writeRegs(Hashtable<String, Regularization> hashTableReg, BufferedOutputStream regOut) throws ApplicationException { Enumeration<Regularization> regElements = hashTableReg.elements(); while (regElements.hasMoreElements()) { Regularization reg = regElements.nextElement(); String regXmlStr = reg.getXmlString(); write(regXmlStr, regOut); } } private void writeRegsToDb(Hashtable<String, Regularization> hashTableRegOrig, Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException { Enumeration<Regularization> regElements = hashTableRegOrig.elements(); while (regElements.hasMoreElements()) { Regularization reg = regElements.nextElement(); dbRegHandler.writeOrigReg(reg); } regElements = hashTableRegNorm.elements(); while (regElements.hasMoreElements()) { Regularization reg = regElements.nextElement(); dbRegHandler.writeNormReg(reg); } } private void deleteDbData() throws ApplicationException { dbRegHandler.deleteData(); } private void write(String inputString, BufferedOutputStream out) throws ApplicationException { try { byte[] bytes = inputString.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } public void end() throws ApplicationException { dbRegHandler.closeDatabases(); } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }