view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;

import javax.xml.namespace.NamespaceContext;

import org.w3c.dom.Node;
import org.xml.sax.InputSource;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil;
import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
import de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler;

/**
 * Singleton that extracts regularizations (<code>echo:reg</code> elements:
 * element text = original form, <code>norm</code> attribute = normalized
 * form) from document files, stores them through a
 * {@link DBRegularizationHandler} and answers lookups by original or by
 * normalized form.
 *
 * <p>Lookup results are cached in memory per language and lowercased term.
 * The caches are cleared whenever this manager writes regularizations to
 * the database, so lookups do not serve results that predate a write.</p>
 *
 * <p>{@link #getInstance()} is not synchronized; the lazy initialization is
 * therefore not guarded against concurrent first calls.</p>
 */
public class RegularizationManager {
  private static RegularizationManager instance;
  private static final String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR;
  private static final String MPDL_EXIST_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
  private static final String ECHO_DOC_DIR = MPDL_DOC_DIR + "/documents/echo";
  private static final String REGULARIZATION_DATA_DIR = MPDL_EXIST_DATA_DIR + "/dataFiles/regularization";
  private static final String REGULARIZATION_DB_DIR = MPDL_EXIST_DATA_DIR + "/dataBerkeleyDB/regularization";
  private static final String[] LANGUAGES = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"};
  // separator between language and term in the cache keys
  private static final String HASH_KEY_SEPARATOR = "###";
  private DBRegularizationHandler dbRegHandler;
  // lookup caches, keyed by language + "###" + lowercased term
  private Hashtable<String, ArrayList<Regularization>> regsOrig;
  private Hashtable<String, ArrayList<Regularization>> regsNorm;
  private Date beginOfOperation;
  private Date endOfOperation;

  /**
   * Returns the shared manager, creating and initializing it on first use.
   *
   * <p>The instance is published only after {@link #init()} has completed,
   * so a failed initialization does not leave a half-initialized singleton
   * behind; the next call simply tries again.</p>
   *
   * @return the shared, initialized manager
   * @throws ApplicationException if the initialization fails
   */
  public static RegularizationManager getInstance() throws ApplicationException {
    if (instance == null) {
      RegularizationManager manager = new RegularizationManager();
      manager.init();
      instance = manager;
    }
    return instance;
  }

  /**
   * Small command line driver: performs two sample lookups and prints the
   * elapsed time. The databases are closed even if a lookup fails.
   *
   * @param args ignored
   * @throws ApplicationException if initialization, lookup or closing fails
   */
  public static void main(String[] args) throws ApplicationException {
    RegularizationManager manager = getInstance();
    manager.beginOperation();
    System.out.print("Start ...");
    try {
      // to rebuild the XML data files and the database from all documents,
      // call manager.writeAllRegs() here
      manager.findRegsByNorm("la", "Illiusque");
      manager.findRegsByNorm("la", "Itaque");
    } finally {
      manager.end();
    }
    manager.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(manager.beginOfOperation, manager.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Creates the empty lookup caches and starts the database handler.
   *
   * @throws ApplicationException if the database handler cannot be started
   *         or its databases cannot be opened
   */
  private void init() throws ApplicationException {
    regsOrig = new Hashtable<String, ArrayList<Regularization>>();
    regsNorm = new Hashtable<String, ArrayList<Regularization>>();
    dbRegHandler = new DBRegularizationHandler(REGULARIZATION_DB_DIR);
    dbRegHandler.start();
    dbRegHandler.openDatabases();
  }

  /**
   * Finds the regularizations whose original form equals the given term.
   * The term is lowercased before the lookup.
   *
   * <p>The result is cached, including the "nothing found" case. The
   * returned list is the cached instance and should not be modified by the
   * caller.</p>
   *
   * @param language language of the term
   * @param orig original form to look up; must not be null
   * @return the matching regularizations; empty if there are none, never null
   * @throws ApplicationException if the database read fails
   */
  public ArrayList<Regularization> findRegsByOrig(String language, String orig) throws ApplicationException {
    String origLower = orig.toLowerCase();
    String hashKey = language + HASH_KEY_SEPARATOR + origLower;
    ArrayList<Regularization> regs = regsOrig.get(hashKey);
    if (regs == null) {
      regs = dbRegHandler.readRegsByOrig(language, origLower);
      if (regs == null) {
        // normalize "nothing found" so that first and repeated calls agree
        regs = new ArrayList<Regularization>();
      }
      regsOrig.put(hashKey, regs);
    }
    return regs;
  }

  /**
   * Finds the regularizations whose normalized form equals the given term.
   * The term is lowercased before the lookup.
   *
   * <p>The result is cached, including the "nothing found" case. The
   * returned list is the cached instance and should not be modified by the
   * caller.</p>
   *
   * @param language language of the term
   * @param norm normalized form to look up; must not be null
   * @return the matching regularizations; empty if there are none, never null
   * @throws ApplicationException if the database read fails
   */
  public ArrayList<Regularization> findRegsByNorm(String language, String norm) throws ApplicationException {
    String normLower = norm.toLowerCase();
    String hashKey = language + HASH_KEY_SEPARATOR + normLower;
    ArrayList<Regularization> regs = regsNorm.get(hashKey);
    if (regs == null) {
      regs = dbRegHandler.readRegsByNorm(language, normLower);
      if (regs == null) {
        // normalize "nothing found" so that first and repeated calls agree
        regs = new ArrayList<Regularization>();
      }
      regsNorm.put(hashKey, regs);
    }
    return regs;
  }

  /**
   * Collects the original forms of all regularizations whose normalized
   * form is one of the variants that {@link LuceneUtil} extracts from the
   * given Lucene query string.
   *
   * @param language language of the query terms
   * @param luceneQueryString Lucene query to take the variants from
   * @return the original forms in lookup order (may contain duplicates);
   *         empty if nothing matches, never null
   * @throws ApplicationException if a lookup fails
   */
  public ArrayList<String> getRegOrigsByNormLuceneQueryString(String language, String luceneQueryString) throws ApplicationException {
    ArrayList<String> regForms = new ArrayList<String>();
    LuceneUtil luceneUtil = LuceneUtil.getInstance();
    ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryString);
    if (variants == null) {
      return regForms;
    }
    for (String variant : variants) {
      ArrayList<Regularization> regs = findRegsByNorm(language, variant);
      if (regs != null) {
        for (Regularization reg : regs) {
          regForms.add(reg.getOrig());
        }
      }
    }
    return regForms;
  }

  /**
   * Extracts the regularizations of one document file and writes those
   * that are not yet stored to the database. Afterwards the lookup caches
   * are cleared so that the new entries become visible to lookups.
   *
   * @param language language of the document
   * @param docFileName path of the document file
   * @throws ApplicationException if reading the document or writing to the
   *         database fails
   */
  public void saveRegularizations(String language, String docFileName) throws ApplicationException {
    Hashtable<String, Regularization> hashTableRegOrig = new Hashtable<String, Regularization>();
    Hashtable<String, Regularization> hashTableRegNorm = new Hashtable<String, Regularization>();
    File docFile = new File(docFileName);
    // hack: the two hash tables are filled by getRegs and hold all regs afterwards
    getRegs(language, docFile, hashTableRegOrig, hashTableRegNorm);
    try {
      // write all regs to DB
      writeRegsOrigToDb(hashTableRegOrig);
      writeRegsNormToDb(hashTableRegNorm);
    } finally {
      // also after a partial write the cached results may be outdated
      invalidateCaches();
    }
  }

  /**
   * Reads all <code>echo:reg</code> elements of a document and adds them to
   * the two given tables. Original and normalized form are lowercased;
   * elements with an empty text or an empty/missing <code>norm</code>
   * attribute are skipped. Each table keeps only the first regularization
   * seen for a key.
   *
   * @param language language to store in the created regularizations
   * @param docFile document to read
   * @param hashTableRegOrig out parameter: regularizations by original form
   * @param hashTableRegNorm out parameter: regularizations by normalized form
   * @throws ApplicationException if the document cannot be evaluated
   */
  private void getRegs(String language, File docFile, Hashtable<String, Regularization> hashTableRegOrig, Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException {
    XmlUtil xmlUtil = XmlUtil.getInstance();
    InputSource docFileInputSource = new InputSource(docFile.toURI().getPath());
    SchemaHandler echoSchemaHandler = new SchemaHandler();
    NamespaceContext echoNS = echoSchemaHandler.getEchoNsContext();
    ArrayList<Node> regsArray = xmlUtil.evaluateToNodeArray(docFileInputSource, "//echo:reg", echoNS);
    if (regsArray == null) {
      return;
    }
    String docFileName = docFile.getName();
    for (int i=0; i<regsArray.size(); i++) {
      Node regNode = regsArray.get(i);
      String regOrigStr = xmlUtil.getNodeValue(regNode);
      String regNormStr = xmlUtil.getNodeAttributeValue(regNode, "norm");
      boolean hasOrig = regOrigStr != null && ! regOrigStr.equals("");
      boolean hasNorm = regNormStr != null && ! regNormStr.equals("");
      if (! hasOrig || ! hasNorm) {
        continue;
      }
      regOrigStr = regOrigStr.toLowerCase();
      regNormStr = regNormStr.toLowerCase();
      Regularization reg = new Regularization(language, regOrigStr, regNormStr, docFileName);
      // position = index of the element within the XPath result of this document
      reg.setSourcePosition(i);
      if (hashTableRegOrig.get(regOrigStr) == null) {
        hashTableRegOrig.put(regOrigStr, reg);
      }
      if (hashTableRegNorm.get(regNormStr) == null) {
        hashTableRegNorm.put(regNormStr, reg);
      }
    }
  }

  /**
   * Writes each given regularization to the "orig" database unless the
   * database already holds an entry with the same language, original form
   * and normalized form.
   *
   * @param hashTableRegOrig regularizations to write
   * @throws ApplicationException if a database access fails
   */
  private void writeRegsOrigToDb(Hashtable<String, Regularization> hashTableRegOrig) throws ApplicationException {
    Enumeration<Regularization> regElements = hashTableRegOrig.elements();
    while (regElements.hasMoreElements()) {
      Regularization reg = regElements.nextElement();
      String norm = reg.getNorm();
      ArrayList<Regularization> existingRegs = dbRegHandler.readRegsByOrig(reg.getLanguage(), reg.getOrig());
      boolean regAlreadyExists = false;
      if (existingRegs != null) {
        for (Regularization existingReg : existingRegs) {
          if (existingReg.getNorm().equals(norm)) {
            regAlreadyExists = true;
            break;
          }
        }
      }
      if (! regAlreadyExists) {
        dbRegHandler.writeOrigReg(reg);
      }
    }
  }

  /**
   * Writes each given regularization to the "norm" database unless the
   * database already holds an entry with the same language, normalized form
   * and original form.
   *
   * @param hashTableRegNorm regularizations to write
   * @throws ApplicationException if a database access fails
   */
  private void writeRegsNormToDb(Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException {
    Enumeration<Regularization> regElements = hashTableRegNorm.elements();
    while (regElements.hasMoreElements()) {
      Regularization reg = regElements.nextElement();
      String orig = reg.getOrig();
      ArrayList<Regularization> existingRegs = dbRegHandler.readRegsByNorm(reg.getLanguage(), reg.getNorm());
      boolean regAlreadyExists = false;
      if (existingRegs != null) {
        for (Regularization existingReg : existingRegs) {
          if (existingReg.getOrig().equals(orig)) {
            regAlreadyExists = true;
            break;
          }
        }
      }
      if (! regAlreadyExists) {
        dbRegHandler.writeNormReg(reg);
      }
    }
  }

  /**
   * For every configured language: reads the regularizations of all XML
   * documents below the echo document directory, writes them to the file
   * <code>regularization-&lt;language&gt;.xml</code> in the regularization
   * data directory and stores them in the database. Languages without
   * documents are skipped. The lookup caches are cleared afterwards.
   *
   * @throws ApplicationException if reading, file output or a database
   *         write fails
   */
  private void writeAllRegs() throws ApplicationException {
    try {
      for (int i=0; i<LANGUAGES.length; i++) {
        writeRegsOfLanguage(LANGUAGES[i]);
      }
    } finally {
      // the database content changed; cached lookup results may be outdated
      invalidateCaches();
    }
  }

  /**
   * Writes the regularizations of all documents of one language to its XML
   * data file and to the database. The output stream is opened and closed
   * per language, so no stream is left open when the next language starts.
   *
   * @param language language whose documents are processed
   * @throws ApplicationException if reading, file output or a database
   *         write fails
   */
  private void writeRegsOfLanguage(String language) throws ApplicationException {
    BufferedOutputStream regOut = null;
    try {
      String docDir = ECHO_DOC_DIR + "/" + language;
      FileUtil fileUtil = FileUtil.getInstance();
      FilenameFilter filter = new FilenameFilterExtension("xml");
      File[] docFiles = fileUtil.getFiles(docDir, filter);
      if (docFiles == null || docFiles.length == 0) {
        return;
      }
      Hashtable<String, Regularization> hashTableRegOrig = new Hashtable<String, Regularization>();
      Hashtable<String, Regularization> hashTableRegNorm = new Hashtable<String, Regularization>();
      for (int j=0; j<docFiles.length; j++) {
        getRegs(language, docFiles[j], hashTableRegOrig, hashTableRegNorm);
      }
      String regOutFileName = REGULARIZATION_DATA_DIR + "/" + "regularization-" + language + ".xml";
      File regOutFile = new File(regOutFileName);
      regOut = new BufferedOutputStream(new FileOutputStream(regOutFile));
      write("<regularizations>\n", regOut);
      // only the table keyed by original form is written to the XML file
      writeRegs(hashTableRegOrig, regOut);
      writeRegsToDb(hashTableRegOrig, hashTableRegNorm);
      write("</regularizations>", regOut);
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream of this language
      if (regOut != null) {
        try {
          regOut.close();
        } catch (IOException ignored) {
          // best effort: write() flushes after every call, nothing is pending here
        }
      }
    }
  }

  /**
   * Writes the XML string of every regularization in the table to the
   * given stream.
   *
   * @param hashTableReg regularizations to write
   * @param regOut stream to write to
   * @throws ApplicationException if writing fails
   */
  private void writeRegs(Hashtable<String, Regularization> hashTableReg, BufferedOutputStream regOut) throws ApplicationException {
    Enumeration<Regularization> regElements = hashTableReg.elements();
    while (regElements.hasMoreElements()) {
      Regularization reg = regElements.nextElement();
      write(reg.getXmlString(), regOut);
    }
  }

  /**
   * Writes all regularizations of both tables to the database without
   * checking for already existing entries.
   *
   * @param hashTableRegOrig regularizations for the "orig" database
   * @param hashTableRegNorm regularizations for the "norm" database
   * @throws ApplicationException if a database write fails
   */
  private void writeRegsToDb(Hashtable<String, Regularization> hashTableRegOrig, Hashtable<String, Regularization> hashTableRegNorm) throws ApplicationException {
    Enumeration<Regularization> regElements = hashTableRegOrig.elements();
    while (regElements.hasMoreElements()) {
      dbRegHandler.writeOrigReg(regElements.nextElement());
    }
    regElements = hashTableRegNorm.elements();
    while (regElements.hasMoreElements()) {
      dbRegHandler.writeNormReg(regElements.nextElement());
    }
  }

  /**
   * Deletes the data held by the database handler and clears the lookup
   * caches, which would otherwise still serve the deleted entries.
   *
   * @throws ApplicationException if the deletion fails
   */
  private void deleteDbData() throws ApplicationException {
    try {
      dbRegHandler.deleteData();
    } finally {
      invalidateCaches();
    }
  }

  /**
   * Drops all cached lookup results so that the next lookup reads the
   * database again.
   */
  private void invalidateCaches() {
    regsOrig.clear();
    regsNorm.clear();
  }

  /**
   * Writes the string UTF-8 encoded to the stream and flushes it.
   *
   * @param inputString text to write
   * @param out stream to write to
   * @throws ApplicationException if encoding or writing fails
   */
  private void write(String inputString, BufferedOutputStream out) throws ApplicationException {
    try {
      byte[] bytes = inputString.getBytes("utf-8");
      out.write(bytes, 0, bytes.length);
      out.flush();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Closes the databases of the database handler.
   *
   * @throws ApplicationException if closing fails
   */
  public void end() throws ApplicationException {
    dbRegHandler.closeDatabases();
  }

  /** Records the current time as start of an operation. */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /** Records the current time as end of an operation. */
  private void endOperation() {
    endOfOperation = new Date();
  }

}