view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/MorphQueryParser.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Query parser that resolves single-term field queries through the
 * morphology cache.
 *
 * <p>When the analyzer produces exactly one token for the query text, the
 * token text is interpreted as one or more lemma names, which are passed to
 * {@link MorphologyCache#getIndexKeysByLemmaNames} together with the language
 * of the analyzer; the returned index keys become the terms of the resulting
 * query. When the analyzer produces several tokens, a boolean, phrase or
 * multi-phrase query is built directly from the token texts.
 */
public class MorphQueryParser extends QueryParser {
  /** Prefix that marks a term as "lemma mode": the rest of the term is the lemma name. */
  private static final String LEMMA_MODE_PREFIX = "lemmalemma";

  /** Regular expression matching the literal "+++" separator between lemma names in one term. */
  private static final String LEMMA_SEPARATOR_REGEX = "\\+\\+\\+";

  /** Language reported by the analyzer; passed to the morphology cache on every lookup. */
  String language;

  /**
   * Creates a parser for the given default field.
   *
   * @param f the default field for query terms
   * @param a the morphological analyzer; its language is remembered for lemma lookups
   */
  public MorphQueryParser(String f, MpdlMorphAnalyzer a) {
    super(f, a);
    this.language = a.getLanguage();
  }

  /**
   * Builds the query for one field/text pair.
   *
   * <p>The text is run through the analyzer and the outcome depends on the
   * tokens produced:
   * <ul>
   *   <li>no token: {@code null};</li>
   *   <li>one token: a morphological query (see {@link #buildMorphQuery});</li>
   *   <li>several tokens, all at one position: a boolean OR query;</li>
   *   <li>several tokens, some sharing a position: a {@link MultiPhraseQuery};</li>
   *   <li>several tokens at distinct positions: a {@link PhraseQuery}.</li>
   * </ul>
   *
   * @param field the field to search in
   * @param queryText the raw text of the query term
   * @return the resulting query, or {@code null} if nothing can be searched for
   * @exception ParseException if the morphology lookup fails; the original
   *            exception is attached as the cause
   */
  protected Query getFieldQuery(String field, String queryText)  throws ParseException {
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(queryText));
    List<org.apache.lucene.analysis.Token> tokens = new ArrayList<org.apache.lucene.analysis.Token>();
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    try {
      while (true) {
        org.apache.lucene.analysis.Token token;
        try {
          token = source.next();
        } catch (IOException e) {
          // best effort: a read failure ends tokenization, the tokens
          // collected so far are still used
          token = null;
        }
        if (token == null) {
          break;
        }
        tokens.add(token);
        int increment = token.getPositionIncrement();
        if (increment != 0) {
          positionCount += increment;
        } else {
          severalTokensAtSamePosition = true;
        }
      }
    } finally {
      // close the stream even if tokenization fails with an unchecked exception
      try {
        source.close();
      } catch (IOException ignored) {
        // nothing useful can be done if closing fails
      }
    }

    if (tokens.isEmpty()) {
      return null;
    }
    if (tokens.size() == 1) {
      // MPDL specific extension: single terms are resolved morphologically
      return buildMorphQuery(field, tokens.get(0).termText());
    }
    if (!severalTokensAtSamePosition) {
      return buildPhraseQuery(field, tokens);
    }
    if (positionCount == 1) {
      // all tokens share one position, so this is not a phrase
      return buildSamePositionQuery(field, tokens);
    }
    return buildMultiPhraseQuery(field, tokens);
  }

  /**
   * Builds the query for a single term by looking up the index keys of its lemmas.
   *
   * @param field the field to search in
   * @param termText the text of the single token
   * @return {@code null} if the term is null/blank or the cache returns
   *         {@code null} or no keys; a {@link TermQuery} for exactly one key;
   *         otherwise a {@link BooleanQuery} OR-ing one term query per key
   * @throws ParseException if the morphology cache lookup fails
   */
  private Query buildMorphQuery(String field, String termText) throws ParseException {
    if (termText == null || termText.trim().equals("")) {
      return null;
    }
    ArrayList<String> lemmaNames = extractLemmaNames(termText);

    List<String> morphIndexKeys;
    try {
      MorphologyCache morphologyCache = MorphologyCache.getInstance();
      morphIndexKeys = morphologyCache.getIndexKeysByLemmaNames(language, lemmaNames);
    } catch (ApplicationException e) {
      // keep the original exception as cause so the stack trace is not lost
      ParseException parseException = new ParseException(e.getMessage());
      parseException.initCause(e);
      throw parseException;
    }

    if (morphIndexKeys == null || morphIndexKeys.isEmpty()) {
      return null;
    }
    if (morphIndexKeys.size() == 1) {
      return new TermQuery(new Term(field, morphIndexKeys.get(0)));
    }
    BooleanQuery anyKeyQuery = new BooleanQuery(true);
    for (String morphIndexKey : morphIndexKeys) {
      anyKeyQuery.add(new TermQuery(new Term(field, morphIndexKey)), BooleanClause.Occur.SHOULD);
    }
    return anyKeyQuery;
  }

  /**
   * Extracts the lemma names encoded in a term.
   *
   * <p>In lemma mode (term starts with {@code "lemmalemma"}) the remainder of
   * the term is the only lemma name. Otherwise the term is split at
   * {@code "+++"} and every non-blank part is a lemma name.
   *
   * @param termText non-null, non-blank term text
   * @return the lemma names, in the order they appear in the term
   */
  private ArrayList<String> extractLemmaNames(String termText) {
    ArrayList<String> lemmaNames = new ArrayList<String>();
    if (termText.startsWith(LEMMA_MODE_PREFIX)) {
      lemmaNames.add(termText.substring(LEMMA_MODE_PREFIX.length()));
      return lemmaNames;
    }
    for (String lemmaStr : termText.split(LEMMA_SEPARATOR_REGEX)) {
      if (!lemmaStr.trim().equals("")) {
        lemmaNames.add(lemmaStr);
      }
    }
    return lemmaNames;
  }

  /** Builds a boolean query that OR-s one term query per token (all tokens at one position). */
  private Query buildSamePositionQuery(String field, List<org.apache.lucene.analysis.Token> tokens) {
    BooleanQuery q = new BooleanQuery(true);
    for (org.apache.lucene.analysis.Token token : tokens) {
      q.add(new TermQuery(new Term(field, token.termText())), BooleanClause.Occur.SHOULD);
    }
    return q;
  }

  /** Builds a multi-phrase query, grouping tokens that share a position into one term array. */
  private Query buildMultiPhraseQuery(String field, List<org.apache.lucene.analysis.Token> tokens) {
    MultiPhraseQuery mpq = new MultiPhraseQuery();
    mpq.setSlop(getPhraseSlop());
    List<Term> multiTerms = new ArrayList<Term>();
    int position = -1;
    for (org.apache.lucene.analysis.Token token : tokens) {
      // a positive increment starts a new position: flush the terms gathered
      // for the previous one before the position is advanced
      if (token.getPositionIncrement() > 0 && multiTerms.size() > 0) {
        addTermsAtPosition(mpq, multiTerms, position);
        multiTerms.clear();
      }
      position += token.getPositionIncrement();
      multiTerms.add(new Term(field, token.termText()));
    }
    // flush the terms of the last position
    addTermsAtPosition(mpq, multiTerms, position);
    return mpq;
  }

  /** Adds the terms to the multi-phrase query, with the explicit position only if position increments are enabled. */
  private void addTermsAtPosition(MultiPhraseQuery mpq, List<Term> terms, int position) {
    Term[] termArray = terms.toArray(new Term[0]);
    if (getEnablePositionIncrements()) {
      mpq.add(termArray, position);
    } else {
      mpq.add(termArray);
    }
  }

  /** Builds a plain phrase query with one term per token. */
  private Query buildPhraseQuery(String field, List<org.apache.lucene.analysis.Token> tokens) {
    PhraseQuery pq = new PhraseQuery();
    pq.setSlop(getPhraseSlop());
    int position = -1;
    for (org.apache.lucene.analysis.Token token : tokens) {
      Term term = new Term(field, token.termText());
      if (getEnablePositionIncrements()) {
        position += token.getPositionIncrement();
        pq.add(term, position);
      } else {
        pq.add(term);
      }
    }
    return pq;
  }

}