Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/MorphQueryParser.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Lucene query parser whose single-term field queries are expanded through the
 * MPDL morphology cache.
 *
 * <p>When the analyzer yields exactly one token for a query text, the token text
 * is turned into a list of lemma names, these are mapped to morphological index
 * keys via {@link MorphologyCache#getIndexKeysByLemmaNames}, and the query
 * matches any of those keys. Query texts that analyze to several tokens are
 * turned into a boolean, phrase or multi-phrase query over the raw token texts,
 * without morphological expansion.
 */
public class MorphQueryParser extends QueryParser {

  /** A single-token term starting with this marker names one lemma directly (the rest of the term). */
  private static final String LEMMA_PREFIX = "lemmalemma";

  /** Regex for the literal separator "+++" between lemma names inside a single token. */
  private static final String LEMMA_SEPARATOR_REGEX = "\\+\\+\\+";

  /** Language reported by the analyzer; passed to the morphology cache on every lookup. */
  String language;

  /**
   * @param f default field for query terms
   * @param a analyzer used to tokenize query text; also supplies the language
   */
  public MorphQueryParser(String f, MpdlMorphAnalyzer a) {
    super(f, a);
    this.language = a.getLanguage();
  }

  /**
   * Builds the query for one field/text pair from the tokens the analyzer produces.
   *
   * <ul>
   *   <li>no token: {@code null}</li>
   *   <li>one token: morphologically expanded query (see {@link #buildMorphQuery})</li>
   *   <li>several tokens, all stacked on one position: OR of the token terms</li>
   *   <li>several tokens, some stacked: {@link MultiPhraseQuery}</li>
   *   <li>several tokens, none stacked: {@link PhraseQuery}</li>
   * </ul>
   *
   * @param field     field to query
   * @param queryText raw text to analyze
   * @return the query, or {@code null} if the text yields nothing to search for
   * @throws ParseException if the morphology lookup fails
   */
  @Override
  protected Query getFieldQuery(String field, String queryText) throws ParseException {
    Analyzer analyzer = getAnalyzer();
    TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
    List<Token> tokens = new ArrayList<Token>();
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    try {
      while (true) {
        Token token;
        try {
          token = source.next();
        } catch (IOException e) {
          // Best effort: a read failure ends tokenization, keeping what was read so far.
          token = null;
        }
        if (token == null) {
          break;
        }
        tokens.add(token);
        if (token.getPositionIncrement() != 0) {
          positionCount += token.getPositionIncrement();
        } else {
          severalTokensAtSamePosition = true;
        }
      }
    } finally {
      try {
        source.close();
      } catch (IOException ignored) {
        // Nothing useful can be done if closing the in-memory token stream fails.
      }
    }

    if (tokens.isEmpty()) {
      return null;
    }
    if (tokens.size() == 1) {
      // MPDL specific extension: expand the single term via the morphology cache.
      return buildMorphQuery(field, tokens.get(0).termText());
    }
    if (!severalTokensAtSamePosition) {
      return buildPhraseQuery(field, tokens);
    }
    if (positionCount == 1) {
      return buildSamePositionQuery(field, tokens);
    }
    return buildMultiPhraseQuery(field, tokens);
  }

  /**
   * Expands a single term into a query over its morphological index keys.
   *
   * @return {@code null} for a null/blank term or when the cache returns no keys,
   *         a {@link TermQuery} for exactly one key, otherwise a coord-disabled
   *         {@link BooleanQuery} with one optional clause per key
   * @throws ParseException wrapping the {@link ApplicationException} of a failed lookup
   */
  private Query buildMorphQuery(String field, String termText) throws ParseException {
    if (termText == null || termText.trim().length() == 0) {
      return null;
    }
    ArrayList<String> lemmaNames = extractLemmaNames(termText);
    List<String> morphIndexKeys;
    try {
      MorphologyCache morphologyCache = MorphologyCache.getInstance();
      morphIndexKeys = morphologyCache.getIndexKeysByLemmaNames(language, lemmaNames);
    } catch (ApplicationException e) {
      // ParseException only offers a message constructor here; attach the cause
      // explicitly so the original stack trace is not lost.
      ParseException parseException = new ParseException(e.getMessage());
      parseException.initCause(e);
      throw parseException;
    }
    if (morphIndexKeys == null || morphIndexKeys.isEmpty()) {
      return null;
    }
    if (morphIndexKeys.size() == 1) {
      return new TermQuery(new Term(field, morphIndexKeys.get(0)));
    }
    BooleanQuery anyKeyQuery = new BooleanQuery(true);
    for (String morphIndexKey : morphIndexKeys) {
      anyKeyQuery.add(new TermQuery(new Term(field, morphIndexKey)), BooleanClause.Occur.SHOULD);
    }
    return anyKeyQuery;
  }

  /**
   * Derives the lemma names encoded in a non-blank term text: either the single
   * name following the {@code "lemmalemma"} marker, or every non-blank part of
   * the text split at {@code "+++"}.
   */
  private ArrayList<String> extractLemmaNames(String termText) {
    ArrayList<String> lemmaNames = new ArrayList<String>();
    if (termText.startsWith(LEMMA_PREFIX)) {
      // Lemma mode: the remainder of the term is the lemma name itself.
      lemmaNames.add(termText.substring(LEMMA_PREFIX.length()));
      return lemmaNames;
    }
    for (String lemmaStr : termText.split(LEMMA_SEPARATOR_REGEX)) {
      if (lemmaStr.trim().length() != 0) {
        lemmaNames.add(lemmaStr);
      }
    }
    return lemmaNames;
  }

  /** All tokens share one position, so no phrase is needed: OR of the token terms, coord disabled. */
  private Query buildSamePositionQuery(String field, List<Token> tokens) {
    BooleanQuery query = new BooleanQuery(true);
    for (Token token : tokens) {
      query.add(new TermQuery(new Term(field, token.termText())), BooleanClause.Occur.SHOULD);
    }
    return query;
  }

  /** Phrase with stacked tokens: tokens sharing a position become alternatives at that position. */
  private Query buildMultiPhraseQuery(String field, List<Token> tokens) {
    MultiPhraseQuery query = new MultiPhraseQuery();
    query.setSlop(getPhraseSlop());
    List<Term> termsAtPosition = new ArrayList<Term>();
    int position = -1;
    for (Token token : tokens) {
      // A positive increment starts a new position: flush the alternatives collected so far.
      if (token.getPositionIncrement() > 0 && !termsAtPosition.isEmpty()) {
        addAlternatives(query, termsAtPosition, position);
        termsAtPosition.clear();
      }
      position += token.getPositionIncrement();
      termsAtPosition.add(new Term(field, token.termText()));
    }
    addAlternatives(query, termsAtPosition, position);
    return query;
  }

  /** Adds one group of alternative terms, with an explicit position only if position increments are enabled. */
  private void addAlternatives(MultiPhraseQuery query, List<Term> terms, int position) {
    Term[] alternatives = terms.toArray(new Term[terms.size()]);
    if (getEnablePositionIncrements()) {
      query.add(alternatives, position);
    } else {
      query.add(alternatives);
    }
  }

  /** Plain phrase: one term per token, with explicit positions only if position increments are enabled. */
  private Query buildPhraseQuery(String field, List<Token> tokens) {
    PhraseQuery query = new PhraseQuery();
    query.setSlop(getPhraseSlop());
    int position = -1;
    for (Token token : tokens) {
      Term term = new Term(field, token.termText());
      if (getEnablePositionIncrements()) {
        position += token.getPositionIncrement();
        query.add(term, position);
      } else {
        query.add(term);
      }
    }
    return query;
  }
}