Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
/* * eXist Open Source Native XML Database: Extension module * Copyright (C) 2008 Josef Willenborg * jwillenborg@mpiwg-berlin.mpg.de * http://www.mpiwg-berlin.mpg.de * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: TextModule.java $ */ package org.exist.xquery.modules.mpdltext; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import org.exist.dom.QName; import org.exist.memtree.DocumentImpl; import org.exist.memtree.MemTreeBuilder; import org.exist.xquery.BasicFunction; import org.exist.xquery.Cardinality; import org.exist.xquery.FunctionSignature; import org.exist.xquery.XPathException; import org.exist.xquery.XQueryContext; import org.exist.xquery.value.Sequence; import org.exist.xquery.value.SequenceType; import org.exist.xquery.value.Type; /** * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) */ public class GetBig5EncodedTerms extends BasicFunction { public final static FunctionSignature signature = new FunctionSignature( new QName("get-big5-encoded-terms", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), "bla bla", new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) }, new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE)); public GetBig5EncodedTerms(XQueryContext context) { super(context, signature); } public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { Sequence luceneQueryStringSeq = args[0]; String luceneQueryString = ""; if (luceneQueryStringSeq.isEmpty()) return Sequence.EMPTY_SEQUENCE; luceneQueryString = luceneQueryStringSeq.getStringValue(); ArrayList<String> queryTerms = getTermsFromLuceneQuery(luceneQueryString); int size = queryTerms.size(); MemTreeBuilder builder = context.getDocumentBuilder(); builder.startElement("", "big5-mappings", "big5-mappings", null); for (int i=0; i<size; i++) { String queryTerm = queryTerms.get(i); String big5EncodedQueryTerm = encodeBig5(queryTerm); builder.startElement("", "big5-mapping", "big5-mapping", null); builder.startElement("", "term", "term", null); builder.characters(queryTerm); builder.endElement(); builder.startElement("", "mapping", "mapping", null); builder.characters(big5EncodedQueryTerm); builder.endElement(); builder.endElement(); } builder.endElement(); DocumentImpl doc = ((DocumentImpl)builder.getDocument()); return doc; } // TODO method is only simple: proof all Lucene cases private ArrayList<String> getTermsFromLuceneQuery(String queryString) { ArrayList<String> terms = new ArrayList<String>(); String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") for (int i = 0; i < variantTokens.length; i++) { String token = variantTokens[i]; if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { terms.add(token); } } return terms; } private String encodeBig5(String inputStr) { String resultStr = ""; String charset = "big5"; try { byte[] resultBytes = inputStr.getBytes(charset); for (int i=0; i < resultBytes.length; i++) { byte b = resultBytes[i]; int unsigned = unsignedByteToInt(b); String hexStr = Integer.toHexString(unsigned); resultStr = resultStr + "%" + hexStr; } } catch (UnsupportedEncodingException e) { } return resultStr; } private int unsignedByteToInt(byte b) { return (int) b & 0xFF; } }