Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,118 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import org.exist.dom.QName; +import org.exist.memtree.DocumentImpl; +import org.exist.memtree.MemTreeBuilder; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.Type; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetBig5EncodedTerms extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("get-big5-encoded-terms", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "bla bla", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) }, + new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE)); + + public GetBig5EncodedTerms(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence luceneQueryStringSeq = args[0]; + String luceneQueryString = ""; + if (luceneQueryStringSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + luceneQueryString = luceneQueryStringSeq.getStringValue(); + ArrayList<String> queryTerms = getTermsFromLuceneQuery(luceneQueryString); + int size = queryTerms.size(); + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "big5-mappings", "big5-mappings", null); + for (int i=0; i<size; i++) { + String queryTerm = queryTerms.get(i); + String big5EncodedQueryTerm = encodeBig5(queryTerm); + builder.startElement("", "big5-mapping", "big5-mapping", null); + builder.startElement("", "term", "term", null); + builder.characters(queryTerm); + builder.endElement(); + builder.startElement("", "mapping", "mapping", null); + builder.characters(big5EncodedQueryTerm); + builder.endElement(); + builder.endElement(); + } + builder.endElement(); + DocumentImpl doc = ((DocumentImpl)builder.getDocument()); + + return doc; + } + + // TODO method is only simple: proof all Lucene cases + private ArrayList<String> getTermsFromLuceneQuery(String queryString) { + ArrayList<String> terms = new ArrayList<String>(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + terms.add(token); + } + } + return terms; + } + + private String encodeBig5(String inputStr) { + String resultStr = ""; + String charset = "big5"; + try { + byte[] resultBytes = inputStr.getBytes(charset); + for (int i=0; i < resultBytes.length; i++) { + byte b = resultBytes[i]; + int unsigned = unsignedByteToInt(b); + String hexStr = Integer.toHexString(unsigned); + resultStr = resultStr + "%" + hexStr; + } + } catch (UnsupportedEncodingException e) { + + } + return resultStr; + } + + private int unsignedByteToInt(byte b) { + return (int) b & 0xFF; + } + +}