diff software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,118 @@
+/*
+ *  eXist Open Source Native XML Database: Extension module
+ *  Copyright (C) 2008 Josef Willenborg
+ *  jwillenborg@mpiwg-berlin.mpg.de
+ *  http://www.mpiwg-berlin.mpg.de
+ *  
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *  
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *  
+ *  $Id: TextModule.java $
+ */
+package org.exist.xquery.modules.mpdltext;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+
+import org.exist.dom.QName;
+import org.exist.memtree.DocumentImpl;
+import org.exist.memtree.MemTreeBuilder;
+import org.exist.xquery.BasicFunction;
+import org.exist.xquery.Cardinality;
+import org.exist.xquery.FunctionSignature;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.XQueryContext;
+import org.exist.xquery.value.Sequence;
+import org.exist.xquery.value.SequenceType;
+import org.exist.xquery.value.Type;
+
+/**
+ * XQuery extension function <code>get-big5-encoded-terms</code>.
+ * <p>
+ * Takes a Lucene query string, extracts its plain terms and returns an
+ * in-memory XML document that maps each term to its Big5 byte sequence in
+ * percent-escaped form:
+ * <pre>
+ * &lt;big5-mappings&gt;
+ *   &lt;big5-mapping&gt;
+ *     &lt;term&gt;...&lt;/term&gt;
+ *     &lt;mapping&gt;%xx%xx...&lt;/mapping&gt;
+ *   &lt;/big5-mapping&gt;
+ *   ...
+ * &lt;/big5-mappings&gt;
+ * </pre>
+ *
+ * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de)
+ */
+public class GetBig5EncodedTerms extends BasicFunction {
+
+  // NOTE(review): the declared return cardinality is EXACTLY_ONE, but eval()
+  // returns the empty sequence for an empty argument - confirm whether the
+  // declaration should be ZERO_OR_ONE.
+  public final static FunctionSignature signature =
+    new FunctionSignature(
+      new QName("get-big5-encoded-terms", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX),
+      "Extracts the plain terms of a Lucene query string and returns a big5-mappings element " +
+      "which maps each term to its percent-escaped Big5 byte sequence.",
+      new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) },
+      new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE));
+
+  /** Name of the charset used to encode the query terms. */
+  private static final String BIG5_CHARSET = "big5";
+
+  /** Lower-case hex digits (same case as produced by Integer.toHexString). */
+  private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
+
+  public GetBig5EncodedTerms(XQueryContext context) {
+    super(context, signature);
+  }
+
+  /**
+   * Builds the big5-mappings document for the Lucene query string in args[0].
+   *
+   * @param args            args[0] holds the Lucene query string
+   * @param contextSequence not used by this function
+   * @return the empty sequence if args[0] is empty, otherwise the in-memory
+   *         document containing one big5-mapping element per extracted term
+   * @throws XPathException if the Big5 charset is not available in this JVM
+   */
+  public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
+    Sequence luceneQueryStringSeq = args[0];
+    if (luceneQueryStringSeq.isEmpty()) {
+      return Sequence.EMPTY_SEQUENCE;
+    }
+    String luceneQueryString = luceneQueryStringSeq.getStringValue();
+    ArrayList<String> queryTerms = getTermsFromLuceneQuery(luceneQueryString);
+
+    MemTreeBuilder builder = context.getDocumentBuilder();
+    builder.startElement("", "big5-mappings", "big5-mappings", null);
+    for (String queryTerm : queryTerms) {
+      String big5EncodedQueryTerm = encodeBig5(queryTerm);
+      builder.startElement("", "big5-mapping", "big5-mapping", null);
+      builder.startElement("", "term", "term", null);
+      builder.characters(queryTerm);
+      builder.endElement();  // term
+      builder.startElement("", "mapping", "mapping", null);
+      builder.characters(big5EncodedQueryTerm);
+      builder.endElement();  // mapping
+      builder.endElement();  // big5-mapping
+    }
+    builder.endElement();  // big5-mappings
+
+    return (DocumentImpl) builder.getDocument();
+  }
+
+  /**
+   * Extracts the plain terms from a Lucene query string.
+   * <p>
+   * The query is split at whitespace. Tokens are dropped if they are one of
+   * the boolean operators AND, OR, NOT or if they contain one of the query
+   * syntax characters * ? ~ - + ^ . Empty tokens (caused by leading or
+   * repeated whitespace) are skipped so that no empty mappings are produced.
+   *
+   * TODO method is only simple: check all Lucene cases
+   * TODO throw the phrases away (e.g.: "bla bla bla")
+   *
+   * @param queryString the Lucene query string
+   * @return the remaining plain terms in query order (possibly empty)
+   */
+  private ArrayList<String> getTermsFromLuceneQuery(String queryString) {
+    ArrayList<String> terms = new ArrayList<String>();
+    String[] tokens = queryString.trim().split("\\s+");
+    for (String token : tokens) {
+      // "".split(...) yields one empty token: nothing to encode
+      if (token.length() == 0) {
+        continue;
+      }
+      if (isBooleanOperator(token) || containsQuerySyntax(token)) {
+        continue;
+      }
+      terms.add(token);
+    }
+    return terms;
+  }
+
+  /**
+   * Tests whether the token is a boolean operator. The whole token is compared
+   * (not a substring) so that ordinary terms which merely contain these
+   * letters, such as "WORD" or "NOTE", are kept.
+   */
+  private boolean isBooleanOperator(String token) {
+    return token.equals("OR") || token.equals("AND") || token.equals("NOT");
+  }
+
+  /** Tests whether the token contains a wildcard, fuzzy, boost or required/prohibited marker. */
+  private boolean containsQuerySyntax(String token) {
+    return token.contains("*") || token.contains("?") || token.contains("~")
+        || token.contains("-") || token.contains("+") || token.contains("^");
+  }
+
+  /**
+   * Encodes the given string in Big5 and writes every resulting byte as
+   * "%" followed by exactly two lower-case hex digits.
+   *
+   * @param inputStr the string to encode
+   * @return the percent-escaped Big5 bytes; empty for an empty input
+   * @throws XPathException if the Big5 charset is not supported by this JVM
+   */
+  private String encodeBig5(String inputStr) throws XPathException {
+    byte[] big5Bytes;
+    try {
+      big5Bytes = inputStr.getBytes(BIG5_CHARSET);
+    } catch (UnsupportedEncodingException e) {
+      // do not hide the problem behind an empty mapping
+      throw new XPathException("Charset '" + BIG5_CHARSET + "' is not supported: " + e.getMessage(), e);
+    }
+    StringBuilder result = new StringBuilder(big5Bytes.length * 3);
+    for (byte b : big5Bytes) {
+      int unsigned = b & 0xFF;  // bytes are signed in Java
+      // always two digits: values below 0x10 need a leading zero
+      result.append('%');
+      result.append(HEX_DIGITS[unsigned >>> 4]);
+      result.append(HEX_DIGITS[unsigned & 0x0F]);
+    }
+    return result.toString();
+  }
+
+}