Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.test; import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.net.URL; import java.util.ArrayList; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.TokenizerNew; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; public class TestLocal { private LexHandler lexHandler; public static void main(String[] args) throws ApplicationException { try { TestLocal test = new TestLocal(); // test.init(); // test.testCalls(); test.tokenizeString(); // test.tokenizeXmlFragment(); // test.normalizeWords(13); // test.getLexEntriesByLexiconBeginningWith("ls", "a"); // test.end(); } catch (Exception e) { e.printStackTrace(); } } private void init() throws ApplicationException { lexHandler = LexHandler.getInstance(); } private void end() throws ApplicationException { lexHandler.end(); } private ArrayList<Token> tokenizeString() throws ApplicationException { // StringReader reader = new StringReader("edo philoſophi ä bla"); // StringReader reader = new StringReader(""); StringReader reader = new StringReader("扞盗則李兗州"); TokenizerNew tokenizer = new TokenizerNew(reader, "zh"); ArrayList<Token> tokens = tokenizer.tokenize(); System.out.println(tokens); return tokens; } private String tokenizeXmlFragment() throws ApplicationException { String result = null; try { String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; URL srcUrl = new URL(srcUrlStr); InputStream inputStream = srcUrl.openStream(); BufferedInputStream in = new BufferedInputStream(inputStream); xmlFragment = IOUtils.toString(in, "utf-8"); in.close(); XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); xmlTokenizer.setLanguage("lat"); String[] stopElements = {"var"}; // xmlTokenizer.setOutputFormat("string"); String[] outputOptions = {"withLemmas"}; xmlTokenizer.setOutputOptions(outputOptions); xmlTokenizer.setStopElements(stopElements); xmlTokenizer.tokenize(); result = xmlTokenizer.getXmlResult(); System.out.println(result); } catch (Exception e) { throw new ApplicationException(e); } return result; } private void testCalls() throws ApplicationException { String query = "vergewissernd"; String language = "deu"; // String query = "ἱκανῶσ"; // String language = "el"; String inputType = "form"; String outputType = null; String outputFormat = "html"; String dictionaryName = null; int normMode = Normalizer.DICTIONARY; getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode); } private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException { ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode, false); ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query); // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); String result = ""; result = result + "<dictionaries>"; for (int i=0; i<dictionaries.size(); i++) { Lexicon lexicon = dictionaries.get(i); result = result + lexicon.toXmlString(); } result = result + "</dictionaries>"; System.out.println(result); } private void getLexEntriesByLexiconBeginningWith(String lexiconName, String prefix) throws ApplicationException { ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesByLexiconBeginningWith(lexiconName, prefix, 1, 50); System.out.println(lexEntries); } private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50); System.out.println(lexEntries); } private String normalizeWords(int page) throws ApplicationException { try { String docDir = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585"; String pageFileName = docDir + "/" + "pages" + "/page-" + page + "-morph.xml"; File pageFile = new File(pageFileName); String fragment = FileUtils.readFileToString(pageFile, "utf-8"); WordContentHandler wordContentHandler = new WordContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(wordContentHandler); StringReader strReader = new StringReader(fragment); InputSource inputSource = new InputSource(strReader); xmlParser.parse(inputSource); String result = wordContentHandler.getResult(); return result; } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } }