Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java Wed Dec 14 13:57:09 2011 +0100
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java Tue Nov 27 12:35:19 2012 +0100
@@ -10,6 +10,11 @@
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
 
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
@@ -19,6 +24,8 @@
 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.TokenizerNew;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
 
 public class TestLocal {
@@ -27,10 +34,11 @@
   public static void main(String[] args) throws ApplicationException {
     try {
       TestLocal test = new TestLocal();
-      test.init();
-      test.testCalls();
-      // test.tokenizeString();
+      // test.init();
+      // test.testCalls();
+      test.tokenizeString();
       // test.tokenizeXmlFragment();
+      // test.normalizeWords(13);
       // test.getLexEntriesByLexiconBeginningWith("ls", "a");
       // test.end();
     } catch (Exception e) {
@@ -47,22 +55,12 @@
   }
 
   private ArrayList<Token> tokenizeString() throws ApplicationException {
-    ArrayList<Token> tokens = new ArrayList<Token>();
-    try {
-      StringReader reader = new StringReader("edo philoſophi");
-      // StringReader reader = new StringReader("扞盗則李兗州");
-      Tokenizer tokenizer = new Tokenizer(reader);
-      tokenizer.setLanguage("lat");
-      // tokenizer.setLanguage("zho");
-      String[] normFunctions = new String[1];
-      normFunctions[0] = "norm";
-      tokenizer.setNormFunctions(normFunctions);
-      tokens = tokenizer.getTokens();
-      tokenizer.end();
-      tokenizer.close();
-    } catch (IOException e) {
-      throw new ApplicationException(e);
-    }
+    // StringReader reader = new StringReader("edo philoſophi ä bla");
+    // StringReader reader = new StringReader("");
+    StringReader reader = new StringReader("扞盗則李兗州");
+    TokenizerNew tokenizer = new TokenizerNew(reader, "zh");
+    ArrayList<Token> tokens = tokenizer.tokenize();
+    System.out.println(tokens);
     return tokens;
   }
 
@@ -79,13 +77,13 @@
 
       XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
       xmlTokenizer.setLanguage("lat");
-      String[] normFunctions = new String[1];
-      normFunctions[0] = "norm";
-      String[] stopElements = new String[1];
-      stopElements[0] = "var";
-      xmlTokenizer.setNormFunctions(normFunctions);
+      String[] stopElements = {"var"};
+      // xmlTokenizer.setOutputFormat("string");
+      String[] outputOptions = {"withLemmas"};
+      xmlTokenizer.setOutputOptions(outputOptions);
       xmlTokenizer.setStopElements(stopElements);
-      result = xmlTokenizer.tokenize();
+      xmlTokenizer.tokenize();
+      result = xmlTokenizer.getXmlResult();
       System.out.println(result);
     } catch (Exception e) {
       throw new ApplicationException(e);
@@ -107,7 +105,7 @@
   }
 
   private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException {
-    ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode);
+    ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode, false);
     ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query);
     // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization);
     String result = "";
@@ -129,4 +127,25 @@
     ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50);
     System.out.println(lexEntries);
   }
+
+  private String normalizeWords(int page) throws ApplicationException {
+    try {
+      String docDir = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585";
+      String pageFileName = docDir + "/" + "pages" + "/page-" + page + "-morph.xml";
+      File pageFile = new File(pageFileName);
+      String fragment = FileUtils.readFileToString(pageFile, "utf-8");
+      WordContentHandler wordContentHandler = new WordContentHandler();
+      XMLReader xmlParser = new SAXParser();
+      xmlParser.setContentHandler(wordContentHandler);
+      StringReader strReader = new StringReader(fragment);
+      InputSource inputSource = new InputSource(strReader);
+      xmlParser.parse(inputSource);
+      String result = wordContentHandler.getResult();
+      return result;
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
 }