Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
8 import java.net.URL; | 8 import java.net.URL; |
9 import java.util.ArrayList; | 9 import java.util.ArrayList; |
10 | 10 |
11 import org.apache.commons.io.FileUtils; | 11 import org.apache.commons.io.FileUtils; |
12 import org.apache.commons.io.IOUtils; | 12 import org.apache.commons.io.IOUtils; |
13 import org.xml.sax.InputSource; | |
14 import org.xml.sax.SAXException; | |
15 import org.xml.sax.XMLReader; | |
16 | |
17 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
13 | 18 |
14 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 19 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
15 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; | 20 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; |
16 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; | 21 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; |
17 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | 22 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; |
18 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | 23 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; |
19 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; | 24 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; |
20 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; | 25 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; |
21 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; | 26 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; |
27 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.TokenizerNew; | |
28 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; | |
22 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; | 29 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; |
23 | 30 |
24 public class TestLocal { | 31 public class TestLocal { |
25 private LexHandler lexHandler; | 32 private LexHandler lexHandler; |
26 | 33 |
27 public static void main(String[] args) throws ApplicationException { | 34 public static void main(String[] args) throws ApplicationException { |
28 try { | 35 try { |
29 TestLocal test = new TestLocal(); | 36 TestLocal test = new TestLocal(); |
30 test.init(); | 37 // test.init(); |
31 test.testCalls(); | 38 // test.testCalls(); |
32 // test.tokenizeString(); | 39 test.tokenizeString(); |
33 // test.tokenizeXmlFragment(); | 40 // test.tokenizeXmlFragment(); |
41 // test.normalizeWords(13); | |
34 // test.getLexEntriesByLexiconBeginningWith("ls", "a"); | 42 // test.getLexEntriesByLexiconBeginningWith("ls", "a"); |
35 // test.end(); | 43 // test.end(); |
36 } catch (Exception e) { | 44 } catch (Exception e) { |
37 e.printStackTrace(); | 45 e.printStackTrace(); |
38 } | 46 } |
45 private void end() throws ApplicationException { | 53 private void end() throws ApplicationException { |
46 lexHandler.end(); | 54 lexHandler.end(); |
47 } | 55 } |
48 | 56 |
49 private ArrayList<Token> tokenizeString() throws ApplicationException { | 57 private ArrayList<Token> tokenizeString() throws ApplicationException { |
50 ArrayList<Token> tokens = new ArrayList<Token>(); | 58 // StringReader reader = new StringReader("edo philoſophi ä bla"); |
51 try { | 59 // StringReader reader = new StringReader(""); |
52 StringReader reader = new StringReader("edo philoſophi"); | 60 StringReader reader = new StringReader("扞盗則李兗州"); |
53 // StringReader reader = new StringReader("扞盗則李兗州"); | 61 TokenizerNew tokenizer = new TokenizerNew(reader, "zh"); |
54 Tokenizer tokenizer = new Tokenizer(reader); | 62 ArrayList<Token> tokens = tokenizer.tokenize(); |
55 tokenizer.setLanguage("lat"); | 63 System.out.println(tokens); |
56 // tokenizer.setLanguage("zho"); | |
57 String[] normFunctions = new String[1]; | |
58 normFunctions[0] = "norm"; | |
59 tokenizer.setNormFunctions(normFunctions); | |
60 tokens = tokenizer.getTokens(); | |
61 tokenizer.end(); | |
62 tokenizer.close(); | |
63 } catch (IOException e) { | |
64 throw new ApplicationException(e); | |
65 } | |
66 return tokens; | 64 return tokens; |
67 } | 65 } |
68 | 66 |
69 private String tokenizeXmlFragment() throws ApplicationException { | 67 private String tokenizeXmlFragment() throws ApplicationException { |
70 String result = null; | 68 String result = null; |
77 xmlFragment = IOUtils.toString(in, "utf-8"); | 75 xmlFragment = IOUtils.toString(in, "utf-8"); |
78 in.close(); | 76 in.close(); |
79 | 77 |
80 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); | 78 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); |
81 xmlTokenizer.setLanguage("lat"); | 79 xmlTokenizer.setLanguage("lat"); |
82 String[] normFunctions = new String[1]; | 80 String[] stopElements = {"var"}; |
83 normFunctions[0] = "norm"; | 81 // xmlTokenizer.setOutputFormat("string"); |
84 String[] stopElements = new String[1]; | 82 String[] outputOptions = {"withLemmas"}; |
85 stopElements[0] = "var"; | 83 xmlTokenizer.setOutputOptions(outputOptions); |
86 xmlTokenizer.setNormFunctions(normFunctions); | |
87 xmlTokenizer.setStopElements(stopElements); | 84 xmlTokenizer.setStopElements(stopElements); |
88 result = xmlTokenizer.tokenize(); | 85 xmlTokenizer.tokenize(); |
86 result = xmlTokenizer.getXmlResult(); | |
89 System.out.println(result); | 87 System.out.println(result); |
90 } catch (Exception e) { | 88 } catch (Exception e) { |
91 throw new ApplicationException(e); | 89 throw new ApplicationException(e); |
92 } | 90 } |
93 return result; | 91 return result; |
105 int normMode = Normalizer.DICTIONARY; | 103 int normMode = Normalizer.DICTIONARY; |
106 getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode); | 104 getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode); |
107 } | 105 } |
108 | 106 |
109 private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException { | 107 private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException { |
110 ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode); | 108 ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode, false); |
111 ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query); | 109 ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query); |
112 // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); | 110 // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); |
113 String result = ""; | 111 String result = ""; |
114 result = result + "<dictionaries>"; | 112 result = result + "<dictionaries>"; |
115 for (int i=0; i<dictionaries.size(); i++) { | 113 for (int i=0; i<dictionaries.size(); i++) { |
127 | 125 |
128 private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { | 126 private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { |
129 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50); | 127 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50); |
130 System.out.println(lexEntries); | 128 System.out.println(lexEntries); |
131 } | 129 } |
130 | |
131 private String normalizeWords(int page) throws ApplicationException { | |
132 try { | |
133 String docDir = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585"; | |
134 String pageFileName = docDir + "/" + "pages" + "/page-" + page + "-morph.xml"; | |
135 File pageFile = new File(pageFileName); | |
136 String fragment = FileUtils.readFileToString(pageFile, "utf-8"); | |
137 WordContentHandler wordContentHandler = new WordContentHandler(); | |
138 XMLReader xmlParser = new SAXParser(); | |
139 xmlParser.setContentHandler(wordContentHandler); | |
140 StringReader strReader = new StringReader(fragment); | |
141 InputSource inputSource = new InputSource(strReader); | |
142 xmlParser.parse(inputSource); | |
143 String result = wordContentHandler.getResult(); | |
144 return result; | |
145 } catch (SAXException e) { | |
146 throw new ApplicationException(e); | |
147 } catch (IOException e) { | |
148 throw new ApplicationException(e); | |
149 } | |
150 } | |
132 } | 151 } |