Mercurial > hg > mpdl-group
annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
rev | line source |
---|---|
19 | 1 package de.mpg.mpiwg.berlin.mpdl.test; |
2 | |
3 import java.io.BufferedInputStream; | |
4 import java.io.File; | |
5 import java.io.IOException; | |
6 import java.io.InputStream; | |
7 import java.io.StringReader; | |
8 import java.net.URL; | |
9 import java.util.ArrayList; | |
10 | |
11 import org.apache.commons.io.FileUtils; | |
12 import org.apache.commons.io.IOUtils; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
13 import org.xml.sax.InputSource; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
14 import org.xml.sax.SAXException; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
15 import org.xml.sax.XMLReader; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
16 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
17 import com.sun.org.apache.xerces.internal.parsers.SAXParser; |
19 | 18 |
19 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
20 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; | |
21 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; | |
22 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | |
23 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
24 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; |
19 | 25 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; |
26 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
27 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.TokenizerNew; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
28 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; |
19 | 29 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; |
30 | |
31 public class TestLocal { | |
32 private LexHandler lexHandler; | |
33 | |
34 public static void main(String[] args) throws ApplicationException { | |
35 try { | |
36 TestLocal test = new TestLocal(); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
37 // test.init(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
38 // test.testCalls(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
39 test.tokenizeString(); |
19 | 40 // test.tokenizeXmlFragment(); |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
41 // test.normalizeWords(13); |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
42 // test.getLexEntriesByLexiconBeginningWith("ls", "a"); |
19 | 43 // test.end(); |
44 } catch (Exception e) { | |
45 e.printStackTrace(); | |
46 } | |
47 } | |
48 | |
49 private void init() throws ApplicationException { | |
50 lexHandler = LexHandler.getInstance(); | |
51 } | |
52 | |
53 private void end() throws ApplicationException { | |
54 lexHandler.end(); | |
55 } | |
56 | |
57 private ArrayList<Token> tokenizeString() throws ApplicationException { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
58 // StringReader reader = new StringReader("edo philoſophi ä bla"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
59 // StringReader reader = new StringReader(""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
60 StringReader reader = new StringReader("扞盗則李兗州"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
61 TokenizerNew tokenizer = new TokenizerNew(reader, "zh"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
62 ArrayList<Token> tokens = tokenizer.tokenize(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
63 System.out.println(tokens); |
19 | 64 return tokens; |
65 } | |
66 | |
67 private String tokenizeXmlFragment() throws ApplicationException { | |
68 String result = null; | |
69 try { | |
70 String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); | |
71 String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; | |
72 URL srcUrl = new URL(srcUrlStr); | |
73 InputStream inputStream = srcUrl.openStream(); | |
74 BufferedInputStream in = new BufferedInputStream(inputStream); | |
75 xmlFragment = IOUtils.toString(in, "utf-8"); | |
76 in.close(); | |
77 | |
78 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); | |
79 xmlTokenizer.setLanguage("lat"); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
80 String[] stopElements = {"var"}; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
81 // xmlTokenizer.setOutputFormat("string"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
82 String[] outputOptions = {"withLemmas"}; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
83 xmlTokenizer.setOutputOptions(outputOptions); |
19 | 84 xmlTokenizer.setStopElements(stopElements); |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
85 xmlTokenizer.tokenize(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
86 result = xmlTokenizer.getXmlResult(); |
19 | 87 System.out.println(result); |
88 } catch (Exception e) { | |
89 throw new ApplicationException(e); | |
90 } | |
91 return result; | |
92 } | |
93 | |
94 private void testCalls() throws ApplicationException { | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
95 String query = "vergewissernd"; |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
96 String language = "deu"; |
19 | 97 // String query = "ἱκανῶσ"; |
98 // String language = "el"; | |
99 String inputType = "form"; | |
100 String outputType = null; | |
101 String outputFormat = "html"; | |
102 String dictionaryName = null; | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
103 int normMode = Normalizer.DICTIONARY; |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
104 getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode); |
19 | 105 } |
106 | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
107 private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException { |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
108 ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode, false); |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
109 ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query); |
19 | 110 // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); |
111 String result = ""; | |
112 result = result + "<dictionaries>"; | |
113 for (int i=0; i<dictionaries.size(); i++) { | |
114 Lexicon lexicon = dictionaries.get(i); | |
115 result = result + lexicon.toXmlString(); | |
116 } | |
117 result = result + "</dictionaries>"; | |
118 System.out.println(result); | |
119 } | |
120 | |
121 private void getLexEntriesByLexiconBeginningWith(String lexiconName, String prefix) throws ApplicationException { | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
122 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesByLexiconBeginningWith(lexiconName, prefix, 1, 50); |
19 | 123 System.out.println(lexEntries); |
124 } | |
125 | |
126 private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
127 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50); |
19 | 128 System.out.println(lexEntries); |
129 } | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
130 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
131 private String normalizeWords(int page) throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
132 try { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
133 String docDir = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585"; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
134 String pageFileName = docDir + "/" + "pages" + "/page-" + page + "-morph.xml"; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
135 File pageFile = new File(pageFileName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
136 String fragment = FileUtils.readFileToString(pageFile, "utf-8"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
137 WordContentHandler wordContentHandler = new WordContentHandler(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
138 XMLReader xmlParser = new SAXParser(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
139 xmlParser.setContentHandler(wordContentHandler); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
140 StringReader strReader = new StringReader(fragment); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
141 InputSource inputSource = new InputSource(strReader); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
142 xmlParser.parse(inputSource); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
143 String result = wordContentHandler.getResult(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
144 return result; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
145 } catch (SAXException e) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
146 throw new ApplicationException(e); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
147 } catch (IOException e) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
148 throw new ApplicationException(e); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
149 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
150 } |
19 | 151 } |