comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 7d6d969b10cf
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
8 import java.net.URL; 8 import java.net.URL;
9 import java.util.ArrayList; 9 import java.util.ArrayList;
10 10
11 import org.apache.commons.io.FileUtils; 11 import org.apache.commons.io.FileUtils;
12 import org.apache.commons.io.IOUtils; 12 import org.apache.commons.io.IOUtils;
13 import org.xml.sax.InputSource;
14 import org.xml.sax.SAXException;
15 import org.xml.sax.XMLReader;
16
17 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
13 18
14 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 19 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
15 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; 20 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
16 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; 21 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry;
17 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; 22 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
18 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; 23 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
19 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; 24 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
20 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; 25 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
21 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; 26 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
27 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.TokenizerNew;
28 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
22 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; 29 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
23 30
24 public class TestLocal { 31 public class TestLocal {
25 private LexHandler lexHandler; 32 private LexHandler lexHandler;
26 33
27 public static void main(String[] args) throws ApplicationException { 34 public static void main(String[] args) throws ApplicationException {
28 try { 35 try {
29 TestLocal test = new TestLocal(); 36 TestLocal test = new TestLocal();
30 test.init(); 37 // test.init();
31 test.testCalls(); 38 // test.testCalls();
32 // test.tokenizeString(); 39 test.tokenizeString();
33 // test.tokenizeXmlFragment(); 40 // test.tokenizeXmlFragment();
41 // test.normalizeWords(13);
34 // test.getLexEntriesByLexiconBeginningWith("ls", "a"); 42 // test.getLexEntriesByLexiconBeginningWith("ls", "a");
35 // test.end(); 43 // test.end();
36 } catch (Exception e) { 44 } catch (Exception e) {
37 e.printStackTrace(); 45 e.printStackTrace();
38 } 46 }
45 private void end() throws ApplicationException { 53 private void end() throws ApplicationException {
46 lexHandler.end(); 54 lexHandler.end();
47 } 55 }
48 56
49 private ArrayList<Token> tokenizeString() throws ApplicationException { 57 private ArrayList<Token> tokenizeString() throws ApplicationException {
50 ArrayList<Token> tokens = new ArrayList<Token>(); 58 // StringReader reader = new StringReader("edo philoſophi ä bla");
51 try { 59 // StringReader reader = new StringReader("");
52 StringReader reader = new StringReader("edo philoſophi"); 60 StringReader reader = new StringReader("扞盗則李兗州");
53 // StringReader reader = new StringReader("扞盗則李兗州"); 61 TokenizerNew tokenizer = new TokenizerNew(reader, "zh");
54 Tokenizer tokenizer = new Tokenizer(reader); 62 ArrayList<Token> tokens = tokenizer.tokenize();
55 tokenizer.setLanguage("lat"); 63 System.out.println(tokens);
56 // tokenizer.setLanguage("zho");
57 String[] normFunctions = new String[1];
58 normFunctions[0] = "norm";
59 tokenizer.setNormFunctions(normFunctions);
60 tokens = tokenizer.getTokens();
61 tokenizer.end();
62 tokenizer.close();
63 } catch (IOException e) {
64 throw new ApplicationException(e);
65 }
66 return tokens; 64 return tokens;
67 } 65 }
68 66
69 private String tokenizeXmlFragment() throws ApplicationException { 67 private String tokenizeXmlFragment() throws ApplicationException {
70 String result = null; 68 String result = null;
77 xmlFragment = IOUtils.toString(in, "utf-8"); 75 xmlFragment = IOUtils.toString(in, "utf-8");
78 in.close(); 76 in.close();
79 77
80 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); 78 XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
81 xmlTokenizer.setLanguage("lat"); 79 xmlTokenizer.setLanguage("lat");
82 String[] normFunctions = new String[1]; 80 String[] stopElements = {"var"};
83 normFunctions[0] = "norm"; 81 // xmlTokenizer.setOutputFormat("string");
84 String[] stopElements = new String[1]; 82 String[] outputOptions = {"withLemmas"};
85 stopElements[0] = "var"; 83 xmlTokenizer.setOutputOptions(outputOptions);
86 xmlTokenizer.setNormFunctions(normFunctions);
87 xmlTokenizer.setStopElements(stopElements); 84 xmlTokenizer.setStopElements(stopElements);
88 result = xmlTokenizer.tokenize(); 85 xmlTokenizer.tokenize();
86 result = xmlTokenizer.getXmlResult();
89 System.out.println(result); 87 System.out.println(result);
90 } catch (Exception e) { 88 } catch (Exception e) {
91 throw new ApplicationException(e); 89 throw new ApplicationException(e);
92 } 90 }
93 return result; 91 return result;
105 int normMode = Normalizer.DICTIONARY; 103 int normMode = Normalizer.DICTIONARY;
106 getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode); 104 getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normMode);
107 } 105 }
108 106
109 private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException { 107 private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, int normMode) throws ApplicationException {
110 ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode); 108 ArrayList<Lemma> lemmas = lexHandler.getLemmas(query, inputType, language, normMode, false);
111 ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query); 109 ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName, query);
112 // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); 110 // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization);
113 String result = ""; 111 String result = "";
114 result = result + "<dictionaries>"; 112 result = result + "<dictionaries>";
115 for (int i=0; i<dictionaries.size(); i++) { 113 for (int i=0; i<dictionaries.size(); i++) {
127 125
128 private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { 126 private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException {
129 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50); 127 ArrayList<Lexicon> lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1, 50);
130 System.out.println(lexEntries); 128 System.out.println(lexEntries);
131 } 129 }
130
131 private String normalizeWords(int page) throws ApplicationException {
132 try {
133 String docDir = "/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585";
134 String pageFileName = docDir + "/" + "pages" + "/page-" + page + "-morph.xml";
135 File pageFile = new File(pageFileName);
136 String fragment = FileUtils.readFileToString(pageFile, "utf-8");
137 WordContentHandler wordContentHandler = new WordContentHandler();
138 XMLReader xmlParser = new SAXParser();
139 xmlParser.setContentHandler(wordContentHandler);
140 StringReader strReader = new StringReader(fragment);
141 InputSource inputSource = new InputSource(strReader);
142 xmlParser.parse(inputSource);
143 String result = wordContentHandler.getResult();
144 return result;
145 } catch (SAXException e) {
146 throw new ApplicationException(e);
147 } catch (IOException e) {
148 throw new ApplicationException(e);
149 }
150 }
132 } 151 }