Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.Collections; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element; public class XmlTokenizer { private XmlTokenizerContentHandler xmlTokenizerContentHandler; private Reader input; private String docId; private String language = "eng"; // default: english private String[] normFunctions = {"specialNorm"}; // default: use special norm function private String[] nwbElements = {"lb", "br", "cb", "hi"}; // non word breaking elements, default: these elements private String[] stopElements = {}; // stop elements: its tokens should not get word tags (when output format is "xml") or its tokens should be removed (if output format is "string") private String[] elements = {}; private String[] highlightTerms = {}; // highlight terms, default: no highlight terms private String outputFormat = "xml"; // default: xml private String[] outputOptions = {}; public XmlTokenizer(Reader input) { this.input = input; } public void setDocIdentifier(String docId) { this.docId = docId; } public void setLanguage(String lang) { String language = Language.getInstance().getLanguageId(lang); this.language = language; } public void setNormFunctions(String[] normFunctions) { this.normFunctions = normFunctions; } public void setNWBElements(String[] nwbElements) { this.nwbElements = nwbElements; } public void setStopElements(String[] stopElements) { this.stopElements = stopElements; } public void setElements(String[] elements) { this.elements = elements; } public void setOutputFormat(String outputFormat) { this.outputFormat = outputFormat; } public void setOutputOptions(String[] outputOptions) { this.outputOptions = outputOptions; } public void setHighlightTerms(String[] highlightTerms) { this.highlightTerms = highlightTerms; } public void tokenize() throws ApplicationException { try { xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language); xmlTokenizerContentHandler.setDocIdentifier(docId); xmlTokenizerContentHandler.setStopElements(stopElements); xmlTokenizerContentHandler.setNWBElements(nwbElements); xmlTokenizerContentHandler.setHighlightTerms(highlightTerms); xmlTokenizerContentHandler.setNormFunctions(normFunctions); xmlTokenizerContentHandler.setOutputOptions(outputOptions); xmlTokenizerContentHandler.setOutputFormat(outputFormat); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(xmlTokenizerContentHandler); InputSource inputSource = new InputSource(input); xmlParser.parse(inputSource); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } public String getXmlResult() throws ApplicationException { return xmlTokenizerContentHandler.getResultString(); } public ArrayList<Token> getResultTokens() { return xmlTokenizerContentHandler.getResultTokens(); } public int getPageCount() { return xmlTokenizerContentHandler.getPageCount(); } public ArrayList<Element> getElements(String elementNamesStr) { ArrayList<Element> retElements = new ArrayList<Element>(); String[] elementNames = elementNamesStr.split(" "); for (int i=0; i<elementNames.length; i++) { String elementName = elementNames[i]; ArrayList<Element> elements = xmlTokenizerContentHandler.getElements(elementName); if (elements != null) retElements.addAll(elements); Collections.sort(retElements); } return retElements; } public String getStringResult() throws ApplicationException { StringBuilder result = new StringBuilder(); ArrayList<Token> resultTokens = new ArrayList<Token>(); if (elements != null && elements.length > 0) { for (int i=0; i<elements.length; i++) { String elemName = elements[i]; ArrayList<XmlTokenizerContentHandler.Element> elems = getElements(elemName); for (int j=0; j<elems.size(); j++) { XmlTokenizerContentHandler.Element elem = elems.get(j); resultTokens.addAll(elem.getTokens()); } } } else { resultTokens = xmlTokenizerContentHandler.getResultTokens(); // all tokens } if (resultTokens != null) { for (int i=0; i<resultTokens.size(); i++) { Token token = resultTokens.get(i); if (! withLemmas(outputOptions)) { if (useNormFunction()) { String contentNorm = token.getContentNorm(); if (contentNorm != null) result.append(contentNorm + " "); } else if (useRegFunction()) { String contentReg = token.getContentReg(); if (contentReg != null) result.append(contentReg + " "); else { String contentOrig = token.getContentOrig(); if (contentOrig != null) result.append(contentOrig + " "); } } else { String contentOrig = token.getContentOrig(); if (contentOrig != null) result.append(contentOrig + " "); } } else { String contentMorph = token.getContentMorph(); if (contentMorph != null) result.append(contentMorph + " "); } } } return result.toString(); } private boolean withLemmas(String[] outputOptions) { boolean result = false; if (outputOptions != null) { for (int i=0; i< outputOptions.length; i++) { String function = outputOptions[i]; if (function.equals("withLemmas")) return true; } } return result; } private boolean useNormFunction() { boolean useNorm = false; if (normFunctions != null) { for (int i=0; i< normFunctions.length; i++) { String function = normFunctions[i]; if (function.equals("norm")) return true; } } return useNorm; } private boolean useRegFunction() { boolean useReg = false; if (normFunctions != null) { for (int i=0; i< normFunctions.length; i++) { String function = normFunctions[i]; if (function.equals("reg")) return true; } } return useReg; } }