Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java Wed Dec 14 13:57:09 2011 +0100 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java Tue Nov 27 12:35:19 2012 +0100 @@ -2,6 +2,8 @@ import java.io.IOException; import java.io.Reader; +import java.util.ArrayList; +import java.util.Collections; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -11,19 +13,29 @@ import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element; public class XmlTokenizer { + private XmlTokenizerContentHandler xmlTokenizerContentHandler; private Reader input; + private String docId; private String language = "eng"; // default: english private String[] normFunctions = {"specialNorm"}; // default: use special norm function - private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements - private String[] stopElements = {}; // default: no stop elements + private String[] nwbElements = {"lb", "br", "cb", "hi"}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // stop elements: its tokens should not get word tags (when output format is "xml") or its tokens should be removed (if output format is "string") + private String[] elements = {}; + private String[] highlightTerms = {}; // highlight terms, default: no highlight terms + private String outputFormat = "xml"; // default: xml private String[] outputOptions = {}; public XmlTokenizer(Reader input) { this.input = input; } + public void setDocIdentifier(String docId) { + this.docId = docId; + } + public void setLanguage(String lang) { String language = Language.getInstance().getLanguageId(lang); this.language = language; @@ -41,28 +53,149 @@ this.stopElements = stopElements; } + public void setElements(String[] elements) { + this.elements = elements; + } + + public void setOutputFormat(String outputFormat) { + this.outputFormat = outputFormat; + } + public void setOutputOptions(String[] outputOptions) { this.outputOptions = outputOptions; } - public String tokenize() throws ApplicationException { - String retString = null; + public void setHighlightTerms(String[] highlightTerms) { + this.highlightTerms = highlightTerms; + } + + public void tokenize() throws ApplicationException { try { - XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); - dictContentHandler.setStopElements(stopElements); - dictContentHandler.setNWBElements(nwbElements); - dictContentHandler.setOutputOptions(outputOptions); + xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language); + xmlTokenizerContentHandler.setDocIdentifier(docId); + xmlTokenizerContentHandler.setStopElements(stopElements); + xmlTokenizerContentHandler.setNWBElements(nwbElements); + xmlTokenizerContentHandler.setHighlightTerms(highlightTerms); + xmlTokenizerContentHandler.setNormFunctions(normFunctions); + xmlTokenizerContentHandler.setOutputOptions(outputOptions); + xmlTokenizerContentHandler.setOutputFormat(outputFormat); XMLReader xmlParser = new SAXParser(); - xmlParser.setContentHandler(dictContentHandler); + xmlParser.setContentHandler(xmlTokenizerContentHandler); InputSource inputSource = new InputSource(input); xmlParser.parse(inputSource); - retString = dictContentHandler.getXmlFragment(); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } - return retString; + } + + public String getXmlResult() throws ApplicationException { + return xmlTokenizerContentHandler.getResultString(); + } + + public ArrayList<Token> getResultTokens() { + return xmlTokenizerContentHandler.getResultTokens(); + } + + public int getPageCount() { + return xmlTokenizerContentHandler.getPageCount(); + } + + public ArrayList<Element> getElements(String elementNamesStr) { + ArrayList<Element> retElements = new ArrayList<Element>(); + String[] elementNames = elementNamesStr.split(" "); + for (int i=0; i<elementNames.length; i++) { + String elementName = elementNames[i]; + ArrayList<Element> elements = xmlTokenizerContentHandler.getElements(elementName); + if (elements != null) + retElements.addAll(elements); + Collections.sort(retElements); + } + return retElements; + } + + public String getStringResult() throws ApplicationException { + StringBuilder result = new StringBuilder(); + ArrayList<Token> resultTokens = new ArrayList<Token>(); + if (elements != null && elements.length > 0) { + for (int i=0; i<elements.length; i++) { + String elemName = elements[i]; + ArrayList<XmlTokenizerContentHandler.Element> elems = getElements(elemName); + for (int j=0; j<elems.size(); j++) { + XmlTokenizerContentHandler.Element elem = elems.get(j); + resultTokens.addAll(elem.getTokens()); + } + } + } else { + resultTokens = xmlTokenizerContentHandler.getResultTokens(); // all tokens + } + if (resultTokens != null) { + for (int i=0; i<resultTokens.size(); i++) { + Token token = resultTokens.get(i); + if (! withLemmas(outputOptions)) { + if (useNormFunction()) { + String contentNorm = token.getContentNorm(); + if (contentNorm != null) + result.append(contentNorm + " "); + } else if (useRegFunction()) { + String contentReg = token.getContentReg(); + if (contentReg != null) + result.append(contentReg + " "); + else { + String contentOrig = token.getContentOrig(); + if (contentOrig != null) + result.append(contentOrig + " "); + } + } else { + String contentOrig = token.getContentOrig(); + if (contentOrig != null) + result.append(contentOrig + " "); + } + } else { + String contentMorph = token.getContentMorph(); + if (contentMorph != null) + result.append(contentMorph + " "); + } + } + } + return result.toString(); + } + + private boolean withLemmas(String[] outputOptions) { + boolean result = false; + if (outputOptions != null) { + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withLemmas")) + return true; + } + } + return result; + } + + private boolean useNormFunction() { + boolean useNorm = false; + if (normFunctions != null) { + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("norm")) + return true; + } + } + return useNorm; + } + + private boolean useRegFunction() { + boolean useReg = false; + if (normFunctions != null) { + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("reg")) + return true; + } + } + return useReg; } } \ No newline at end of file