Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java Wed Dec 14 13:57:09 2011 +0100 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java Tue Nov 27 12:35:19 2012 +0100 @@ -1,18 +1,43 @@ package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; public class Token { - private String content; - private int start; - private int end; + private String docId; + private String language; + private int pageNumber; + private int lineNumber; + private String elementName; // e.g. "TEI:s" + private int elementPosition; + private int elementPagePosition; + private String xmlId; + private String xpath; + private String content; // original text content + private String contentOrig; // word form + private String contentReg; // regularized text content + private String contentNorm; // normalized word form + private String contentMorph; // lemmas separated by blank + private int start; // start position + private int end; // end position public Token(int start, int end, String content) { this.start = start; this.end = end; this.content = content; + this.contentOrig = toWordForm(); } - public String getContent() { - return content; + public Token(String docId, String language, int pageNumber, int lineNumber, int elementPosition, String elementName, String xmlId, String xpath, String contentOrig, String contentReg, String contentNorm, String contentMorph) { + this.docId = docId; + this.language = language; + this.pageNumber = pageNumber; + this.lineNumber = lineNumber; + this.elementPosition = elementPosition; + this.elementName = elementName; + this.xmlId = xmlId; + this.xpath = xpath; + this.contentOrig = contentOrig; + this.contentReg = contentReg; + this.contentNorm = contentNorm; + this.contentMorph = contentMorph; } public int getStart() { @@ -23,12 +48,157 @@ return end; } + public String toWordForm() { + if (content != null) + return content.toLowerCase(); + else + return null; + } + public String toString() { String retStr = ""; - if (content != null) - retStr += content; + if (contentOrig != null) + retStr += contentOrig; retStr = retStr + "(" + start + "," + end + ")"; return retStr; } - + + public String toXmlString() { + StringBuilder retStr = new StringBuilder(); + retStr.append("<token>"); + if (docId != null) + retStr.append("<docId>" + docId + "</docId>"); + if (language != null) + retStr.append("<language>" + language + "</language>"); + retStr.append("<pageNumber>" + pageNumber + "</pageNumber>"); + retStr.append("<elementPosition>" + elementPosition + "</elementPosition>"); + retStr.append("<elementPagePosition>" + elementPagePosition + "</elementPagePosition>"); + if (elementName != null) + retStr.append("<elementName>" + elementName + "</elementName>"); + if (contentOrig != null) + retStr.append("<contentOrig>" + contentOrig + "</contentOrig>"); + retStr.append("</token>"); + return retStr.toString(); + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + this.contentOrig = toWordForm(); + } + + public String getContentOrig() { + return contentOrig; + } + + public void setContentOrig(String contentOrig) { + this.contentOrig = contentOrig; + } + + public String getContentReg() { + return contentReg; + } + + public void setContentReg(String contentReg) { + this.contentReg = contentReg; + } + + public String getContentNorm() { + return contentNorm; + } + + public void setContentNorm(String contentNorm) { + this.contentNorm = contentNorm; + } + + public String getContentMorph() { + return contentMorph; + } + + public void setContentMorph(String contentMorph) { + this.contentMorph = contentMorph; + } + + public String getDocId() { + return docId; + } + + public void setDocId(String docId) { + this.docId = docId; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public int getPageNumber() { + return pageNumber; + } + + public void setPageNumber(int pageNumber) { + this.pageNumber = pageNumber; + } + + public int getLineNumber() { + return lineNumber; + } + + public void setLineNumber(int lineNumber) { + this.lineNumber = lineNumber; + } + + public int getPosition() { + return elementPosition; + } + + public void setElementPosition(int elementPosition) { + this.elementPosition = elementPosition; + } + + public int getPagePosition() { + return elementPagePosition; + } + + public void setElementPagePosition(int elementPagePosition) { + this.elementPagePosition = elementPagePosition; + } + + public String getElementName() { + return elementName; + } + + public void setElementName(String elementName) { + this.elementName = elementName; + } + + public String getXmlId() { + return xmlId; + } + + public void setXmlId(String xmlId) { + this.xmlId = xmlId; + } + + public String getXpath() { + return xpath; + } + + public void setXpath(String xpath) { + this.xpath = xpath; + } + + public void setStart(int start) { + this.start = start; + } + + public void setEnd(int end) { + this.end = end; + } } \ No newline at end of file