diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 4a3641ae14d2
children
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java	Wed Dec 14 13:57:09 2011 +0100
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java	Tue Nov 27 12:35:19 2012 +0100
@@ -1,18 +1,43 @@
 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
 
 public class Token {
-  private String content;
-  private int start;
-  private int end;
+  private String docId;
+  private String language;
+  private int pageNumber;
+  private int lineNumber;
+  private String elementName; // e.g. "TEI:s"
+  private int elementPosition; // read through getPosition()
+  private int elementPagePosition; // read through getPagePosition(); assigned only via setElementPagePosition
+  private String xmlId;
+  private String xpath;
+  private String content; // original text content; not initialized by the 12-argument constructor
+  private String contentOrig;  // word form: lower-cased content (see toWordForm) unless set explicitly
+  private String contentReg;  // regularized text content
+  private String contentNorm;  // normalized word form
+  private String contentMorph;  // lemmas separated by a blank
+  private int start; // start position; set by the (start, end, content) constructor or setStart
+  private int end;  // end position; set by the (start, end, content) constructor or setEnd
   
   public Token(int start, int end, String content) {
     this.start = start;
     this.end = end;
     this.content = content;
+    this.contentOrig = toWordForm(); // word form derived from the content assigned above; NOTE(review): toWordForm() is public and overridable yet called from a constructor
   }
 
-  public String getContent() {
-    return content;
+  public Token(String docId, String language, int pageNumber, int lineNumber, int elementPosition, String elementName, String xmlId, String xpath, String contentOrig, String contentReg, String contentNorm, String contentMorph) { // leaves content, start, end and elementPagePosition at their defaults
+    this.contentOrig = contentOrig; // content variants are taken as given; nothing is derived here
+    this.contentReg = contentReg;
+    this.contentNorm = contentNorm;
+    this.contentMorph = contentMorph;
+    this.docId = docId;
+    this.language = language;
+    this.pageNumber = pageNumber;
+    this.lineNumber = lineNumber;
+    this.elementName = elementName; // note: the parameter list has elementPosition before elementName
+    this.elementPosition = elementPosition;
+    this.xmlId = xmlId;
+    this.xpath = xpath;
   }
 
   public int getStart() {
@@ -23,12 +48,157 @@
     return end;
   }
   
+  public String toWordForm() { // word form = lower-cased content; null when content is null
+    if (content == null) {
+      return null;
+    }
+    return content.toLowerCase(java.util.Locale.ROOT); // ROOT: the result must not depend on the JVM default locale (e.g. Turkish dotless i)
+  }
+  
   public String toString() {
     String retStr = "";
-    if (content != null)
-      retStr += content;
+    if (contentOrig != null)
+      retStr += contentOrig; // the word form is printed (omitted when null), followed below by "(start,end)"
     retStr = retStr + "(" + start + "," + end + ")";
     return retStr;
   }
-  
+ 
+  public String toXmlString() { // serializes selected fields as one <token> element; null strings are omitted, text values are XML-escaped
+    StringBuilder retStr = new StringBuilder("<token>");
+    appendElement(retStr, "docId", docId);
+    appendElement(retStr, "language", language);
+    retStr.append("<pageNumber>").append(pageNumber).append("</pageNumber>"); // int fields are always written
+    retStr.append("<elementPosition>").append(elementPosition).append("</elementPosition>");
+    retStr.append("<elementPagePosition>").append(elementPagePosition).append("</elementPagePosition>");
+    appendElement(retStr, "elementName", elementName);
+    appendElement(retStr, "contentOrig", contentOrig);
+    return retStr.append("</token>").toString();
+  }
+  private static void appendElement(StringBuilder sb, String name, String value) { // writes <name>value</name>; skips null values
+    if (value != null) {
+      String escaped = value.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"); // '&' first, so the entities produced here are not escaped again
+      sb.append('<').append(name).append('>').append(escaped).append("</").append(name).append('>');
+    }
+  }
+
+  public String getContent() { // original text content; null if it was never set
+    return this.content;
+  }
+
+  public void setContent(String content) { // also refreshes contentOrig so the word form follows the new content
+    this.content = content;
+    this.contentOrig = toWordForm(); // must come after the assignment above: toWordForm() reads this.content
+  }
+
+  public String getContentOrig() { // word form of the token; may be null
+    return this.contentOrig;
+  }
+
+  public void setContentOrig(String contentOrig) { // replaces the word form directly; content is left untouched
+    this.contentOrig = contentOrig;
+  }
+
+  public String getContentReg() { // regularized text content; may be null
+    return this.contentReg;
+  }
+
+  public void setContentReg(String contentReg) { // plain assignment of the regularized text content; null is accepted
+    this.contentReg = contentReg;
+  }
+
+  public String getContentNorm() { // normalized word form; may be null
+    return this.contentNorm;
+  }
+
+  public void setContentNorm(String contentNorm) { // plain assignment of the normalized word form; null is accepted
+    this.contentNorm = contentNorm;
+  }
+
+  public String getContentMorph() { // lemmas separated by a blank; may be null
+    return this.contentMorph;
+  }
+
+  public void setContentMorph(String contentMorph) { // plain assignment of the blank-separated lemma string; null is accepted
+    this.contentMorph = contentMorph;
+  }
+
+  public String getDocId() { // docId as stored; may be null
+    return this.docId;
+  }
+
+  public void setDocId(String docId) { // plain assignment; no validation, null is accepted
+    this.docId = docId;
+  }
+
+  public String getLanguage() { // language as stored; may be null
+    return this.language;
+  }
+
+  public void setLanguage(String language) { // plain assignment; no validation, null is accepted
+    this.language = language;
+  }
+
+  public int getPageNumber() { // pageNumber as stored; 0 if never assigned
+    return this.pageNumber;
+  }
+
+  public void setPageNumber(int pageNumber) { // plain assignment; no range check
+    this.pageNumber = pageNumber;
+  }
+
+  public int getLineNumber() { // lineNumber as stored; 0 if never assigned
+    return this.lineNumber;
+  }
+
+  public void setLineNumber(int lineNumber) { // plain assignment; no range check
+    this.lineNumber = lineNumber;
+  }
+
+  public int getPosition() { // returns elementPosition; NOTE(review): getter is named getPosition while the setter is setElementPosition
+    return this.elementPosition;
+  }
+
+  public void setElementPosition(int elementPosition) { // plain assignment; the value is read back through getPosition()
+    this.elementPosition = elementPosition;
+  }
+
+  public int getPagePosition() { // returns elementPagePosition; NOTE(review): getter is named getPagePosition while the setter is setElementPagePosition
+    return this.elementPagePosition;
+  }
+
+  public void setElementPagePosition(int elementPagePosition) { // neither constructor shown assigns this field, so it stays 0 until this is called
+    this.elementPagePosition = elementPagePosition;
+  }
+
+  public String getElementName() { // element name, e.g. "TEI:s"; may be null
+    return this.elementName;
+  }
+
+  public void setElementName(String elementName) { // plain assignment; no validation, null is accepted
+    this.elementName = elementName;
+  }
+
+  public String getXmlId() { // xmlId as stored; may be null
+    return this.xmlId;
+  }
+
+  public void setXmlId(String xmlId) { // plain assignment; no validation, null is accepted
+    this.xmlId = xmlId;
+  }
+
+  public String getXpath() { // xpath as stored; may be null
+    return this.xpath;
+  }
+
+  public void setXpath(String xpath) { // plain assignment; the expression is not parsed or checked here
+    this.xpath = xpath;
+  }
+
+  public void setStart(int start) { // plain assignment; not checked against end
+    this.start = start;
+  }
+
+  public void setEnd(int end) { // plain assignment; not checked against start
+    this.end = end;
+  }
 }
\ No newline at end of file