comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 4a3641ae14d2
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2 2
3 public class Token { 3 public class Token {
4 private String content; 4 private String docId;
5 private int start; 5 private String language;
6 private int end; 6 private int pageNumber;
7 private int lineNumber;
8 private String elementName; // e.g. "TEI:s"
9 private int elementPosition;
10 private int elementPagePosition;
11 private String xmlId;
12 private String xpath;
13 private String content; // original text content
14 private String contentOrig; // word form
15 private String contentReg; // regularized text content
16 private String contentNorm; // normalized word form
17 private String contentMorph; // lemmas separated by blank
18 private int start; // start position
19 private int end; // end position
7 20
8 public Token(int start, int end, String content) { 21 public Token(int start, int end, String content) {
9 this.start = start; 22 this.start = start;
10 this.end = end; 23 this.end = end;
11 this.content = content; 24 this.content = content;
12 } 25 this.contentOrig = toWordForm();
13 26 }
14 public String getContent() { 27
15 return content; 28 public Token(String docId, String language, int pageNumber, int lineNumber, int elementPosition, String elementName, String xmlId, String xpath, String contentOrig, String contentReg, String contentNorm, String contentMorph) {
29 this.docId = docId;
30 this.language = language;
31 this.pageNumber = pageNumber;
32 this.lineNumber = lineNumber;
33 this.elementPosition = elementPosition;
34 this.elementName = elementName;
35 this.xmlId = xmlId;
36 this.xpath = xpath;
37 this.contentOrig = contentOrig;
38 this.contentReg = contentReg;
39 this.contentNorm = contentNorm;
40 this.contentMorph = contentMorph;
16 } 41 }
17 42
18 public int getStart() { 43 public int getStart() {
19 return start; 44 return start;
20 } 45 }
21 46
22 public int getEnd() { 47 public int getEnd() {
23 return end; 48 return end;
49 }
50
51 public String toWordForm() {
52 if (content != null)
53 return content.toLowerCase();
54 else
55 return null;
24 } 56 }
25 57
26 public String toString() { 58 public String toString() {
27 String retStr = ""; 59 String retStr = "";
28 if (content != null) 60 if (contentOrig != null)
29 retStr += content; 61 retStr += contentOrig;
30 retStr = retStr + "(" + start + "," + end + ")"; 62 retStr = retStr + "(" + start + "," + end + ")";
31 return retStr; 63 return retStr;
32 } 64 }
33 65
66 public String toXmlString() {
67 StringBuilder retStr = new StringBuilder();
68 retStr.append("<token>");
69 if (docId != null)
70 retStr.append("<docId>" + docId + "</docId>");
71 if (language != null)
72 retStr.append("<language>" + language + "</language>");
73 retStr.append("<pageNumber>" + pageNumber + "</pageNumber>");
74 retStr.append("<elementPosition>" + elementPosition + "</elementPosition>");
75 retStr.append("<elementPagePosition>" + elementPagePosition + "</elementPagePosition>");
76 if (elementName != null)
77 retStr.append("<elementName>" + elementName + "</elementName>");
78 if (contentOrig != null)
79 retStr.append("<contentOrig>" + contentOrig + "</contentOrig>");
80 retStr.append("</token>");
81 return retStr.toString();
82 }
83
84 public String getContent() {
85 return content;
86 }
87
88 public void setContent(String content) {
89 this.content = content;
90 this.contentOrig = toWordForm();
91 }
92
93 public String getContentOrig() {
94 return contentOrig;
95 }
96
97 public void setContentOrig(String contentOrig) {
98 this.contentOrig = contentOrig;
99 }
100
101 public String getContentReg() {
102 return contentReg;
103 }
104
105 public void setContentReg(String contentReg) {
106 this.contentReg = contentReg;
107 }
108
109 public String getContentNorm() {
110 return contentNorm;
111 }
112
113 public void setContentNorm(String contentNorm) {
114 this.contentNorm = contentNorm;
115 }
116
117 public String getContentMorph() {
118 return contentMorph;
119 }
120
121 public void setContentMorph(String contentMorph) {
122 this.contentMorph = contentMorph;
123 }
124
125 public String getDocId() {
126 return docId;
127 }
128
129 public void setDocId(String docId) {
130 this.docId = docId;
131 }
132
133 public String getLanguage() {
134 return language;
135 }
136
137 public void setLanguage(String language) {
138 this.language = language;
139 }
140
141 public int getPageNumber() {
142 return pageNumber;
143 }
144
145 public void setPageNumber(int pageNumber) {
146 this.pageNumber = pageNumber;
147 }
148
149 public int getLineNumber() {
150 return lineNumber;
151 }
152
153 public void setLineNumber(int lineNumber) {
154 this.lineNumber = lineNumber;
155 }
156
157 public int getPosition() {
158 return elementPosition;
159 }
160
161 public void setElementPosition(int elementPosition) {
162 this.elementPosition = elementPosition;
163 }
164
165 public int getPagePosition() {
166 return elementPagePosition;
167 }
168
169 public void setElementPagePosition(int elementPagePosition) {
170 this.elementPagePosition = elementPagePosition;
171 }
172
173 public String getElementName() {
174 return elementName;
175 }
176
177 public void setElementName(String elementName) {
178 this.elementName = elementName;
179 }
180
181 public String getXmlId() {
182 return xmlId;
183 }
184
185 public void setXmlId(String xmlId) {
186 this.xmlId = xmlId;
187 }
188
189 public String getXpath() {
190 return xpath;
191 }
192
193 public void setXpath(String xpath) {
194 this.xpath = xpath;
195 }
196
197 public void setStart(int start) {
198 this.start = start;
199 }
200
201 public void setEnd(int end) {
202 this.end = end;
203 }
34 } 204 }