annotate src/de/mpiwg/anteater/text/TextPartXMLTranslator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.text;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import org.apache.commons.lang3.StringEscapeUtils;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import org.jdom2.Attribute;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import org.jdom2.DataConversionException;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import org.jdom2.Element;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.xml.impl.JDOMParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 public class TextPartXMLTranslator {
036535fcd179 anteater
jdamerow
parents:
diff changeset
14
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 public final static String SUMMARY = "summary";
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 public final static String SUPPLEINF = "supplInfo";
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 public final static String P = "p";
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 public final static String DATE = "date_filed";
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 public final static String DATE_TAG = "<" + DATE + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 public final static String DATE_TAG_CLOSE = "</" + DATE + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 public final static String SUMMARY_TAG = "<" + SUMMARY + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
22 public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 public final static String P_TAG_Start = "<" + P + " type=\"";
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 public final static String P_TAG_End = "\">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 public final static String P_TAG_CLOSE = "</" + P + ">";
036535fcd179 anteater
jdamerow
parents:
diff changeset
28
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 public static String getXMLForText(TextPart textPart) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 StringBuffer sb = new StringBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
31
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 if (textPart.getType() == TextType.TYPE_SUMMARY)
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 sb.append(SUMMARY_TAG);
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 sb.append(SUPPLEINF_TAG);
036535fcd179 anteater
jdamerow
parents:
diff changeset
36
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 sb.append(DATE_TAG);
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 sb.append(textPart.getDate());
036535fcd179 anteater
jdamerow
parents:
diff changeset
39 sb.append(DATE_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
40
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 for (Paragraph p : textPart.getParagraphs()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 sb.append(P_TAG_Start + p.getParagraphType() + P_TAG_End);
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 sb.append(StringEscapeUtils.escapeXml(p.getParagraphText()));
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 sb.append(P_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
46
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 if (textPart.getType() == TextType.TYPE_SUMMARY)
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 sb.append(SUMMARY_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 sb.append(SUPPLEINF_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
51
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 return sb.toString();
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
54
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 public static List<TextPart> getTextPartForXML(List<String> xmls) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 List<TextPart> parts = new ArrayList<TextPart>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 for (String xml : xmls) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 JDOMParser parser = new JDOMParser(xml, false);
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 Element root = parser.getRoot();
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 TextPart part = new TextPart();
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 part.setParagraphs(new ArrayList<Paragraph>());
036535fcd179 anteater
jdamerow
parents:
diff changeset
62
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 if (root.getName().trim().equals(SUMMARY))
036535fcd179 anteater
jdamerow
parents:
diff changeset
64 part.setType(TextType.TYPE_SUMMARY);
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 part.setType(TextType.TYPE_SUPLINF);
036535fcd179 anteater
jdamerow
parents:
diff changeset
67
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 part.setTextIdx(xmls.indexOf(xml));
036535fcd179 anteater
jdamerow
parents:
diff changeset
69
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 for (Element child : root.getChildren()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 if (child.getName().trim().equals(DATE)) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 part.setDate(child.getText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 } else {
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 String paratext = child.getText();
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 if (paratext != null) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 Paragraph para = new Paragraph();
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 para.setParagraphText(paratext);
036535fcd179 anteater
jdamerow
parents:
diff changeset
78
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 Attribute typeAttr = child.getAttribute("type");
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 if (typeAttr != null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 try {
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 para.setParagraphType(typeAttr.getIntValue());
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 } catch (DataConversionException e) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
84 e.printStackTrace();
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 para.setParagraphType(ParagraphType.TYPE_OTHER);
036535fcd179 anteater
jdamerow
parents:
diff changeset
86 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 part.getParagraphs().add(para);
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
91
036535fcd179 anteater
jdamerow
parents:
diff changeset
92 parts.add(part);
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
94
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 return parts;
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 }