0
|
1 package de.mpiwg.anteater.text;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.List;
|
|
5
|
|
6 import org.apache.commons.lang3.StringEscapeUtils;
|
|
7 import org.jdom2.Attribute;
|
|
8 import org.jdom2.DataConversionException;
|
|
9 import org.jdom2.Element;
|
|
10
|
|
11 import de.mpiwg.anteater.xml.impl.JDOMParser;
|
|
12
|
|
13 public class TextPartXMLTranslator {
|
|
14
|
|
15 public final static String SUMMARY = "summary";
|
|
16 public final static String SUPPLEINF = "supplInfo";
|
|
17 public final static String P = "p";
|
|
18 public final static String DATE = "date_filed";
|
|
19 public final static String DATE_TAG = "<" + DATE + ">";
|
|
20 public final static String DATE_TAG_CLOSE = "</" + DATE + ">";
|
|
21 public final static String SUMMARY_TAG = "<" + SUMMARY + ">";
|
|
22 public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">";
|
|
23 public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">";
|
|
24 public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">";
|
|
25 public final static String P_TAG_Start = "<" + P + " type=\"";
|
|
26 public final static String P_TAG_End = "\">";
|
|
27 public final static String P_TAG_CLOSE = "</" + P + ">";
|
|
28
|
|
29 public static String getXMLForText(TextPart textPart) {
|
|
30 StringBuffer sb = new StringBuffer();
|
|
31
|
|
32 if (textPart.getType() == TextType.TYPE_SUMMARY)
|
|
33 sb.append(SUMMARY_TAG);
|
|
34 else
|
|
35 sb.append(SUPPLEINF_TAG);
|
|
36
|
|
37 sb.append(DATE_TAG);
|
|
38 sb.append(textPart.getDate());
|
|
39 sb.append(DATE_TAG_CLOSE);
|
|
40
|
|
41 for (Paragraph p : textPart.getParagraphs()) {
|
|
42 sb.append(P_TAG_Start + p.getParagraphType() + P_TAG_End);
|
|
43 sb.append(StringEscapeUtils.escapeXml(p.getParagraphText()));
|
|
44 sb.append(P_TAG_CLOSE);
|
|
45 }
|
|
46
|
|
47 if (textPart.getType() == TextType.TYPE_SUMMARY)
|
|
48 sb.append(SUMMARY_TAG_CLOSE);
|
|
49 else
|
|
50 sb.append(SUPPLEINF_TAG_CLOSE);
|
|
51
|
|
52 return sb.toString();
|
|
53 }
|
|
54
|
|
55 public static List<TextPart> getTextPartForXML(List<String> xmls) {
|
|
56 List<TextPart> parts = new ArrayList<TextPart>();
|
|
57 for (String xml : xmls) {
|
|
58 JDOMParser parser = new JDOMParser(xml, false);
|
|
59 Element root = parser.getRoot();
|
|
60 TextPart part = new TextPart();
|
|
61 part.setParagraphs(new ArrayList<Paragraph>());
|
|
62
|
|
63 if (root.getName().trim().equals(SUMMARY))
|
|
64 part.setType(TextType.TYPE_SUMMARY);
|
|
65 else
|
|
66 part.setType(TextType.TYPE_SUPLINF);
|
|
67
|
|
68 part.setTextIdx(xmls.indexOf(xml));
|
|
69
|
|
70 for (Element child : root.getChildren()) {
|
|
71 if (child.getName().trim().equals(DATE)) {
|
|
72 part.setDate(child.getText());
|
|
73 } else {
|
|
74 String paratext = child.getText();
|
|
75 if (paratext != null) {
|
|
76 Paragraph para = new Paragraph();
|
|
77 para.setParagraphText(paratext);
|
|
78
|
|
79 Attribute typeAttr = child.getAttribute("type");
|
|
80 if (typeAttr != null)
|
|
81 try {
|
|
82 para.setParagraphType(typeAttr.getIntValue());
|
|
83 } catch (DataConversionException e) {
|
|
84 e.printStackTrace();
|
|
85 para.setParagraphType(ParagraphType.TYPE_OTHER);
|
|
86 }
|
|
87 part.getParagraphs().add(para);
|
|
88 }
|
|
89 }
|
|
90 }
|
|
91
|
|
92 parts.add(part);
|
|
93 }
|
|
94
|
|
95 return parts;
|
|
96 }
|
|
97 }
|