diff src/de/mpiwg/anteater/text/TextPartXMLTranslator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/text/TextPartXMLTranslator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,97 @@
+package de.mpiwg.anteater.text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.jdom2.Attribute;
+import org.jdom2.DataConversionException;
+import org.jdom2.Element;
+
+import de.mpiwg.anteater.xml.impl.JDOMParser;
+
+public class TextPartXMLTranslator {
+
+	public final static String SUMMARY = "summary";
+	public final static String SUPPLEINF = "supplInfo";
+	public final static String P = "p";
+	public final static String DATE = "date_filed";
+	public final static String DATE_TAG = "<" + DATE + ">";
+	public final static String DATE_TAG_CLOSE = "</" + DATE + ">";
+	public final static String SUMMARY_TAG = "<" + SUMMARY + ">";
+	public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">";
+	public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">";
+	public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">";
+	public final static String P_TAG_Start = "<" + P + " type=\"";
+	public final static String P_TAG_End = "\">";
+	public final static String P_TAG_CLOSE = "</" + P + ">";
+
+	public static String getXMLForText(TextPart textPart) {
+		StringBuffer sb = new StringBuffer();
+
+		if (textPart.getType() == TextType.TYPE_SUMMARY)
+			sb.append(SUMMARY_TAG);
+		else
+			sb.append(SUPPLEINF_TAG);
+
+		sb.append(DATE_TAG);
+		sb.append(textPart.getDate());
+		sb.append(DATE_TAG_CLOSE);
+
+		for (Paragraph p : textPart.getParagraphs()) {
+			sb.append(P_TAG_Start + p.getParagraphType() + P_TAG_End);
+			sb.append(StringEscapeUtils.escapeXml(p.getParagraphText()));
+			sb.append(P_TAG_CLOSE);
+		}
+
+		if (textPart.getType() == TextType.TYPE_SUMMARY)
+			sb.append(SUMMARY_TAG_CLOSE);
+		else
+			sb.append(SUPPLEINF_TAG_CLOSE);
+
+		return sb.toString();
+	}
+
+	public static List<TextPart> getTextPartForXML(List<String> xmls) {
+		List<TextPart> parts = new ArrayList<TextPart>();
+		for (String xml : xmls) {
+			JDOMParser parser = new JDOMParser(xml, false);
+			Element root = parser.getRoot();
+			TextPart part = new TextPart();
+			part.setParagraphs(new ArrayList<Paragraph>());
+
+			if (root.getName().trim().equals(SUMMARY))
+				part.setType(TextType.TYPE_SUMMARY);
+			else
+				part.setType(TextType.TYPE_SUPLINF);
+
+			part.setTextIdx(xmls.indexOf(xml));
+
+			for (Element child : root.getChildren()) {
+				if (child.getName().trim().equals(DATE)) {
+					part.setDate(child.getText());
+				} else {
+					String paratext = child.getText();
+					if (paratext != null) {
+						Paragraph para = new Paragraph();
+						para.setParagraphText(paratext);
+
+						Attribute typeAttr = child.getAttribute("type");
+						if (typeAttr != null)
+							try {
+								para.setParagraphType(typeAttr.getIntValue());
+							} catch (DataConversionException e) {
+								e.printStackTrace();
+								para.setParagraphType(ParagraphType.TYPE_OTHER);
+							}
+						part.getParagraphs().add(para);
+					}
+				}
+			}
+
+			parts.add(part);
+		}
+
+		return parts;
+	}
+}