view src/de/mpiwg/anteater/text/TextPartXMLTranslator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.text;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringEscapeUtils;
import org.jdom2.Attribute;
import org.jdom2.DataConversionException;
import org.jdom2.Element;

import de.mpiwg.anteater.xml.impl.JDOMParser;

/**
 * Converts {@link TextPart} objects to a small XML fragment and back.
 *
 * <p>
 * The written fragment has the shape
 * {@code <summary><date_filed>..</date_filed><p type="..">..</p>...</summary>}
 * (or {@code <supplInfo>} as the root for every non-summary part).
 */
public class TextPartXMLTranslator {

	public final static String SUMMARY = "summary";
	public final static String SUPPLEINF = "supplInfo";
	public final static String P = "p";
	public final static String DATE = "date_filed";
	public final static String DATE_TAG = "<" + DATE + ">";
	public final static String DATE_TAG_CLOSE = "</" + DATE + ">";
	public final static String SUMMARY_TAG = "<" + SUMMARY + ">";
	public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">";
	public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">";
	public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">";
	public final static String P_TAG_Start = "<" + P + " type=\"";
	public final static String P_TAG_End = "\">";
	public final static String P_TAG_CLOSE = "</" + P + ">";

	/**
	 * Serializes a text part to its XML fragment.
	 *
	 * @param textPart
	 *            the part to serialize; its type selects the root element
	 *            ({@code summary} for {@code TextType.TYPE_SUMMARY},
	 *            {@code supplInfo} otherwise)
	 * @return the XML fragment; the date and every paragraph text are
	 *         XML-escaped
	 */
	public static String getXMLForText(TextPart textPart) {
		boolean isSummary = textPart.getType() == TextType.TYPE_SUMMARY;
		StringBuilder sb = new StringBuilder();

		sb.append(isSummary ? SUMMARY_TAG : SUPPLEINF_TAG);

		// Escape the date like the paragraph text so that characters such as
		// '&' or '<' cannot produce a fragment the reader fails to parse.
		// String.valueOf keeps the previous "null" output for a missing date.
		sb.append(DATE_TAG);
		sb.append(StringEscapeUtils.escapeXml(String.valueOf(textPart.getDate())));
		sb.append(DATE_TAG_CLOSE);

		for (Paragraph p : textPart.getParagraphs()) {
			sb.append(P_TAG_Start).append(p.getParagraphType()).append(P_TAG_End);
			sb.append(StringEscapeUtils.escapeXml(p.getParagraphText()));
			sb.append(P_TAG_CLOSE);
		}

		sb.append(isSummary ? SUMMARY_TAG_CLOSE : SUPPLEINF_TAG_CLOSE);

		return sb.toString();
	}

	/**
	 * Parses a list of XML fragments into text parts.
	 *
	 * @param xmls
	 *            the XML fragments, one per text part
	 * @return one {@link TextPart} per input string, in input order; each
	 *         part's text index is its position in {@code xmls}
	 */
	public static List<TextPart> getTextPartForXML(List<String> xmls) {
		List<TextPart> parts = new ArrayList<TextPart>();

		// Track the position explicitly: xmls.indexOf(xml) would report the
		// first occurrence for duplicate strings and rescans the list each time.
		int position = 0;
		for (String xml : xmls) {
			parts.add(parseTextPart(xml, position));
			position++;
		}

		return parts;
	}

	/**
	 * Builds a single text part from one XML fragment.
	 *
	 * @param xml
	 *            the XML fragment to parse
	 * @param textIdx
	 *            the index to store on the resulting part
	 * @return the populated text part
	 */
	private static TextPart parseTextPart(String xml, int textIdx) {
		JDOMParser parser = new JDOMParser(xml, false);
		Element root = parser.getRoot();

		TextPart part = new TextPart();
		part.setParagraphs(new ArrayList<Paragraph>());

		if (root.getName().trim().equals(SUMMARY)) {
			part.setType(TextType.TYPE_SUMMARY);
		} else {
			part.setType(TextType.TYPE_SUPLINF);
		}

		part.setTextIdx(textIdx);

		for (Element child : root.getChildren()) {
			if (child.getName().trim().equals(DATE)) {
				part.setDate(child.getText());
				continue;
			}

			// Every child that is not the date element is treated as a paragraph.
			String paratext = child.getText();
			if (paratext == null) {
				continue;
			}

			Paragraph para = new Paragraph();
			para.setParagraphText(paratext);

			// Without a "type" attribute the paragraph type is not set here.
			Attribute typeAttr = child.getAttribute("type");
			if (typeAttr != null) {
				try {
					para.setParagraphType(typeAttr.getIntValue());
				} catch (DataConversionException e) {
					// A non-numeric type is reported and mapped to TYPE_OTHER.
					e.printStackTrace();
					para.setParagraphType(ParagraphType.TYPE_OTHER);
				}
			}
			part.getParagraphs().add(para);
		}

		return part;
	}
}