Mercurial > hg > anteater
diff src/de/mpiwg/anteater/text/TextPartXMLTranslator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/anteater/text/TextPartXMLTranslator.java Fri Sep 14 10:30:43 2012 +0200 @@ -0,0 +1,97 @@ +package de.mpiwg.anteater.text; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringEscapeUtils; +import org.jdom2.Attribute; +import org.jdom2.DataConversionException; +import org.jdom2.Element; + +import de.mpiwg.anteater.xml.impl.JDOMParser; + +public class TextPartXMLTranslator { + + public final static String SUMMARY = "summary"; + public final static String SUPPLEINF = "supplInfo"; + public final static String P = "p"; + public final static String DATE = "date_filed"; + public final static String DATE_TAG = "<" + DATE + ">"; + public final static String DATE_TAG_CLOSE = "</" + DATE + ">"; + public final static String SUMMARY_TAG = "<" + SUMMARY + ">"; + public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">"; + public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">"; + public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">"; + public final static String P_TAG_Start = "<" + P + " type=\""; + public final static String P_TAG_End = "\">"; + public final static String P_TAG_CLOSE = "</" + P + ">"; + + public static String getXMLForText(TextPart textPart) { + StringBuffer sb = new StringBuffer(); + + if (textPart.getType() == TextType.TYPE_SUMMARY) + sb.append(SUMMARY_TAG); + else + sb.append(SUPPLEINF_TAG); + + sb.append(DATE_TAG); + sb.append(textPart.getDate()); + sb.append(DATE_TAG_CLOSE); + + for (Paragraph p : textPart.getParagraphs()) { + sb.append(P_TAG_Start + p.getParagraphType() + P_TAG_End); + sb.append(StringEscapeUtils.escapeXml(p.getParagraphText())); + sb.append(P_TAG_CLOSE); + } + + if (textPart.getType() == TextType.TYPE_SUMMARY) + sb.append(SUMMARY_TAG_CLOSE); + else + sb.append(SUPPLEINF_TAG_CLOSE); + + return sb.toString(); + } + + public static List<TextPart> getTextPartForXML(List<String> xmls) { + List<TextPart> parts = new ArrayList<TextPart>(); + for (String xml : xmls) { + JDOMParser parser = new JDOMParser(xml, false); + Element root = parser.getRoot(); + TextPart part = new TextPart(); + part.setParagraphs(new ArrayList<Paragraph>()); + + if (root.getName().trim().equals(SUMMARY)) + part.setType(TextType.TYPE_SUMMARY); + else + part.setType(TextType.TYPE_SUPLINF); + + part.setTextIdx(xmls.indexOf(xml)); + + for (Element child : root.getChildren()) { + if (child.getName().trim().equals(DATE)) { + part.setDate(child.getText()); + } else { + String paratext = child.getText(); + if (paratext != null) { + Paragraph para = new Paragraph(); + para.setParagraphText(paratext); + + Attribute typeAttr = child.getAttribute("type"); + if (typeAttr != null) + try { + para.setParagraphType(typeAttr.getIntValue()); + } catch (DataConversionException e) { + e.printStackTrace(); + para.setParagraphType(ParagraphType.TYPE_OTHER); + } + part.getParagraphs().add(para); + } + } + } + + parts.add(part); + } + + return parts; + } +}