Mercurial > hg > anteater
view src/de/mpiwg/anteater/text/TextPartXMLTranslator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.text;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang3.StringEscapeUtils;
import org.jdom2.Attribute;
import org.jdom2.DataConversionException;
import org.jdom2.Element;

import de.mpiwg.anteater.xml.impl.JDOMParser;

/**
 * Translates {@link TextPart} objects to a small XML fragment and back.
 *
 * <p>The fragment has a root element named {@link #SUMMARY} or
 * {@link #SUPPLEINF}, one {@link #DATE} child holding the date, and one
 * {@link #P} child per paragraph whose {@code type} attribute carries the
 * paragraph type.
 */
public class TextPartXMLTranslator {

    private static final Logger LOGGER =
            Logger.getLogger(TextPartXMLTranslator.class.getName());

    /** Root element name used for summary text parts. */
    public final static String SUMMARY = "summary";
    /** Root element name used for every text part that is not a summary. */
    public final static String SUPPLEINF = "supplInfo";
    /** Element name of a paragraph. */
    public final static String P = "p";
    /** Element name of the date child. */
    public final static String DATE = "date_filed";

    public final static String DATE_TAG = "<" + DATE + ">";
    public final static String DATE_TAG_CLOSE = "</" + DATE + ">";

    public final static String SUMMARY_TAG = "<" + SUMMARY + ">";
    public final static String SUMMARY_TAG_CLOSE = "</" + SUMMARY + ">";

    public final static String SUPPLEINF_TAG = "<" + SUPPLEINF + ">";
    public final static String SUPPLEINF_TAG_CLOSE = "</" + SUPPLEINF + ">";

    /** Opening of a paragraph tag up to and including the quote that starts the type value. */
    public final static String P_TAG_Start = "<" + P + " type=\"";
    /** Closing quote and bracket that finish an opening paragraph tag. */
    public final static String P_TAG_End = "\">";
    public final static String P_TAG_CLOSE = "</" + P + ">";

    /**
     * Serializes a text part to its XML fragment.
     *
     * <p>Both the date and the paragraph texts are XML-escaped, so values
     * containing characters such as {@code &} or {@code <} still yield a
     * fragment that {@link #getTextPartForXML(List)} can read back.
     *
     * @param textPart the text part to serialize; its paragraph list is iterated
     *                 and must therefore not be {@code null}
     * @return the XML fragment as a string
     */
    public static String getXMLForText(TextPart textPart) {
        boolean isSummary = textPart.getType() == TextType.TYPE_SUMMARY;

        // Local buffer, never shared between threads: StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        sb.append(isSummary ? SUMMARY_TAG : SUPPLEINF_TAG);

        sb.append(DATE_TAG);
        // Escape the date like the paragraph text; String.valueOf keeps the
        // textual form that appending the raw value produced before.
        sb.append(StringEscapeUtils.escapeXml(String.valueOf(textPart.getDate())));
        sb.append(DATE_TAG_CLOSE);

        for (Paragraph p : textPart.getParagraphs()) {
            sb.append(P_TAG_Start + p.getParagraphType() + P_TAG_End);
            sb.append(StringEscapeUtils.escapeXml(p.getParagraphText()));
            sb.append(P_TAG_CLOSE);
        }

        sb.append(isSummary ? SUMMARY_TAG_CLOSE : SUPPLEINF_TAG_CLOSE);
        return sb.toString();
    }

    /**
     * Parses a list of XML fragments into text parts.
     *
     * <p>Each resulting part receives its position in {@code xmls} as text
     * index. The position is counted while iterating, so identical fragments
     * occurring more than once still get distinct indices.
     *
     * @param xmls the XML fragments, one per text part
     * @return one text part per fragment, in input order
     */
    public static List<TextPart> getTextPartForXML(List<String> xmls) {
        List<TextPart> parts = new ArrayList<TextPart>();
        int textIdx = 0;
        for (String xml : xmls) {
            parts.add(parseTextPart(xml, textIdx));
            textIdx++;
        }
        return parts;
    }

    /** Parses a single XML fragment into a text part carrying the given text index. */
    private static TextPart parseTextPart(String xml, int textIdx) {
        // NOTE(review): the meaning of the boolean flag is defined by JDOMParser,
        // which is not visible here; the value is kept as it was.
        JDOMParser parser = new JDOMParser(xml, false);
        Element root = parser.getRoot();

        TextPart part = new TextPart();
        part.setParagraphs(new ArrayList<Paragraph>());

        // Any root element other than <summary> is treated as supplementary information.
        if (root.getName().trim().equals(SUMMARY)) {
            part.setType(TextType.TYPE_SUMMARY);
        } else {
            part.setType(TextType.TYPE_SUPLINF);
        }
        part.setTextIdx(textIdx);

        for (Element child : root.getChildren()) {
            if (child.getName().trim().equals(DATE)) {
                part.setDate(child.getText());
                continue;
            }

            // Every child that is not the date element is read as a paragraph.
            String paragraphText = child.getText();
            if (paragraphText == null) {
                continue;
            }
            Paragraph para = new Paragraph();
            para.setParagraphText(paragraphText);
            applyParagraphType(para, child.getAttribute("type"));
            part.getParagraphs().add(para);
        }
        return part;
    }

    /** Sets the paragraph type from the {@code type} attribute, using TYPE_OTHER if it is not an integer. */
    private static void applyParagraphType(Paragraph para, Attribute typeAttr) {
        if (typeAttr == null) {
            // No type attribute: the paragraph keeps whatever type it was created with.
            return;
        }
        try {
            para.setParagraphType(typeAttr.getIntValue());
        } catch (DataConversionException e) {
            LOGGER.log(Level.WARNING, "Paragraph type '" + typeAttr.getValue()
                    + "' is not an integer; falling back to TYPE_OTHER", e);
            para.setParagraphType(ParagraphType.TYPE_OTHER);
        }
    }
}