Mercurial > hg > anteater
view src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.xml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom2.Element;

import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.ParagraphType;
import de.mpiwg.anteater.text.TextPart;
import de.mpiwg.anteater.text.TextType;
import de.mpiwg.anteater.xml.ITextReader;

/**
 * Extracts text sections from an XML document and converts them into
 * {@link TextPart} objects made of typed {@link Paragraph}s.
 *
 * <p>Sections are located with XPath queries ({@code //SUM} and
 * {@code //SUPLINF}); every direct child element of a section becomes one
 * paragraph whose type is derived from the child's element name.
 */
public class TextReader extends JDOMParser implements ITextReader {

    /**
     * Captures the run of digits and dashes that follows the literal
     * {@code "Filed "} and is terminated by a semicolon. Compiled once because
     * the pattern never changes.
     */
    private static final Pattern FILED_DATE_PATTERN = Pattern.compile("Filed ([0-9-]+?);");

    // Lower-case element names that map to a specific paragraph type.
    // Kept as protected, non-final instance fields so that existing subclasses
    // which reassign them keep working.
    protected String PARA_TYPE_NORM = "p";
    protected String PARA_TYPE_HEADING = "hd";
    protected String PARA_TYPE_SIG = "sig";
    protected String PARA_TYPE_FP = "fp";

    /**
     * Creates a reader for the given file.
     *
     * @param filePath path handed to the {@code JDOMParser} constructor
     */
    public TextReader(String filePath) {
        // NOTE(review): the meaning of the boolean flag is defined by JDOMParser,
        // which is not visible here - confirm before changing it.
        super(filePath, true);
    }

    /**
     * Returns one text part per {@code SUM} element found anywhere in the document.
     *
     * @return the text parts, typed as {@code TextType.TYPE_SUMMARY}
     */
    public List<TextPart> getSummaryTexts() {
        List<Element> results = executeXPath("//SUM", null);
        return getTextParts(results, TextType.TYPE_SUMMARY);
    }

    /**
     * Returns one text part per {@code SUPLINF} element found anywhere in the document.
     *
     * @return the text parts, typed as {@code TextType.TYPE_SUPLINF}
     */
    public List<TextPart> getSupplementaryInformationTexts() {
        List<Element> results = executeXPath("//SUPLINF", null);
        return getTextParts(results, TextType.TYPE_SUPLINF);
    }

    /**
     * Converts section elements into text parts.
     *
     * <p>Each resulting part carries the filed date read from the document, its
     * position within {@code textparts}, the given {@code type}, and one
     * paragraph for every direct child element for which {@code stripText}
     * returns a non-null value.
     *
     * @param textparts section elements to convert; {@code null} or an empty
     *                  list yields an empty result
     * @param type      text type stored on every created part
     * @return a new list with one text part per element, in input order
     */
    protected List<TextPart> getTextParts(List<Element> textparts, int type) {
        List<TextPart> textPartList = new ArrayList<TextPart>();
        // The XPath result is null-checked for FRDOC below, so the section list
        // (which comes from the same call) is guarded the same way.
        if (textparts == null || textparts.isEmpty()) {
            return textPartList;
        }

        String date = findFiledDate();

        // Running counter instead of textparts.indexOf(...), which rescanned
        // the list for every element.
        int textIdx = 0;
        for (Element partNode : textparts) {
            TextPart part = new TextPart();
            part.setDate(date);
            part.setTextIdx(textIdx++);
            part.setType(type);

            for (Element child : partNode.getChildren()) {
                String text = stripText(child);
                if (text == null) {
                    continue;
                }

                Paragraph paragraph = new Paragraph();
                paragraph.setParagraphText(text);
                assignParagraphType(paragraph, child.getName());

                // The paragraph list is created lazily, so a part without any
                // usable child keeps whatever TextPart initialises it to.
                if (part.getParagraphs() == null) {
                    part.setParagraphs(new ArrayList<Paragraph>());
                }
                part.getParagraphs().add(paragraph);
            }

            textPartList.add(part);
        }
        return textPartList;
    }

    /**
     * Reads the filed date from the text of the first {@code /NOTICE/FRDOC} element.
     *
     * @return the last "Filed ...;" token found in that text, or {@code null}
     *         when the element is missing or contains no such token
     */
    private String findFiledDate() {
        List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
        if (frdocs == null || frdocs.isEmpty()) {
            return null;
        }

        String date = null;
        Matcher match = FILED_DATE_PATTERN.matcher(frdocs.get(0).getText());
        // Deliberately keeps iterating: the last occurrence wins.
        while (match.find()) {
            date = match.group(1);
        }
        return date;
    }

    /**
     * Sets the paragraph type that corresponds to an element name.
     *
     * @param paragraph   paragraph to update
     * @param elementName name of the XML element the paragraph was built from
     */
    private void assignParagraphType(Paragraph paragraph, String elementName) {
        // Locale.ROOT keeps the comparison stable regardless of the JVM default
        // locale (e.g. Turkish would lower-case "SIG" to a dotless-i variant).
        String tag = elementName.trim().toLowerCase(Locale.ROOT);

        if (tag.equals(PARA_TYPE_FP)) {
            paragraph.setParagraphType(ParagraphType.TYPE_FP);
        } else if (tag.equals(PARA_TYPE_HEADING)) {
            paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
        } else if (tag.equals(PARA_TYPE_NORM)) {
            paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
        } else if (tag.equals(PARA_TYPE_SIG)) {
            paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
        } else {
            paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
        }
    }
}