Mercurial > hg > anteater
diff src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
// Reconstructed from the diff that adds b/src/de/mpiwg/anteater/xml/impl/TextReader.java (changeset 0:036535fcd179).
package de.mpiwg.anteater.xml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom2.Element;

import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.ParagraphType;
import de.mpiwg.anteater.text.TextPart;
import de.mpiwg.anteater.text.TextType;
import de.mpiwg.anteater.xml.ITextReader;

/**
 * Extracts text sections from an XML document and converts them into
 * {@link TextPart} objects made up of typed {@link Paragraph}s.
 *
 * <p>Two kinds of sections are supported: elements named {@code SUM}
 * (summaries) and elements named {@code SUPLINF} (supplementary information).
 * Each direct child element of such a section becomes one paragraph whose type
 * is derived from the child's element name.
 */
public class TextReader extends JDOMParser implements ITextReader {

    /**
     * Matches the filing date inside the text of the {@code FRDOC} element,
     * e.g. the {@code 2012-09-14} in {@code "Filed 2012-09-14;"}.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern FILED_DATE_PATTERN = Pattern.compile("Filed ([0-9-]+?);");

    // Lower-case element names that map to the individual paragraph types.
    // Deliberately left as non-final instance fields so that subclasses can
    // still reassign them, as the original declaration allowed.
    protected String PARA_TYPE_NORM = "p";
    protected String PARA_TYPE_HEADING = "hd";
    protected String PARA_TYPE_SIG = "sig";
    protected String PARA_TYPE_FP = "fp";

    /**
     * Creates a reader for the given file.
     *
     * @param filePath path handed to the {@code JDOMParser} constructor; the
     *                 meaning of the second constructor argument ({@code true})
     *                 is defined by {@code JDOMParser} and is not visible here
     */
    public TextReader(String filePath) {
        super(filePath, true);
    }

    /**
     * Returns one {@link TextPart} per {@code SUM} element found anywhere in
     * the document.
     *
     * @return the summary text parts, typed as {@code TextType.TYPE_SUMMARY}
     */
    public List<TextPart> getSummaryTexts() {
        List<Element> results = executeXPath("//SUM", null);
        return getTextParts(results, TextType.TYPE_SUMMARY);
    }

    /**
     * Returns one {@link TextPart} per {@code SUPLINF} element found anywhere
     * in the document.
     *
     * @return the supplementary-information text parts, typed as
     *         {@code TextType.TYPE_SUPLINF}
     */
    public List<TextPart> getSupplementaryInformationTexts() {
        List<Element> results = executeXPath("//SUPLINF", null);
        return getTextParts(results, TextType.TYPE_SUPLINF);
    }

    /**
     * Converts section elements into text parts.
     *
     * <p>Every resulting part carries the filing date of the document (or
     * {@code null} if none was found), its position within {@code textparts}
     * as text index, and the given type. A part's paragraph list is only
     * created once the first paragraph is added, so a section without usable
     * children keeps whatever {@code TextPart} initialises it to.
     *
     * @param textparts the section elements to convert; {@code null} is
     *                  treated like an empty list
     * @param type      the text type to set on every created part
     * @return a new, mutable list with one part per element of
     *         {@code textparts}, in the same order; never {@code null}
     */
    protected List<TextPart> getTextParts(List<Element> textparts, int type) {
        List<TextPart> textPartList = new ArrayList<TextPart>();
        if (textparts == null) {
            // The XPath helper is treated as nullable elsewhere in this class,
            // so guard here as well instead of failing with an NPE.
            return textPartList;
        }

        String date = findFilingDate();

        // A running counter replaces textparts.indexOf(partNode), which
        // rescanned the list for every element.
        int textIdx = 0;
        for (Element partNode : textparts) {
            TextPart part = new TextPart();
            part.setDate(date);
            part.setTextIdx(textIdx++);
            part.setType(type);

            List<Element> directChildren = partNode.getChildren();
            for (Element child : directChildren) {
                String text = stripText(child);
                if (text == null) {
                    // No text could be obtained for this child; skip it.
                    continue;
                }

                Paragraph paragraph = new Paragraph();
                paragraph.setParagraphText(text);
                assignParagraphType(paragraph, child.getName());

                if (part.getParagraphs() == null) {
                    part.setParagraphs(new ArrayList<Paragraph>());
                }
                part.getParagraphs().add(paragraph);
            }

            textPartList.add(part);
        }
        return textPartList;
    }

    /**
     * Reads the filing date from the first {@code /NOTICE/FRDOC} element.
     *
     * @return the date captured by the last {@code "Filed ...;"} occurrence in
     *         that element's text, or {@code null} if the element is missing
     *         or contains no such occurrence
     */
    private String findFilingDate() {
        List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
        if (frdocs == null || frdocs.isEmpty()) {
            return null;
        }

        String date = null;
        Matcher match = FILED_DATE_PATTERN.matcher(frdocs.get(0).getText());
        // Keep iterating so that the last occurrence wins, as before.
        while (match.find()) {
            date = match.group(1);
        }
        return date;
    }

    /**
     * Sets the paragraph type that corresponds to the given element name.
     *
     * <p>The name is trimmed and lower-cased with {@link Locale#ROOT} before
     * it is compared, so the result does not depend on the JVM's default
     * locale (a Turkish default locale would otherwise turn {@code "SIG"}
     * into {@code "sıg"} and miss the signature type). Names that match none
     * of the {@code PARA_TYPE_*} values yield {@code ParagraphType.TYPE_OTHER}.
     *
     * @param paragraph   the paragraph to update
     * @param elementName the name of the element the paragraph was built from
     */
    private void assignParagraphType(Paragraph paragraph, String elementName) {
        String name = elementName.trim().toLowerCase(Locale.ROOT);

        if (name.equals(PARA_TYPE_FP)) {
            paragraph.setParagraphType(ParagraphType.TYPE_FP);
        } else if (name.equals(PARA_TYPE_HEADING)) {
            paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
        } else if (name.equals(PARA_TYPE_NORM)) {
            paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
        } else if (name.equals(PARA_TYPE_SIG)) {
            paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
        } else {
            paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
        }
    }
}