Mercurial > hg > anteater

diff src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179
anteater
author: jdamerow
date: Fri, 14 Sep 2012 10:30:43 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/xml/impl/TextReader.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,96 @@
+package de.mpiwg.anteater.xml.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jdom2.Element;
+
+import de.mpiwg.anteater.text.Paragraph;
+import de.mpiwg.anteater.text.ParagraphType;
+import de.mpiwg.anteater.text.TextPart;
+import de.mpiwg.anteater.text.TextType;
+import de.mpiwg.anteater.xml.ITextReader;
+
+public class TextReader extends JDOMParser implements ITextReader {
+	
+	protected String PARA_TYPE_NORM = "p";
+	protected String PARA_TYPE_HEADING = "hd";
+	protected String PARA_TYPE_SIG = "sig";
+	protected String PARA_TYPE_FP = "fp";
+
+	public TextReader(String filePath) {
+		super(filePath, true);
+	}
+
+	public List<TextPart> getSummaryTexts() {
+		
+		List<Element> results = executeXPath("//SUM", null);
+		
+		return getTextParts(results, TextType.TYPE_SUMMARY);
+	}
+	
+	public List<TextPart> getSupplementaryInformationTexts() {
+		List<Element> results = executeXPath("//SUPLINF", null);
+		
+		return getTextParts(results, TextType.TYPE_SUPLINF);
+	}
+	
+	protected List<TextPart> getTextParts(List<Element> textparts, int type) {
+		List<TextPart> textPartList = new ArrayList<TextPart>();
+		List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
+		
+		String date = null;
+		if (frdocs != null && frdocs.size() > 0) {
+			Element frdoc = frdocs.get(0);
+			String frdocString = frdoc.getText();
+			
+			String pattern = "Filed ([0-9-]+?);";
+			Pattern pPattern = Pattern.compile(pattern);
+			Matcher match = pPattern.matcher(frdocString);
+			while (match.find()) {
+				date = match.group(1);
+			}
+		}
+		
+		for (Element partNode : textparts) {
+			TextPart part = new TextPart();
+			part.setDate(date);
+			
+			part.setTextIdx(textparts.indexOf(partNode));
+			part.setType(type);
+			
+			
+			List<Element> directChildren = partNode.getChildren();
+			for (Element child : directChildren) {
+				
+				String text = stripText(child);
+				
+				if (text != null) {
+					Paragraph paragraph = new Paragraph();
+					paragraph.setParagraphText(text);
+					
+					if (child.getName().trim().toLowerCase().equals(PARA_TYPE_FP))
+						paragraph.setParagraphType(ParagraphType.TYPE_FP);
+					else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_HEADING))
+						paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
+					else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_NORM))
+						paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
+					else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_SIG))
+						paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
+					else
+						paragraph.setParagraphType(ParagraphType.TYPE_OTHER);	
+					
+					if (part.getParagraphs() == null)
+						part.setParagraphs(new ArrayList<Paragraph>());
+					
+					part.getParagraphs().add(paragraph);
+				}
+			}
+			
+			textPartList.add(part);
+		}
+		return textPartList;
+	}
+}
author	jdamerow
date	Fri, 14 Sep 2012 10:30:43 +0200
parents
children