view src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.xml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom2.Element;

import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.ParagraphType;
import de.mpiwg.anteater.text.TextPart;
import de.mpiwg.anteater.text.TextType;
import de.mpiwg.anteater.xml.ITextReader;

/**
 * Reads text sections out of an XML document and converts them into
 * {@link TextPart} objects made up of {@link Paragraph}s.
 *
 * <p>Sections are located with the XPath expressions <code>//SUM</code> and
 * <code>//SUPLINF</code>. Every direct child element of a section becomes one
 * paragraph whose type is derived from the child's element name.</p>
 */
public class TextReader extends JDOMParser implements ITextReader {
	
	/** Element name (compared case-insensitively) of a normal paragraph. */
	protected String PARA_TYPE_NORM = "p";
	/** Element name (compared case-insensitively) of a heading. */
	protected String PARA_TYPE_HEADING = "hd";
	/** Element name (compared case-insensitively) of a signature. */
	protected String PARA_TYPE_SIG = "sig";
	/** Element name (compared case-insensitively) mapped to {@code ParagraphType.TYPE_FP}. */
	protected String PARA_TYPE_FP = "fp";

	/**
	 * Matches a "Filed &lt;digits and dashes&gt;;" fragment; group 1 is the date string.
	 * Compiled once because the expression never changes.
	 */
	private static final Pattern FILED_DATE_PATTERN = Pattern.compile("Filed ([0-9-]+?);");

	/**
	 * Creates a reader for the given file.
	 *
	 * @param filePath path handed on to the {@link JDOMParser} constructor
	 */
	public TextReader(String filePath) {
		super(filePath, true);
	}

	/**
	 * Returns one text part for every <code>SUM</code> element in the document.
	 *
	 * @return the text parts, typed as {@code TextType.TYPE_SUMMARY}; never {@code null}
	 */
	public List<TextPart> getSummaryTexts() {
		List<Element> results = executeXPath("//SUM", null);
		return getTextParts(results, TextType.TYPE_SUMMARY);
	}
	
	/**
	 * Returns one text part for every <code>SUPLINF</code> element in the document.
	 *
	 * @return the text parts, typed as {@code TextType.TYPE_SUPLINF}; never {@code null}
	 */
	public List<TextPart> getSupplementaryInformationTexts() {
		List<Element> results = executeXPath("//SUPLINF", null);
		return getTextParts(results, TextType.TYPE_SUPLINF);
	}
	
	/**
	 * Converts section elements into text parts.
	 *
	 * <p>Each resulting part carries the date extracted from the document's
	 * <code>/NOTICE/FRDOC</code> element (or {@code null} if none was found), its
	 * position within {@code textparts} as text index, and the given type. A
	 * paragraph is added for every direct child element for which
	 * {@code stripText} yields a non-null text; the paragraph list of a part is
	 * only created once the first paragraph is added.</p>
	 *
	 * @param textparts section elements to convert; {@code null} is treated as empty
	 * @param type text type assigned to every created part
	 * @return list with one text part per element of {@code textparts}, in the same order
	 */
	protected List<TextPart> getTextParts(List<Element> textparts, int type) {
		List<TextPart> textPartList = new ArrayList<TextPart>();
		if (textparts == null) {
			// The XPath query produced no result list; there is nothing to convert.
			return textPartList;
		}
		
		String date = extractFiledDate();
		
		// Running counter instead of textparts.indexOf(...), which rescans the
		// list for every element.
		int textIdx = 0;
		for (Element partNode : textparts) {
			TextPart part = new TextPart();
			part.setDate(date);
			part.setTextIdx(textIdx++);
			part.setType(type);
			
			for (Element child : partNode.getChildren()) {
				String text = stripText(child);
				if (text == null) {
					continue;
				}
				
				Paragraph paragraph = new Paragraph();
				paragraph.setParagraphText(text);
				assignParagraphType(paragraph, child.getName());
				
				if (part.getParagraphs() == null) {
					part.setParagraphs(new ArrayList<Paragraph>());
				}
				part.getParagraphs().add(paragraph);
			}
			
			textPartList.add(part);
		}
		return textPartList;
	}
	
	/**
	 * Extracts the date string from the text of the first
	 * <code>/NOTICE/FRDOC</code> element.
	 *
	 * @return the date captured by the last "Filed ...;" match, or {@code null}
	 *         if there is no such element or no match
	 */
	private String extractFiledDate() {
		List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
		if (frdocs == null || frdocs.isEmpty()) {
			return null;
		}
		
		String date = null;
		Matcher match = FILED_DATE_PATTERN.matcher(frdocs.get(0).getText());
		// If the fragment occurs several times, the last occurrence wins.
		while (match.find()) {
			date = match.group(1);
		}
		return date;
	}
	
	/**
	 * Sets the paragraph type that corresponds to the given element name.
	 *
	 * <p>The comparison uses {@link String#equalsIgnoreCase(String)} rather than
	 * lower-casing with the default locale, so that names such as "SIG" are
	 * still recognised when the JVM runs under a locale with special casing
	 * rules (e.g. Turkish dotless i).</p>
	 *
	 * @param paragraph paragraph whose type is set
	 * @param elementName name of the element the paragraph was created from
	 */
	private void assignParagraphType(Paragraph paragraph, String elementName) {
		String name = elementName.trim();
		
		if (name.equalsIgnoreCase(PARA_TYPE_FP)) {
			paragraph.setParagraphType(ParagraphType.TYPE_FP);
		} else if (name.equalsIgnoreCase(PARA_TYPE_HEADING)) {
			paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
		} else if (name.equalsIgnoreCase(PARA_TYPE_NORM)) {
			paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
		} else if (name.equalsIgnoreCase(PARA_TYPE_SIG)) {
			paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
		} else {
			paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
		}
	}
}