comparison src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:036535fcd179
1 package de.mpiwg.anteater.xml.impl;
2
3 import java.util.ArrayList;
4 import java.util.List;
5 import java.util.regex.Matcher;
6 import java.util.regex.Pattern;
7
8 import org.jdom2.Element;
9
10 import de.mpiwg.anteater.text.Paragraph;
11 import de.mpiwg.anteater.text.ParagraphType;
12 import de.mpiwg.anteater.text.TextPart;
13 import de.mpiwg.anteater.text.TextType;
14 import de.mpiwg.anteater.xml.ITextReader;
15
16 public class TextReader extends JDOMParser implements ITextReader {
17
18 protected String PARA_TYPE_NORM = "p";
19 protected String PARA_TYPE_HEADING = "hd";
20 protected String PARA_TYPE_SIG = "sig";
21 protected String PARA_TYPE_FP = "fp";
22
23 public TextReader(String filePath) {
24 super(filePath, true);
25 }
26
27 public List<TextPart> getSummaryTexts() {
28
29 List<Element> results = executeXPath("//SUM", null);
30
31 return getTextParts(results, TextType.TYPE_SUMMARY);
32 }
33
34 public List<TextPart> getSupplementaryInformationTexts() {
35 List<Element> results = executeXPath("//SUPLINF", null);
36
37 return getTextParts(results, TextType.TYPE_SUPLINF);
38 }
39
40 protected List<TextPart> getTextParts(List<Element> textparts, int type) {
41 List<TextPart> textPartList = new ArrayList<TextPart>();
42 List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
43
44 String date = null;
45 if (frdocs != null && frdocs.size() > 0) {
46 Element frdoc = frdocs.get(0);
47 String frdocString = frdoc.getText();
48
49 String pattern = "Filed ([0-9-]+?);";
50 Pattern pPattern = Pattern.compile(pattern);
51 Matcher match = pPattern.matcher(frdocString);
52 while (match.find()) {
53 date = match.group(1);
54 }
55 }
56
57 for (Element partNode : textparts) {
58 TextPart part = new TextPart();
59 part.setDate(date);
60
61 part.setTextIdx(textparts.indexOf(partNode));
62 part.setType(type);
63
64
65 List<Element> directChildren = partNode.getChildren();
66 for (Element child : directChildren) {
67
68 String text = stripText(child);
69
70 if (text != null) {
71 Paragraph paragraph = new Paragraph();
72 paragraph.setParagraphText(text);
73
74 if (child.getName().trim().toLowerCase().equals(PARA_TYPE_FP))
75 paragraph.setParagraphType(ParagraphType.TYPE_FP);
76 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_HEADING))
77 paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
78 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_NORM))
79 paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
80 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_SIG))
81 paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
82 else
83 paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
84
85 if (part.getParagraphs() == null)
86 part.setParagraphs(new ArrayList<Paragraph>());
87
88 part.getParagraphs().add(paragraph);
89 }
90 }
91
92 textPartList.add(part);
93 }
94 return textPartList;
95 }
96 }