0
|
1 package de.mpiwg.anteater.xml.impl;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.List;
|
|
5 import java.util.regex.Matcher;
|
|
6 import java.util.regex.Pattern;
|
|
7
|
|
8 import org.jdom2.Element;
|
|
9
|
|
10 import de.mpiwg.anteater.text.Paragraph;
|
|
11 import de.mpiwg.anteater.text.ParagraphType;
|
|
12 import de.mpiwg.anteater.text.TextPart;
|
|
13 import de.mpiwg.anteater.text.TextType;
|
|
14 import de.mpiwg.anteater.xml.ITextReader;
|
|
15
|
|
16 public class TextReader extends JDOMParser implements ITextReader {
|
|
17
|
|
18 protected String PARA_TYPE_NORM = "p";
|
|
19 protected String PARA_TYPE_HEADING = "hd";
|
|
20 protected String PARA_TYPE_SIG = "sig";
|
|
21 protected String PARA_TYPE_FP = "fp";
|
|
22
|
|
23 public TextReader(String filePath) {
|
|
24 super(filePath, true);
|
|
25 }
|
|
26
|
|
27 public List<TextPart> getSummaryTexts() {
|
|
28
|
|
29 List<Element> results = executeXPath("//SUM", null);
|
|
30
|
|
31 return getTextParts(results, TextType.TYPE_SUMMARY);
|
|
32 }
|
|
33
|
|
34 public List<TextPart> getSupplementaryInformationTexts() {
|
|
35 List<Element> results = executeXPath("//SUPLINF", null);
|
|
36
|
|
37 return getTextParts(results, TextType.TYPE_SUPLINF);
|
|
38 }
|
|
39
|
|
40 protected List<TextPart> getTextParts(List<Element> textparts, int type) {
|
|
41 List<TextPart> textPartList = new ArrayList<TextPart>();
|
|
42 List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
|
|
43
|
|
44 String date = null;
|
|
45 if (frdocs != null && frdocs.size() > 0) {
|
|
46 Element frdoc = frdocs.get(0);
|
|
47 String frdocString = frdoc.getText();
|
|
48
|
|
49 String pattern = "Filed ([0-9-]+?);";
|
|
50 Pattern pPattern = Pattern.compile(pattern);
|
|
51 Matcher match = pPattern.matcher(frdocString);
|
|
52 while (match.find()) {
|
|
53 date = match.group(1);
|
|
54 }
|
|
55 }
|
|
56
|
|
57 for (Element partNode : textparts) {
|
|
58 TextPart part = new TextPart();
|
|
59 part.setDate(date);
|
|
60
|
|
61 part.setTextIdx(textparts.indexOf(partNode));
|
|
62 part.setType(type);
|
|
63
|
|
64
|
|
65 List<Element> directChildren = partNode.getChildren();
|
|
66 for (Element child : directChildren) {
|
|
67
|
|
68 String text = stripText(child);
|
|
69
|
|
70 if (text != null) {
|
|
71 Paragraph paragraph = new Paragraph();
|
|
72 paragraph.setParagraphText(text);
|
|
73
|
|
74 if (child.getName().trim().toLowerCase().equals(PARA_TYPE_FP))
|
|
75 paragraph.setParagraphType(ParagraphType.TYPE_FP);
|
|
76 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_HEADING))
|
|
77 paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
|
|
78 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_NORM))
|
|
79 paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
|
|
80 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_SIG))
|
|
81 paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
|
|
82 else
|
|
83 paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
|
|
84
|
|
85 if (part.getParagraphs() == null)
|
|
86 part.setParagraphs(new ArrayList<Paragraph>());
|
|
87
|
|
88 part.getParagraphs().add(paragraph);
|
|
89 }
|
|
90 }
|
|
91
|
|
92 textPartList.add(part);
|
|
93 }
|
|
94 return textPartList;
|
|
95 }
|
|
96 }
|