annotate src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.xml.impl;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.regex.Matcher;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.util.regex.Pattern;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import org.jdom2.Element;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import de.mpiwg.anteater.text.Paragraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.text.ParagraphType;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import de.mpiwg.anteater.text.TextPart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.text.TextType;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import de.mpiwg.anteater.xml.ITextReader;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 public class TextReader extends JDOMParser implements ITextReader {
036535fcd179 anteater
jdamerow
parents:
diff changeset
17
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 protected String PARA_TYPE_NORM = "p";
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 protected String PARA_TYPE_HEADING = "hd";
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 protected String PARA_TYPE_SIG = "sig";
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 protected String PARA_TYPE_FP = "fp";
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 public TextReader(String filePath) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 super(filePath, true);
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
26
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 public List<TextPart> getSummaryTexts() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
28
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 List<Element> results = executeXPath("//SUM", null);
036535fcd179 anteater
jdamerow
parents:
diff changeset
30
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 return getTextParts(results, TextType.TYPE_SUMMARY);
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
33
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 public List<TextPart> getSupplementaryInformationTexts() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 List<Element> results = executeXPath("//SUPLINF", null);
036535fcd179 anteater
jdamerow
parents:
diff changeset
36
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 return getTextParts(results, TextType.TYPE_SUPLINF);
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 protected List<TextPart> getTextParts(List<Element> textparts, int type) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 List<TextPart> textPartList = new ArrayList<TextPart>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null);
036535fcd179 anteater
jdamerow
parents:
diff changeset
43
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 String date = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 if (frdocs != null && frdocs.size() > 0) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 Element frdoc = frdocs.get(0);
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 String frdocString = frdoc.getText();
036535fcd179 anteater
jdamerow
parents:
diff changeset
48
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 String pattern = "Filed ([0-9-]+?);";
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 Pattern pPattern = Pattern.compile(pattern);
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 Matcher match = pPattern.matcher(frdocString);
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 while (match.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 date = match.group(1);
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
56
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 for (Element partNode : textparts) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 TextPart part = new TextPart();
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 part.setDate(date);
036535fcd179 anteater
jdamerow
parents:
diff changeset
60
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 part.setTextIdx(textparts.indexOf(partNode));
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 part.setType(type);
036535fcd179 anteater
jdamerow
parents:
diff changeset
63
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 List<Element> directChildren = partNode.getChildren();
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 for (Element child : directChildren) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
67
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 String text = stripText(child);
036535fcd179 anteater
jdamerow
parents:
diff changeset
69
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 if (text != null) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 Paragraph paragraph = new Paragraph();
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 paragraph.setParagraphText(text);
036535fcd179 anteater
jdamerow
parents:
diff changeset
73
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 if (child.getName().trim().toLowerCase().equals(PARA_TYPE_FP))
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 paragraph.setParagraphType(ParagraphType.TYPE_FP);
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_HEADING))
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 paragraph.setParagraphType(ParagraphType.TYPE_HEADING);
036535fcd179 anteater
jdamerow
parents:
diff changeset
78 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_NORM))
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH);
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_SIG))
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 paragraph.setParagraphType(ParagraphType.TYPE_OTHER);
036535fcd179 anteater
jdamerow
parents:
diff changeset
84
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 if (part.getParagraphs() == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
86 part.setParagraphs(new ArrayList<Paragraph>());
036535fcd179 anteater
jdamerow
parents:
diff changeset
87
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 part.getParagraphs().add(paragraph);
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
91
036535fcd179 anteater
jdamerow
parents:
diff changeset
92 textPartList.add(part);
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
94 return textPartList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 }