Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/xml/impl/TextReader.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:036535fcd179 |
---|---|
1 package de.mpiwg.anteater.xml.impl; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.List; | |
5 import java.util.regex.Matcher; | |
6 import java.util.regex.Pattern; | |
7 | |
8 import org.jdom2.Element; | |
9 | |
10 import de.mpiwg.anteater.text.Paragraph; | |
11 import de.mpiwg.anteater.text.ParagraphType; | |
12 import de.mpiwg.anteater.text.TextPart; | |
13 import de.mpiwg.anteater.text.TextType; | |
14 import de.mpiwg.anteater.xml.ITextReader; | |
15 | |
16 public class TextReader extends JDOMParser implements ITextReader { | |
17 | |
18 protected String PARA_TYPE_NORM = "p"; | |
19 protected String PARA_TYPE_HEADING = "hd"; | |
20 protected String PARA_TYPE_SIG = "sig"; | |
21 protected String PARA_TYPE_FP = "fp"; | |
22 | |
23 public TextReader(String filePath) { | |
24 super(filePath, true); | |
25 } | |
26 | |
27 public List<TextPart> getSummaryTexts() { | |
28 | |
29 List<Element> results = executeXPath("//SUM", null); | |
30 | |
31 return getTextParts(results, TextType.TYPE_SUMMARY); | |
32 } | |
33 | |
34 public List<TextPart> getSupplementaryInformationTexts() { | |
35 List<Element> results = executeXPath("//SUPLINF", null); | |
36 | |
37 return getTextParts(results, TextType.TYPE_SUPLINF); | |
38 } | |
39 | |
40 protected List<TextPart> getTextParts(List<Element> textparts, int type) { | |
41 List<TextPart> textPartList = new ArrayList<TextPart>(); | |
42 List<Element> frdocs = executeXPath("/NOTICE/FRDOC", null); | |
43 | |
44 String date = null; | |
45 if (frdocs != null && frdocs.size() > 0) { | |
46 Element frdoc = frdocs.get(0); | |
47 String frdocString = frdoc.getText(); | |
48 | |
49 String pattern = "Filed ([0-9-]+?);"; | |
50 Pattern pPattern = Pattern.compile(pattern); | |
51 Matcher match = pPattern.matcher(frdocString); | |
52 while (match.find()) { | |
53 date = match.group(1); | |
54 } | |
55 } | |
56 | |
57 for (Element partNode : textparts) { | |
58 TextPart part = new TextPart(); | |
59 part.setDate(date); | |
60 | |
61 part.setTextIdx(textparts.indexOf(partNode)); | |
62 part.setType(type); | |
63 | |
64 | |
65 List<Element> directChildren = partNode.getChildren(); | |
66 for (Element child : directChildren) { | |
67 | |
68 String text = stripText(child); | |
69 | |
70 if (text != null) { | |
71 Paragraph paragraph = new Paragraph(); | |
72 paragraph.setParagraphText(text); | |
73 | |
74 if (child.getName().trim().toLowerCase().equals(PARA_TYPE_FP)) | |
75 paragraph.setParagraphType(ParagraphType.TYPE_FP); | |
76 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_HEADING)) | |
77 paragraph.setParagraphType(ParagraphType.TYPE_HEADING); | |
78 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_NORM)) | |
79 paragraph.setParagraphType(ParagraphType.TYPE_PARAGRAPH); | |
80 else if (child.getName().trim().toLowerCase().equals(PARA_TYPE_SIG)) | |
81 paragraph.setParagraphType(ParagraphType.TYPE_SIGNATURE); | |
82 else | |
83 paragraph.setParagraphType(ParagraphType.TYPE_OTHER); | |
84 | |
85 if (part.getParagraphs() == null) | |
86 part.setParagraphs(new ArrayList<Paragraph>()); | |
87 | |
88 part.getParagraphs().add(paragraph); | |
89 } | |
90 } | |
91 | |
92 textPartList.add(part); | |
93 } | |
94 return textPartList; | |
95 } | |
96 } |