annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/DocHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 package de.mpiwg.dwinter.fulltextIndexer.harvester;
dc7622afcfea initial
dwinter
parents:
diff changeset
2
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* Harvests each complete book into a single entry
dc7622afcfea initial
dwinter
parents:
diff changeset
4 *
dc7622afcfea initial
dwinter
parents:
diff changeset
5 * */
dc7622afcfea initial
dwinter
parents:
diff changeset
6 import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
dc7622afcfea initial
dwinter
parents:
diff changeset
7
dc7622afcfea initial
dwinter
parents:
diff changeset
8 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
9
dc7622afcfea initial
dwinter
parents:
diff changeset
10 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
dc7622afcfea initial
dwinter
parents:
diff changeset
11
dc7622afcfea initial
dwinter
parents:
diff changeset
12 import java.io.BufferedReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
13
dc7622afcfea initial
dwinter
parents:
diff changeset
14 import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
15
dc7622afcfea initial
dwinter
parents:
diff changeset
16 import java.io.FileNotFoundException;
dc7622afcfea initial
dwinter
parents:
diff changeset
17
dc7622afcfea initial
dwinter
parents:
diff changeset
18 import java.io.BufferedInputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
19 import java.io.BufferedWriter;
dc7622afcfea initial
dwinter
parents:
diff changeset
20 import java.io.ByteArrayOutputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
21 import java.io.FileInputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
22 import java.io.FileOutputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
23 import java.io.FileReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
24 import java.io.FileWriter;
dc7622afcfea initial
dwinter
parents:
diff changeset
25 import java.io.InputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
26 import java.io.OutputStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
27 import java.io.OutputStreamWriter;
dc7622afcfea initial
dwinter
parents:
diff changeset
28 import java.io.StringWriter;
dc7622afcfea initial
dwinter
parents:
diff changeset
29
dc7622afcfea initial
dwinter
parents:
diff changeset
30 import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
31
dc7622afcfea initial
dwinter
parents:
diff changeset
32 import java.io.PrintStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
33
dc7622afcfea initial
dwinter
parents:
diff changeset
34 import java.util.ArrayList;
dc7622afcfea initial
dwinter
parents:
diff changeset
35
dc7622afcfea initial
dwinter
parents:
diff changeset
36 import java.util.Arrays;
dc7622afcfea initial
dwinter
parents:
diff changeset
37
dc7622afcfea initial
dwinter
parents:
diff changeset
38 import java.util.Date;
dc7622afcfea initial
dwinter
parents:
diff changeset
39
dc7622afcfea initial
dwinter
parents:
diff changeset
40 import java.util.HashMap;
dc7622afcfea initial
dwinter
parents:
diff changeset
41
dc7622afcfea initial
dwinter
parents:
diff changeset
42 import java.util.List;
dc7622afcfea initial
dwinter
parents:
diff changeset
43 import java.util.regex.Matcher;
dc7622afcfea initial
dwinter
parents:
diff changeset
44 import java.util.regex.Pattern;
dc7622afcfea initial
dwinter
parents:
diff changeset
45
dc7622afcfea initial
dwinter
parents:
diff changeset
46 import javax.xml.parsers.DocumentBuilder;
dc7622afcfea initial
dwinter
parents:
diff changeset
47 import javax.xml.parsers.DocumentBuilderFactory;
dc7622afcfea initial
dwinter
parents:
diff changeset
48 import javax.xml.parsers.ParserConfigurationException;
dc7622afcfea initial
dwinter
parents:
diff changeset
49 import javax.xml.parsers.SAXParser;
dc7622afcfea initial
dwinter
parents:
diff changeset
50 import javax.xml.transform.OutputKeys;
dc7622afcfea initial
dwinter
parents:
diff changeset
51 import javax.xml.transform.Transformer;
dc7622afcfea initial
dwinter
parents:
diff changeset
52 import javax.xml.transform.TransformerConfigurationException;
dc7622afcfea initial
dwinter
parents:
diff changeset
53 import javax.xml.transform.TransformerException;
dc7622afcfea initial
dwinter
parents:
diff changeset
54 import javax.xml.transform.TransformerFactory;
dc7622afcfea initial
dwinter
parents:
diff changeset
55 import javax.xml.transform.dom.DOMResult;
dc7622afcfea initial
dwinter
parents:
diff changeset
56 import javax.xml.transform.dom.DOMSource;
dc7622afcfea initial
dwinter
parents:
diff changeset
57 import javax.xml.transform.stream.StreamResult;
dc7622afcfea initial
dwinter
parents:
diff changeset
58 import javax.xml.transform.stream.StreamSource;
dc7622afcfea initial
dwinter
parents:
diff changeset
59
dc7622afcfea initial
dwinter
parents:
diff changeset
60 import org.apache.commons.io.IOUtils;
dc7622afcfea initial
dwinter
parents:
diff changeset
61 import org.apache.lucene.analysis.de.GermanAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
62
dc7622afcfea initial
dwinter
parents:
diff changeset
63 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
64
dc7622afcfea initial
dwinter
parents:
diff changeset
65 import org.apache.lucene.analysis.standard.StandardAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
66
dc7622afcfea initial
dwinter
parents:
diff changeset
67 import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
68
dc7622afcfea initial
dwinter
parents:
diff changeset
69 import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
70
dc7622afcfea initial
dwinter
parents:
diff changeset
71 import org.apache.lucene.util.Version;
dc7622afcfea initial
dwinter
parents:
diff changeset
72 import org.apache.ws.commons.serialize.XMLWriterImpl;
dc7622afcfea initial
dwinter
parents:
diff changeset
73
dc7622afcfea initial
dwinter
parents:
diff changeset
74 import org.jdom.Document;
dc7622afcfea initial
dwinter
parents:
diff changeset
75
dc7622afcfea initial
dwinter
parents:
diff changeset
76 import org.jdom.Element;
dc7622afcfea initial
dwinter
parents:
diff changeset
77
dc7622afcfea initial
dwinter
parents:
diff changeset
78 import org.jdom.JDOMException;
dc7622afcfea initial
dwinter
parents:
diff changeset
79
dc7622afcfea initial
dwinter
parents:
diff changeset
80 import org.jdom.input.SAXBuilder;
dc7622afcfea initial
dwinter
parents:
diff changeset
81 import org.jdom.xpath.XPath;
dc7622afcfea initial
dwinter
parents:
diff changeset
82 import org.w3c.dom.DocumentFragment;
dc7622afcfea initial
dwinter
parents:
diff changeset
83 import org.xml.sax.SAXException;
dc7622afcfea initial
dwinter
parents:
diff changeset
84
dc7622afcfea initial
dwinter
parents:
diff changeset
85 import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl;
dc7622afcfea initial
dwinter
parents:
diff changeset
86
dc7622afcfea initial
dwinter
parents:
diff changeset
87 public class DocHarvesterThreaded {
dc7622afcfea initial
dwinter
parents:
diff changeset
88 private static final boolean DEBUG = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
89 private static final int MAXFILES = 3;
dc7622afcfea initial
dwinter
parents:
diff changeset
90
dc7622afcfea initial
dwinter
parents:
diff changeset
91 //private static final String PREFIX = "/tmp/composed/files";
dc7622afcfea initial
dwinter
parents:
diff changeset
92 private static final String PREFIX = "/Volumes/data/composed/files";
dc7622afcfea initial
dwinter
parents:
diff changeset
93 private static final String COMPOSEDFN = "doc.xml";
dc7622afcfea initial
dwinter
parents:
diff changeset
94 private static final boolean CREATE_NEW = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
95
dc7622afcfea initial
dwinter
parents:
diff changeset
96 protected static ArrayList<String> fileTypesToIndex = new ArrayList(
dc7622afcfea initial
dwinter
parents:
diff changeset
97 Arrays.asList(new String[] { "xml" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
98
dc7622afcfea initial
dwinter
parents:
diff changeset
99 protected static ArrayList<String> excludeFolders = new ArrayList(
dc7622afcfea initial
dwinter
parents:
diff changeset
100 Arrays.asList(new String[] { "OCR" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
101 protected static boolean indexMetaPriority = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
102
dc7622afcfea initial
dwinter
parents:
diff changeset
103 private static String specialMode = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
104 protected static int maxThread = 30;
dc7622afcfea initial
dwinter
parents:
diff changeset
105 protected File docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
106 protected File index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
107 protected HashMap<String, String> textLanguage = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
108 protected HashMap<String, String> languageToISO = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
109 protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
dc7622afcfea initial
dwinter
parents:
diff changeset
110
dc7622afcfea initial
dwinter
parents:
diff changeset
111 private int counter = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
112 protected String languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
113 protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
dc7622afcfea initial
dwinter
parents:
diff changeset
114 private int filecount = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
115
dc7622afcfea initial
dwinter
parents:
diff changeset
116 protected String mdProviderUrl = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
117 private String preferedLanguage;
dc7622afcfea initial
dwinter
parents:
diff changeset
118 protected HashMap<String, String> supportedLanguageFolder = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
119 private int completedFiles = 0; // counter for all files completed and indexed
dc7622afcfea initial
dwinter
parents:
diff changeset
120
dc7622afcfea initial
dwinter
parents:
diff changeset
121 public DocHarvesterThreaded() {
dc7622afcfea initial
dwinter
parents:
diff changeset
122 }
dc7622afcfea initial
dwinter
parents:
diff changeset
123
dc7622afcfea initial
dwinter
parents:
diff changeset
124 public DocHarvesterThreaded(File docDir, File index_dir,
dc7622afcfea initial
dwinter
parents:
diff changeset
125 String languageFileName, String mdProviderUrl, String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
126 throws CorruptIndexException, LockObtainFailedException,
dc7622afcfea initial
dwinter
parents:
diff changeset
127 IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
128 /* 119 */this.docDir = docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
129 /* 120 */this.languageFileName = languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
130 /* 121 */this.preferedLanguage = lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
131
dc7622afcfea initial
dwinter
parents:
diff changeset
132 /* 133 */this.mdProviderUrl = mdProviderUrl;
dc7622afcfea initial
dwinter
parents:
diff changeset
133
dc7622afcfea initial
dwinter
parents:
diff changeset
134 /* 135 */this.index_dir = index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
135
dc7622afcfea initial
dwinter
parents:
diff changeset
136 /* 137 */for (int i = 0; i < maxThread; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
137 /* 139 */this.mythreads[i] = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
138 }
dc7622afcfea initial
dwinter
parents:
diff changeset
139
dc7622afcfea initial
dwinter
parents:
diff changeset
140 /* 142 */init_languages();
dc7622afcfea initial
dwinter
parents:
diff changeset
141 }
dc7622afcfea initial
dwinter
parents:
diff changeset
142
dc7622afcfea initial
dwinter
parents:
diff changeset
143 private void init_languages() {
dc7622afcfea initial
dwinter
parents:
diff changeset
144 /* 146 */this.languageToISO.put("German", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
145 /* 147 */this.languageToISO.put("French", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
146 /* 148 */this.languageToISO.put("English", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
147 /* 149 */this.languageToISO.put("German-f", "de-f");
dc7622afcfea initial
dwinter
parents:
diff changeset
148
dc7622afcfea initial
dwinter
parents:
diff changeset
149 /* 151 */this.supportedLanguageFolder.put("deu", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
150 /* 152 */this.supportedLanguageFolder.put("deu-f", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
151 /* 153 */this.supportedLanguageFolder.put("fra", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
152 /* 154 */this.supportedLanguageFolder.put("eng", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
153 /* 155 */this.supportedLanguageFolder.put("lic", "la");
dc7622afcfea initial
dwinter
parents:
diff changeset
154 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
155 /* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de",
dc7622afcfea initial
dwinter
parents:
diff changeset
156 new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
157 /* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en",
dc7622afcfea initial
dwinter
parents:
diff changeset
158 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
159 /* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr",
dc7622afcfea initial
dwinter
parents:
diff changeset
160 new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
161 /* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la",
dc7622afcfea initial
dwinter
parents:
diff changeset
162 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
163
dc7622afcfea initial
dwinter
parents:
diff changeset
164 /* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all",
dc7622afcfea initial
dwinter
parents:
diff changeset
165 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
166 /* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph",
dc7622afcfea initial
dwinter
parents:
diff changeset
167 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
168 } catch (CorruptIndexException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
169 /* 167 */e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
170 /* 168 */System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
171 } catch (LockObtainFailedException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
172 /* 170 */e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
173 /* 171 */System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
174 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
175 /* 173 */e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
176 /* 174 */System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
177 }
dc7622afcfea initial
dwinter
parents:
diff changeset
178 }
dc7622afcfea initial
dwinter
parents:
diff changeset
179
dc7622afcfea initial
dwinter
parents:
diff changeset
/**
 * Convenience constructor: same as the five-argument constructor with
 * neither a language file name nor a preferred language (both null).
 *
 * @param docDir        root directory of the documents to harvest
 * @param index_dir     directory passed to the language analyzers
 * @param mdProviderUrl URL of the metadata provider
 * @throws CorruptIndexException     propagated from the delegate constructor
 * @throws LockObtainFailedException propagated from the delegate constructor
 * @throws IOException               propagated from the delegate constructor
 */
public DocHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    this(docDir, index_dir, null, mdProviderUrl, null);
}
dc7622afcfea initial
dwinter
parents:
diff changeset
185
dc7622afcfea initial
dwinter
parents:
diff changeset
186 protected HashMap<String, String> loadLanguages() {
dc7622afcfea initial
dwinter
parents:
diff changeset
187 /* 187 */File languageFile = new File(this.docDir + "/"
dc7622afcfea initial
dwinter
parents:
diff changeset
188 + this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
189 /* 188 */String languageFilePath = this.docDir + "/"
dc7622afcfea initial
dwinter
parents:
diff changeset
190 + this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
191 /* 189 */HashMap languages = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
192 /* 190 */boolean relativ = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
193 /* 191 */if (this.languageFileName == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
194 /* 192 */return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
195 /* 193 */if (!languageFile.exists()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
196 /* 195 */languageFile = new File(this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
197 /* 196 */languageFilePath = this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
198 /* 197 */relativ = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
199 /* 198 */if (!languageFile.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
200 /* 199 */return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
201 }
dc7622afcfea initial
dwinter
parents:
diff changeset
202 BufferedReader in;
dc7622afcfea initial
dwinter
parents:
diff changeset
203 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
204 /* 203 */in = new BufferedReader(new FileReader(languageFilePath));
dc7622afcfea initial
dwinter
parents:
diff changeset
205 } catch (FileNotFoundException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
206 /* 205 */return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
207 }
dc7622afcfea initial
dwinter
parents:
diff changeset
208
dc7622afcfea initial
dwinter
parents:
diff changeset
209 /* 208 */String zeile = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
210 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
211 /* 210 */while ((zeile = in.readLine()) != null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
212 /* 211 */String[] splitted = zeile.replace("\"", "").split(
dc7622afcfea initial
dwinter
parents:
diff changeset
213 "[,]");
dc7622afcfea initial
dwinter
parents:
diff changeset
214 /* 212 */if (splitted.length == 2)
dc7622afcfea initial
dwinter
parents:
diff changeset
215 /* 213 */if (relativ)
dc7622afcfea initial
dwinter
parents:
diff changeset
216 /* 214 */languages.put(this.docDir + "/" + splitted[0],
dc7622afcfea initial
dwinter
parents:
diff changeset
217 splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
218 else
dc7622afcfea initial
dwinter
parents:
diff changeset
219 /* 216 */languages.put(splitted[0], splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
220 }
dc7622afcfea initial
dwinter
parents:
diff changeset
221 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
222 /* 220 */e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
223 /* 221 */return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
224 }
dc7622afcfea initial
dwinter
parents:
diff changeset
225
dc7622afcfea initial
dwinter
parents:
diff changeset
226 /* 224 */return languages;
dc7622afcfea initial
dwinter
parents:
diff changeset
227 }
dc7622afcfea initial
dwinter
parents:
diff changeset
228
dc7622afcfea initial
dwinter
parents:
diff changeset
229 public void harvestFromRDF(String rdffilepath) throws InterruptedException,
dc7622afcfea initial
dwinter
parents:
diff changeset
230 JDOMException {
dc7622afcfea initial
dwinter
parents:
diff changeset
231 /* 228 */Date start = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
232 /* 229 */boolean create = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
233 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
234 /* 240 */System.out.println("Indexing to directory '"
dc7622afcfea initial
dwinter
parents:
diff changeset
235 + this.index_dir + "'...");
dc7622afcfea initial
dwinter
parents:
diff changeset
236 /* 241 */ArrayList<String> files = getFileListFromRDF(rdffilepath);
dc7622afcfea initial
dwinter
parents:
diff changeset
237 /* 242 */indexDocs(files);
dc7622afcfea initial
dwinter
parents:
diff changeset
238 /* 243 */System.out.println("Optimizing...");
dc7622afcfea initial
dwinter
parents:
diff changeset
239 /* 244 */this.languageAnalyzers.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
240 /* 245 */this.languageAnalyzers.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
241
dc7622afcfea initial
dwinter
parents:
diff changeset
242 /* 247 */Date end = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
243 /* 248 */System.out.println(end.getTime() - start.getTime()
dc7622afcfea initial
dwinter
parents:
diff changeset
244 + " total milliseconds");
dc7622afcfea initial
dwinter
parents:
diff changeset
245 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
246 /* 251 */System.out.println(" caught a " + e.getClass() +
dc7622afcfea initial
dwinter
parents:
diff changeset
247 /* 252 */"\n with message: " + e.getMessage());
dc7622afcfea initial
dwinter
parents:
diff changeset
248 }
dc7622afcfea initial
dwinter
parents:
diff changeset
249 }
dc7622afcfea initial
dwinter
parents:
diff changeset
250
dc7622afcfea initial
dwinter
parents:
diff changeset
251 private ArrayList<String> getFileListFromRDF(String rdffilepath)
dc7622afcfea initial
dwinter
parents:
diff changeset
252 throws JDOMException, IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
253 /* 260 */ArrayList ret = new ArrayList();
dc7622afcfea initial
dwinter
parents:
diff changeset
254 /* 261 */SAXBuilder builder = new SAXBuilder();
dc7622afcfea initial
dwinter
parents:
diff changeset
255
dc7622afcfea initial
dwinter
parents:
diff changeset
256 /* 263 */Document doc = builder.build(rdffilepath);
dc7622afcfea initial
dwinter
parents:
diff changeset
257
dc7622afcfea initial
dwinter
parents:
diff changeset
258 /* 265 */Element el = doc.getRootElement();
dc7622afcfea initial
dwinter
parents:
diff changeset
259
dc7622afcfea initial
dwinter
parents:
diff changeset
260 /* 267 */XPath xpath = XPath.newInstance("//MPIWG:archive-path");
dc7622afcfea initial
dwinter
parents:
diff changeset
261 /* 268 */xpath.addNamespace("MPIWG",
dc7622afcfea initial
dwinter
parents:
diff changeset
262 "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
dc7622afcfea initial
dwinter
parents:
diff changeset
263 /* 269 */List<Element> paths = xpath.selectNodes(el);
dc7622afcfea initial
dwinter
parents:
diff changeset
264 /* 270 */for (Element path : paths) {
dc7622afcfea initial
dwinter
parents:
diff changeset
265 /* 271 */ret.add(path.getText());
dc7622afcfea initial
dwinter
parents:
diff changeset
266 }
dc7622afcfea initial
dwinter
parents:
diff changeset
267
dc7622afcfea initial
dwinter
parents:
diff changeset
268 /* 274 */return ret;
dc7622afcfea initial
dwinter
parents:
diff changeset
269 }
dc7622afcfea initial
dwinter
parents:
diff changeset
270
dc7622afcfea initial
dwinter
parents:
diff changeset
271 public void harvestFolder() throws InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
272 /* 278 */Date start = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
273 /* 279 */boolean create = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
274 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
275 /* 290 */System.out.println("Indexing to directory '"
dc7622afcfea initial
dwinter
parents:
diff changeset
276 + this.index_dir + "'...");
dc7622afcfea initial
dwinter
parents:
diff changeset
277 /* 291 */indexDocs(this.docDir);
dc7622afcfea initial
dwinter
parents:
diff changeset
278 /* 292 */System.out.println("Optimizing...");
dc7622afcfea initial
dwinter
parents:
diff changeset
279 /* 293 */this.languageAnalyzers.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
280 /* 294 */this.languageAnalyzers.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
281
dc7622afcfea initial
dwinter
parents:
diff changeset
282 /* 296 */Date end = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
283 /* 297 */System.out.println(end.getTime() - start.getTime()
dc7622afcfea initial
dwinter
parents:
diff changeset
284 + " total milliseconds");
dc7622afcfea initial
dwinter
parents:
diff changeset
285 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
286 /* 300 */System.out.println(" caught a " + e.getClass() +
dc7622afcfea initial
dwinter
parents:
diff changeset
287 /* 301 */"\n with message: " + e.getMessage());
dc7622afcfea initial
dwinter
parents:
diff changeset
288 }
dc7622afcfea initial
dwinter
parents:
diff changeset
289 }
dc7622afcfea initial
dwinter
parents:
diff changeset
290
dc7622afcfea initial
dwinter
parents:
diff changeset
291 private void indexDocs(ArrayList<String> files) throws IOException,
dc7622afcfea initial
dwinter
parents:
diff changeset
292 InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
293 /* 308 */for (String filename : files) {
dc7622afcfea initial
dwinter
parents:
diff changeset
294 /* 310 */indexDocs(new File(this.docDir.getAbsolutePath()
dc7622afcfea initial
dwinter
parents:
diff changeset
295 + filename));
dc7622afcfea initial
dwinter
parents:
diff changeset
296 if ((DEBUG == true) & (this.filecount > MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
297 break;
dc7622afcfea initial
dwinter
parents:
diff changeset
298 }
dc7622afcfea initial
dwinter
parents:
diff changeset
299 }
dc7622afcfea initial
dwinter
parents:
diff changeset
300
dc7622afcfea initial
dwinter
parents:
diff changeset
301 void indexDocs(File file) throws IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
302 /* 317 */if (!file.canRead())
dc7622afcfea initial
dwinter
parents:
diff changeset
303 return;
dc7622afcfea initial
dwinter
parents:
diff changeset
304 /* 319 */
dc7622afcfea initial
dwinter
parents:
diff changeset
305 /* 321 */if ((DEBUG == true) && (this.filecount > MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
306 return;
dc7622afcfea initial
dwinter
parents:
diff changeset
307 /* 325 */String[] files = file.list();
dc7622afcfea initial
dwinter
parents:
diff changeset
308
dc7622afcfea initial
dwinter
parents:
diff changeset
309 /* 327 */String folderName = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
310
dc7622afcfea initial
dwinter
parents:
diff changeset
311 boolean notExists = !checkFileExists(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
312 boolean createNew = CREATE_NEW || notExists;
dc7622afcfea initial
dwinter
parents:
diff changeset
313 // boolean createNew = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
314
dc7622afcfea initial
dwinter
parents:
diff changeset
315 boolean fileStillEmpty = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
316 if (createNew) {
dc7622afcfea initial
dwinter
parents:
diff changeset
317 clearFile(file); // loesche das gesamtfile
dc7622afcfea initial
dwinter
parents:
diff changeset
318 } else {
dc7622afcfea initial
dwinter
parents:
diff changeset
319 fileStillEmpty = false; // assume that file is not empty, if it already exists
dc7622afcfea initial
dwinter
parents:
diff changeset
320 }
dc7622afcfea initial
dwinter
parents:
diff changeset
321
dc7622afcfea initial
dwinter
parents:
diff changeset
322
dc7622afcfea initial
dwinter
parents:
diff changeset
323 if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0
dc7622afcfea initial
dwinter
parents:
diff changeset
324 : 1)) != 0) {
dc7622afcfea initial
dwinter
parents:
diff changeset
325 for (int i = 0; i < files.length; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
326 File nextFile = new File(file, files[i]);
dc7622afcfea initial
dwinter
parents:
diff changeset
327
dc7622afcfea initial
dwinter
parents:
diff changeset
328 if (nextFile.isDirectory()) // directory dann gehe in die
dc7622afcfea initial
dwinter
parents:
diff changeset
329 // naechste ebene
dc7622afcfea initial
dwinter
parents:
diff changeset
330 indexDocs(nextFile);
dc7622afcfea initial
dwinter
parents:
diff changeset
331
dc7622afcfea initial
dwinter
parents:
diff changeset
332 else if (isTextFile(nextFile)) {
dc7622afcfea initial
dwinter
parents:
diff changeset
333
dc7622afcfea initial
dwinter
parents:
diff changeset
334 if (createNew) {
dc7622afcfea initial
dwinter
parents:
diff changeset
335 fileStillEmpty = false; //datei hat jetzt einen Inhalt
dc7622afcfea initial
dwinter
parents:
diff changeset
336 compose(file, nextFile); // fuege das file an das
dc7622afcfea initial
dwinter
parents:
diff changeset
337 // gesamtfilean
dc7622afcfea initial
dwinter
parents:
diff changeset
338 }
dc7622afcfea initial
dwinter
parents:
diff changeset
339
dc7622afcfea initial
dwinter
parents:
diff changeset
340 }
dc7622afcfea initial
dwinter
parents:
diff changeset
341 if ((DEBUG == true) && (this.filecount > MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
342 break;
dc7622afcfea initial
dwinter
parents:
diff changeset
343 }
dc7622afcfea initial
dwinter
parents:
diff changeset
344 if (createNew) {
dc7622afcfea initial
dwinter
parents:
diff changeset
345 if (fileStillEmpty){
dc7622afcfea initial
dwinter
parents:
diff changeset
346 deleteComposedFile(file); // file hat keinen inhalt dann loeschen
dc7622afcfea initial
dwinter
parents:
diff changeset
347 } else {
dc7622afcfea initial
dwinter
parents:
diff changeset
348 finishFile(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
349 }
dc7622afcfea initial
dwinter
parents:
diff changeset
350 }
dc7622afcfea initial
dwinter
parents:
diff changeset
351
dc7622afcfea initial
dwinter
parents:
diff changeset
352 if (!fileStillEmpty)
dc7622afcfea initial
dwinter
parents:
diff changeset
353 processCompleteFile(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
354 /* 335 */} else {
dc7622afcfea initial
dwinter
parents:
diff changeset
355 /* 342 */System.out.println("not adding " + file);
dc7622afcfea initial
dwinter
parents:
diff changeset
356 }
dc7622afcfea initial
dwinter
parents:
diff changeset
357 }
dc7622afcfea initial
dwinter
parents:
diff changeset
358
dc7622afcfea initial
dwinter
parents:
diff changeset
359 private void finishFile(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
360 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
361 System.out.println();
dc7622afcfea initial
dwinter
parents:
diff changeset
362 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
363 System.out.println("finish file:" + cf.getCanonicalPath());
dc7622afcfea initial
dwinter
parents:
diff changeset
364 FileWriter fw = new FileWriter(cf, true);
dc7622afcfea initial
dwinter
parents:
diff changeset
365
dc7622afcfea initial
dwinter
parents:
diff changeset
366 fw.write("</document>");
dc7622afcfea initial
dwinter
parents:
diff changeset
367 fw.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
368
dc7622afcfea initial
dwinter
parents:
diff changeset
369 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
370 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
371 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
372 }
dc7622afcfea initial
dwinter
parents:
diff changeset
373
dc7622afcfea initial
dwinter
parents:
diff changeset
374 }
dc7622afcfea initial
dwinter
parents:
diff changeset
375
dc7622afcfea initial
dwinter
parents:
diff changeset
376 private boolean deleteComposedFile(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
377 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
378 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
379 System.out.println("file deleted, because empty:" + cf.getCanonicalPath());
dc7622afcfea initial
dwinter
parents:
diff changeset
380 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
381 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
382 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
383 }
dc7622afcfea initial
dwinter
parents:
diff changeset
384 return cf.delete();
dc7622afcfea initial
dwinter
parents:
diff changeset
385 }
dc7622afcfea initial
dwinter
parents:
diff changeset
386
dc7622afcfea initial
dwinter
parents:
diff changeset
387
dc7622afcfea initial
dwinter
parents:
diff changeset
388 private void processCompleteFile(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
389 System.out.println("Completed File:"+String.valueOf(completedFiles++));
dc7622afcfea initial
dwinter
parents:
diff changeset
390 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
391 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
392 processFile(cf);
dc7622afcfea initial
dwinter
parents:
diff changeset
393 } catch (CorruptIndexException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
394 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
395 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
396 } catch (LockObtainFailedException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
397 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
398 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
399 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
400 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
401 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
402 }
dc7622afcfea initial
dwinter
parents:
diff changeset
403
dc7622afcfea initial
dwinter
parents:
diff changeset
404 }
dc7622afcfea initial
dwinter
parents:
diff changeset
405
dc7622afcfea initial
dwinter
parents:
diff changeset
406 private boolean checkFileExists(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
407 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
408 return cf.exists();
dc7622afcfea initial
dwinter
parents:
diff changeset
409
dc7622afcfea initial
dwinter
parents:
diff changeset
410 }
dc7622afcfea initial
dwinter
parents:
diff changeset
411
dc7622afcfea initial
dwinter
parents:
diff changeset
412 private void clearFile(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
413 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
414 cf.delete();
dc7622afcfea initial
dwinter
parents:
diff changeset
415 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
416 File dir = cf.getParentFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
417 if (false == dir.exists()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
418 dir.mkdirs();
dc7622afcfea initial
dwinter
parents:
diff changeset
419 }
dc7622afcfea initial
dwinter
parents:
diff changeset
420
dc7622afcfea initial
dwinter
parents:
diff changeset
421 cf.createNewFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
422
dc7622afcfea initial
dwinter
parents:
diff changeset
423 FileWriter fw = new FileWriter(cf);
dc7622afcfea initial
dwinter
parents:
diff changeset
424 fw.write("<document>");
dc7622afcfea initial
dwinter
parents:
diff changeset
425 fw.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
426
dc7622afcfea initial
dwinter
parents:
diff changeset
427 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
428 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
429 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
430 }
dc7622afcfea initial
dwinter
parents:
diff changeset
431
dc7622afcfea initial
dwinter
parents:
diff changeset
432 }
dc7622afcfea initial
dwinter
parents:
diff changeset
433
dc7622afcfea initial
dwinter
parents:
diff changeset
434 private void compose(File folder, File file) {
dc7622afcfea initial
dwinter
parents:
diff changeset
435 File cf = getComposedFile(folder);
dc7622afcfea initial
dwinter
parents:
diff changeset
436 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
437 System.out.println("Adding" + file.getCanonicalPath());
dc7622afcfea initial
dwinter
parents:
diff changeset
438 //FileWriter fw = new FileWriter(cf, true);
dc7622afcfea initial
dwinter
parents:
diff changeset
439
dc7622afcfea initial
dwinter
parents:
diff changeset
440 FileOutputStream stream = new FileOutputStream(cf,true);
dc7622afcfea initial
dwinter
parents:
diff changeset
441
dc7622afcfea initial
dwinter
parents:
diff changeset
442 OutputStreamWriter fw = new OutputStreamWriter(stream, "utf-8");
dc7622afcfea initial
dwinter
parents:
diff changeset
443
dc7622afcfea initial
dwinter
parents:
diff changeset
444 String filteredDocument="";
dc7622afcfea initial
dwinter
parents:
diff changeset
445 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
446 filteredDocument = getFilteredFile(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
447 } catch (TransformerException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
448 filteredDocument = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
449 }catch (ParserConfigurationException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
450 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
451 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
452 } catch (SAXException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
453 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
454 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
455 }
dc7622afcfea initial
dwinter
parents:
diff changeset
456
dc7622afcfea initial
dwinter
parents:
diff changeset
457 fw.append(filteredDocument);
dc7622afcfea initial
dwinter
parents:
diff changeset
458 fw.write("<pb name=\"");
dc7622afcfea initial
dwinter
parents:
diff changeset
459 fw.write(file.getName());
dc7622afcfea initial
dwinter
parents:
diff changeset
460 fw.write("\"/>");
dc7622afcfea initial
dwinter
parents:
diff changeset
461 fw.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
462
dc7622afcfea initial
dwinter
parents:
diff changeset
463 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
464 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
465 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
466 }
dc7622afcfea initial
dwinter
parents:
diff changeset
467
dc7622afcfea initial
dwinter
parents:
diff changeset
468 }
dc7622afcfea initial
dwinter
parents:
diff changeset
469
dc7622afcfea initial
dwinter
parents:
diff changeset
470 private String getFilteredFile(File file) throws IOException,
dc7622afcfea initial
dwinter
parents:
diff changeset
471 TransformerException, ParserConfigurationException, SAXException {
dc7622afcfea initial
dwinter
parents:
diff changeset
472
dc7622afcfea initial
dwinter
parents:
diff changeset
473 // String txt = IOUtils.toString(new FileInputStream(file));
dc7622afcfea initial
dwinter
parents:
diff changeset
474 // get rid of the entities
dc7622afcfea initial
dwinter
parents:
diff changeset
475 TransformerFactory tf = TransformerFactory.newInstance();
dc7622afcfea initial
dwinter
parents:
diff changeset
476 Transformer t = tf.newTransformer();
dc7622afcfea initial
dwinter
parents:
diff changeset
477
dc7622afcfea initial
dwinter
parents:
diff changeset
478
dc7622afcfea initial
dwinter
parents:
diff changeset
479 //OutputStream output = new ByteArrayOutputStream();
dc7622afcfea initial
dwinter
parents:
diff changeset
480
dc7622afcfea initial
dwinter
parents:
diff changeset
481 //BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8"));
dc7622afcfea initial
dwinter
parents:
diff changeset
482
dc7622afcfea initial
dwinter
parents:
diff changeset
483 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dc7622afcfea initial
dwinter
parents:
diff changeset
484 dbf.setNamespaceAware(true);
dc7622afcfea initial
dwinter
parents:
diff changeset
485 dbf.setValidating(false);
dc7622afcfea initial
dwinter
parents:
diff changeset
486 DocumentBuilder db = dbf.newDocumentBuilder();
dc7622afcfea initial
dwinter
parents:
diff changeset
487 db.setEntityResolver(new MyResolver());
dc7622afcfea initial
dwinter
parents:
diff changeset
488 org.w3c.dom.Document doc = db.parse(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
489
dc7622afcfea initial
dwinter
parents:
diff changeset
490 StringWriter sw = new StringWriter();
dc7622afcfea initial
dwinter
parents:
diff changeset
491 StreamResult sr = new StreamResult(sw);
dc7622afcfea initial
dwinter
parents:
diff changeset
492
dc7622afcfea initial
dwinter
parents:
diff changeset
493 org.w3c.dom.Document tgtDoc = db.newDocument();
dc7622afcfea initial
dwinter
parents:
diff changeset
494 DocumentFragment fragment = tgtDoc.createDocumentFragment();
dc7622afcfea initial
dwinter
parents:
diff changeset
495 DOMResult tgtDom = new DOMResult( fragment );
dc7622afcfea initial
dwinter
parents:
diff changeset
496
dc7622afcfea initial
dwinter
parents:
diff changeset
497 t.setOutputProperty(OutputKeys.ENCODING, "utf-8");
dc7622afcfea initial
dwinter
parents:
diff changeset
498 t.transform(new DOMSource(doc), sr);
dc7622afcfea initial
dwinter
parents:
diff changeset
499 t.transform(new DOMSource(doc), tgtDom);
dc7622afcfea initial
dwinter
parents:
diff changeset
500
dc7622afcfea initial
dwinter
parents:
diff changeset
501 String txt = sw.toString();
dc7622afcfea initial
dwinter
parents:
diff changeset
502
dc7622afcfea initial
dwinter
parents:
diff changeset
503
dc7622afcfea initial
dwinter
parents:
diff changeset
504
dc7622afcfea initial
dwinter
parents:
diff changeset
505 Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL);
dc7622afcfea initial
dwinter
parents:
diff changeset
506 Matcher m = p.matcher(txt);
dc7622afcfea initial
dwinter
parents:
diff changeset
507 if (m.find())
dc7622afcfea initial
dwinter
parents:
diff changeset
508 if (m.groupCount() > 0) {
dc7622afcfea initial
dwinter
parents:
diff changeset
509 return m.group(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
510 }
dc7622afcfea initial
dwinter
parents:
diff changeset
511 return "";
dc7622afcfea initial
dwinter
parents:
diff changeset
512 }
dc7622afcfea initial
dwinter
parents:
diff changeset
513
dc7622afcfea initial
dwinter
parents:
diff changeset
514 private File getComposedFile(File folder) {
dc7622afcfea initial
dwinter
parents:
diff changeset
515 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
516 String path = folder.getCanonicalPath();
dc7622afcfea initial
dwinter
parents:
diff changeset
517 String newPath = PREFIX + path + "/" + COMPOSEDFN;
dc7622afcfea initial
dwinter
parents:
diff changeset
518 return new File(newPath);
dc7622afcfea initial
dwinter
parents:
diff changeset
519 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
520 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
521 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
522 }
dc7622afcfea initial
dwinter
parents:
diff changeset
523 return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
524 }
dc7622afcfea initial
dwinter
parents:
diff changeset
525
dc7622afcfea initial
dwinter
parents:
diff changeset
526 protected void processFile(File file) throws CorruptIndexException,
dc7622afcfea initial
dwinter
parents:
diff changeset
527 LockObtainFailedException, IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
528 /* 348 */int freeThread = -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
529 /* 349 */while (freeThread == -1) {
dc7622afcfea initial
dwinter
parents:
diff changeset
530 /* 351 */freeThread = waitForFreeThread();
dc7622afcfea initial
dwinter
parents:
diff changeset
531 }
dc7622afcfea initial
dwinter
parents:
diff changeset
532
dc7622afcfea initial
dwinter
parents:
diff changeset
533 /* 355 */if (this.textLanguage == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
534 /* 356 */this.textLanguage = loadLanguages();
dc7622afcfea initial
dwinter
parents:
diff changeset
535 /* 357 */this.mythreads[freeThread] = new ProcessFileThread(
dc7622afcfea initial
dwinter
parents:
diff changeset
536 this.languageAnalyzers, file, this.languageFileName,
dc7622afcfea initial
dwinter
parents:
diff changeset
537 this.textLanguage, this.mdProviderUrl, this.preferedLanguage,
dc7622afcfea initial
dwinter
parents:
diff changeset
538 this.languageToISO, this.supportedLanguageFolder);
dc7622afcfea initial
dwinter
parents:
diff changeset
539 /* 358 */this.mythreads[freeThread].start();
dc7622afcfea initial
dwinter
parents:
diff changeset
540 /* 359 */System.out.println("New process started:" + freeThread);
dc7622afcfea initial
dwinter
parents:
diff changeset
541 }
dc7622afcfea initial
dwinter
parents:
diff changeset
542
dc7622afcfea initial
dwinter
parents:
diff changeset
543 protected int waitForFreeThread() {
dc7622afcfea initial
dwinter
parents:
diff changeset
544 /* 367 */for (int i = 0; i < maxThread; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
545 /* 369 */if (this.mythreads[i] == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
546 /* 370 */return i;
dc7622afcfea initial
dwinter
parents:
diff changeset
547 /* 371 */if (!this.mythreads[i].done)
dc7622afcfea initial
dwinter
parents:
diff changeset
548 continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
549 /* 373 */this.filecount += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
550 /* 374 */System.out.println("filecount:" + this.filecount);
dc7622afcfea initial
dwinter
parents:
diff changeset
551 /* 375 */return i;
dc7622afcfea initial
dwinter
parents:
diff changeset
552 }
dc7622afcfea initial
dwinter
parents:
diff changeset
553
dc7622afcfea initial
dwinter
parents:
diff changeset
554 /* 378 */return -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
555 }
dc7622afcfea initial
dwinter
parents:
diff changeset
556
dc7622afcfea initial
dwinter
parents:
diff changeset
557 private boolean isTextFile(File file) {
dc7622afcfea initial
dwinter
parents:
diff changeset
558 /* 392 */String fn = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
559
dc7622afcfea initial
dwinter
parents:
diff changeset
560 /* 394 */String[] splitted = fn.split("[.]");
dc7622afcfea initial
dwinter
parents:
diff changeset
561
dc7622afcfea initial
dwinter
parents:
diff changeset
562 /* 396 */String ext = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
563
dc7622afcfea initial
dwinter
parents:
diff changeset
564 /* 398 */if (splitted.length > 1) {
dc7622afcfea initial
dwinter
parents:
diff changeset
565 /* 400 */ext = splitted[(splitted.length - 1)];
dc7622afcfea initial
dwinter
parents:
diff changeset
566 }
dc7622afcfea initial
dwinter
parents:
diff changeset
567 boolean ret = fileTypesToIndex.contains(ext);
dc7622afcfea initial
dwinter
parents:
diff changeset
568 /* 403 */return ret;
dc7622afcfea initial
dwinter
parents:
diff changeset
569 }
dc7622afcfea initial
dwinter
parents:
diff changeset
570
dc7622afcfea initial
dwinter
parents:
diff changeset
571 }
dc7622afcfea initial
dwinter
parents:
diff changeset
572
dc7622afcfea initial
dwinter
parents:
diff changeset
573 /*
dc7622afcfea initial
dwinter
parents:
diff changeset
574 * Location: /private/tmp/fulltextIndexer.jar Qualified Name:
dc7622afcfea initial
dwinter
parents:
diff changeset
575 * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version:
dc7622afcfea initial
dwinter
parents:
diff changeset
576 * 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
577 */