Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/DocHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 package de.mpiwg.dwinter.fulltextIndexer.harvester; | |
2 | |
3 /* Harvests one complete book at a time into a single entry. | |
4 * | |
5 * */ | |
6 import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; | |
7 | |
8 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; | |
9 | |
10 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; | |
11 | |
12 import java.io.BufferedReader; | |
13 | |
14 import java.io.File; | |
15 | |
16 import java.io.FileNotFoundException; | |
17 | |
18 import java.io.BufferedInputStream; | |
19 import java.io.BufferedWriter; | |
20 import java.io.ByteArrayOutputStream; | |
21 import java.io.FileInputStream; | |
22 import java.io.FileOutputStream; | |
23 import java.io.FileReader; | |
24 import java.io.FileWriter; | |
25 import java.io.InputStream; | |
26 import java.io.OutputStream; | |
27 import java.io.OutputStreamWriter; | |
28 import java.io.StringWriter; | |
29 | |
30 import java.io.IOException; | |
31 | |
32 import java.io.PrintStream; | |
33 | |
34 import java.util.ArrayList; | |
35 | |
36 import java.util.Arrays; | |
37 | |
38 import java.util.Date; | |
39 | |
40 import java.util.HashMap; | |
41 | |
42 import java.util.List; | |
43 import java.util.regex.Matcher; | |
44 import java.util.regex.Pattern; | |
45 | |
46 import javax.xml.parsers.DocumentBuilder; | |
47 import javax.xml.parsers.DocumentBuilderFactory; | |
48 import javax.xml.parsers.ParserConfigurationException; | |
49 import javax.xml.parsers.SAXParser; | |
50 import javax.xml.transform.OutputKeys; | |
51 import javax.xml.transform.Transformer; | |
52 import javax.xml.transform.TransformerConfigurationException; | |
53 import javax.xml.transform.TransformerException; | |
54 import javax.xml.transform.TransformerFactory; | |
55 import javax.xml.transform.dom.DOMResult; | |
56 import javax.xml.transform.dom.DOMSource; | |
57 import javax.xml.transform.stream.StreamResult; | |
58 import javax.xml.transform.stream.StreamSource; | |
59 | |
60 import org.apache.commons.io.IOUtils; | |
61 import org.apache.lucene.analysis.de.GermanAnalyzer; | |
62 | |
63 import org.apache.lucene.analysis.fr.FrenchAnalyzer; | |
64 | |
65 import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
66 | |
67 import org.apache.lucene.index.CorruptIndexException; | |
68 | |
69 import org.apache.lucene.store.LockObtainFailedException; | |
70 | |
71 import org.apache.lucene.util.Version; | |
72 import org.apache.ws.commons.serialize.XMLWriterImpl; | |
73 | |
74 import org.jdom.Document; | |
75 | |
76 import org.jdom.Element; | |
77 | |
78 import org.jdom.JDOMException; | |
79 | |
80 import org.jdom.input.SAXBuilder; | |
81 import org.jdom.xpath.XPath; | |
82 import org.w3c.dom.DocumentFragment; | |
83 import org.xml.sax.SAXException; | |
84 | |
85 import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl; | |
86 | |
87 public class DocHarvesterThreaded { | |
88 private static final boolean DEBUG = false; | |
89 private static final int MAXFILES = 3; | |
90 | |
91 //private static final String PREFIX = "/tmp/composed/files"; | |
92 private static final String PREFIX = "/Volumes/data/composed/files"; | |
93 private static final String COMPOSEDFN = "doc.xml"; | |
94 private static final boolean CREATE_NEW = false; | |
95 | |
96 protected static ArrayList<String> fileTypesToIndex = new ArrayList( | |
97 Arrays.asList(new String[] { "xml" })); | |
98 | |
99 protected static ArrayList<String> excludeFolders = new ArrayList( | |
100 Arrays.asList(new String[] { "OCR" })); | |
101 protected static boolean indexMetaPriority = false; | |
102 | |
103 private static String specialMode = ""; | |
104 protected static int maxThread = 30; | |
105 protected File docDir; | |
106 protected File index_dir; | |
107 protected HashMap<String, String> textLanguage = null; | |
108 protected HashMap<String, String> languageToISO = new HashMap(); | |
109 protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); | |
110 | |
111 private int counter = 0; | |
112 protected String languageFileName; | |
113 protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread]; | |
114 private int filecount = 0; | |
115 | |
116 protected String mdProviderUrl = null; | |
117 private String preferedLanguage; | |
118 protected HashMap<String, String> supportedLanguageFolder = new HashMap(); | |
119 private int completedFiles = 0; // counter for all files completed and indexed | |
120 | |
121 public DocHarvesterThreaded() { | |
122 } | |
123 | |
/**
 * Creates a harvester for the given document tree.
 *
 * @param docDir           root directory of the documents to harvest
 * @param index_dir        directory handed to the language analyzers
 * @param languageFileName name of the language file, may be null
 * @param mdProviderUrl    passed through to the worker threads
 * @param lang             preferred language, passed through to the worker threads
 */
public DocHarvesterThreaded(File docDir, File index_dir,
        String languageFileName, String mdProviderUrl, String lang)
        throws CorruptIndexException, LockObtainFailedException,
        IOException {
    this.docDir = docDir;
    this.index_dir = index_dir;
    this.languageFileName = languageFileName;
    this.mdProviderUrl = mdProviderUrl;
    this.preferedLanguage = lang;

    // every worker slot starts out free
    Arrays.fill(this.mythreads, null);

    // needs index_dir, so it has to come last
    init_languages();
}
142 | |
143 private void init_languages() { | |
144 /* 146 */this.languageToISO.put("German", "de"); | |
145 /* 147 */this.languageToISO.put("French", "fr"); | |
146 /* 148 */this.languageToISO.put("English", "en"); | |
147 /* 149 */this.languageToISO.put("German-f", "de-f"); | |
148 | |
149 /* 151 */this.supportedLanguageFolder.put("deu", "de"); | |
150 /* 152 */this.supportedLanguageFolder.put("deu-f", "de"); | |
151 /* 153 */this.supportedLanguageFolder.put("fra", "fr"); | |
152 /* 154 */this.supportedLanguageFolder.put("eng", "en"); | |
153 /* 155 */this.supportedLanguageFolder.put("lic", "la"); | |
154 try { | |
155 /* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de", | |
156 new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); | |
157 /* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en", | |
158 new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
159 /* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr", | |
160 new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); | |
161 /* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la", | |
162 new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
163 | |
164 /* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all", | |
165 new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
166 /* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph", | |
167 new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
168 } catch (CorruptIndexException e) { | |
169 /* 167 */e.printStackTrace(); | |
170 /* 168 */System.exit(1); | |
171 } catch (LockObtainFailedException e) { | |
172 /* 170 */e.printStackTrace(); | |
173 /* 171 */System.exit(1); | |
174 } catch (IOException e) { | |
175 /* 173 */e.printStackTrace(); | |
176 /* 174 */System.exit(1); | |
177 } | |
178 } | |
179 | |
/**
 * Creates a harvester without a language file and without a preferred
 * language; both are passed on as null to the five-argument constructor.
 *
 * @param docDir        root directory of the documents to harvest
 * @param index_dir     directory handed to the language analyzers
 * @param mdProviderUrl passed through to the worker threads
 */
public DocHarvesterThreaded(File docDir, File index_dir,
        String mdProviderUrl) throws CorruptIndexException,
        LockObtainFailedException, IOException {
    this(docDir, index_dir, (String) null, mdProviderUrl, (String) null);
}
185 | |
186 protected HashMap<String, String> loadLanguages() { | |
187 /* 187 */File languageFile = new File(this.docDir + "/" | |
188 + this.languageFileName); | |
189 /* 188 */String languageFilePath = this.docDir + "/" | |
190 + this.languageFileName; | |
191 /* 189 */HashMap languages = new HashMap(); | |
192 /* 190 */boolean relativ = true; | |
193 /* 191 */if (this.languageFileName == null) | |
194 /* 192 */return null; | |
195 /* 193 */if (!languageFile.exists()) { | |
196 /* 195 */languageFile = new File(this.languageFileName); | |
197 /* 196 */languageFilePath = this.languageFileName; | |
198 /* 197 */relativ = false; | |
199 /* 198 */if (!languageFile.exists()) | |
200 /* 199 */return null; | |
201 } | |
202 BufferedReader in; | |
203 try { | |
204 /* 203 */in = new BufferedReader(new FileReader(languageFilePath)); | |
205 } catch (FileNotFoundException e) { | |
206 /* 205 */return null; | |
207 } | |
208 | |
209 /* 208 */String zeile = null; | |
210 try { | |
211 /* 210 */while ((zeile = in.readLine()) != null) { | |
212 /* 211 */String[] splitted = zeile.replace("\"", "").split( | |
213 "[,]"); | |
214 /* 212 */if (splitted.length == 2) | |
215 /* 213 */if (relativ) | |
216 /* 214 */languages.put(this.docDir + "/" + splitted[0], | |
217 splitted[1]); | |
218 else | |
219 /* 216 */languages.put(splitted[0], splitted[1]); | |
220 } | |
221 } catch (IOException e) { | |
222 /* 220 */e.printStackTrace(); | |
223 /* 221 */return null; | |
224 } | |
225 | |
226 /* 224 */return languages; | |
227 } | |
228 | |
229 public void harvestFromRDF(String rdffilepath) throws InterruptedException, | |
230 JDOMException { | |
231 /* 228 */Date start = new Date(); | |
232 /* 229 */boolean create = true; | |
233 try { | |
234 /* 240 */System.out.println("Indexing to directory '" | |
235 + this.index_dir + "'..."); | |
236 /* 241 */ArrayList<String> files = getFileListFromRDF(rdffilepath); | |
237 /* 242 */indexDocs(files); | |
238 /* 243 */System.out.println("Optimizing..."); | |
239 /* 244 */this.languageAnalyzers.optimize(); | |
240 /* 245 */this.languageAnalyzers.close(); | |
241 | |
242 /* 247 */Date end = new Date(); | |
243 /* 248 */System.out.println(end.getTime() - start.getTime() | |
244 + " total milliseconds"); | |
245 } catch (IOException e) { | |
246 /* 251 */System.out.println(" caught a " + e.getClass() + | |
247 /* 252 */"\n with message: " + e.getMessage()); | |
248 } | |
249 } | |
250 | |
251 private ArrayList<String> getFileListFromRDF(String rdffilepath) | |
252 throws JDOMException, IOException { | |
253 /* 260 */ArrayList ret = new ArrayList(); | |
254 /* 261 */SAXBuilder builder = new SAXBuilder(); | |
255 | |
256 /* 263 */Document doc = builder.build(rdffilepath); | |
257 | |
258 /* 265 */Element el = doc.getRootElement(); | |
259 | |
260 /* 267 */XPath xpath = XPath.newInstance("//MPIWG:archive-path"); | |
261 /* 268 */xpath.addNamespace("MPIWG", | |
262 "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"); | |
263 /* 269 */List<Element> paths = xpath.selectNodes(el); | |
264 /* 270 */for (Element path : paths) { | |
265 /* 271 */ret.add(path.getText()); | |
266 } | |
267 | |
268 /* 274 */return ret; | |
269 } | |
270 | |
271 public void harvestFolder() throws InterruptedException { | |
272 /* 278 */Date start = new Date(); | |
273 /* 279 */boolean create = true; | |
274 try { | |
275 /* 290 */System.out.println("Indexing to directory '" | |
276 + this.index_dir + "'..."); | |
277 /* 291 */indexDocs(this.docDir); | |
278 /* 292 */System.out.println("Optimizing..."); | |
279 /* 293 */this.languageAnalyzers.optimize(); | |
280 /* 294 */this.languageAnalyzers.close(); | |
281 | |
282 /* 296 */Date end = new Date(); | |
283 /* 297 */System.out.println(end.getTime() - start.getTime() | |
284 + " total milliseconds"); | |
285 } catch (IOException e) { | |
286 /* 300 */System.out.println(" caught a " + e.getClass() + | |
287 /* 301 */"\n with message: " + e.getMessage()); | |
288 } | |
289 } | |
290 | |
291 private void indexDocs(ArrayList<String> files) throws IOException, | |
292 InterruptedException { | |
293 /* 308 */for (String filename : files) { | |
294 /* 310 */indexDocs(new File(this.docDir.getAbsolutePath() | |
295 + filename)); | |
296 if ((DEBUG == true) & (this.filecount > MAXFILES)) | |
297 break; | |
298 } | |
299 } | |
300 | |
301 void indexDocs(File file) throws IOException, InterruptedException { | |
302 /* 317 */if (!file.canRead()) | |
303 return; | |
304 /* 319 */ | |
305 /* 321 */if ((DEBUG == true) && (this.filecount > MAXFILES)) | |
306 return; | |
307 /* 325 */String[] files = file.list(); | |
308 | |
309 /* 327 */String folderName = file.getName(); | |
310 | |
311 boolean notExists = !checkFileExists(file); | |
312 boolean createNew = CREATE_NEW || notExists; | |
313 // boolean createNew = true; | |
314 | |
315 boolean fileStillEmpty = true; | |
316 if (createNew) { | |
317 clearFile(file); // loesche das gesamtfile | |
318 } else { | |
319 fileStillEmpty = false; // assume that file is not empty, if it already exists | |
320 } | |
321 | |
322 | |
323 if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 | |
324 : 1)) != 0) { | |
325 for (int i = 0; i < files.length; ++i) { | |
326 File nextFile = new File(file, files[i]); | |
327 | |
328 if (nextFile.isDirectory()) // directory dann gehe in die | |
329 // naechste ebene | |
330 indexDocs(nextFile); | |
331 | |
332 else if (isTextFile(nextFile)) { | |
333 | |
334 if (createNew) { | |
335 fileStillEmpty = false; //datei hat jetzt einen Inhalt | |
336 compose(file, nextFile); // fuege das file an das | |
337 // gesamtfilean | |
338 } | |
339 | |
340 } | |
341 if ((DEBUG == true) && (this.filecount > MAXFILES)) | |
342 break; | |
343 } | |
344 if (createNew) { | |
345 if (fileStillEmpty){ | |
346 deleteComposedFile(file); // file hat keinen inhalt dann loeschen | |
347 } else { | |
348 finishFile(file); | |
349 } | |
350 } | |
351 | |
352 if (!fileStillEmpty) | |
353 processCompleteFile(file); | |
354 /* 335 */} else { | |
355 /* 342 */System.out.println("not adding " + file); | |
356 } | |
357 } | |
358 | |
359 private void finishFile(File folder) { | |
360 File cf = getComposedFile(folder); | |
361 System.out.println(); | |
362 try { | |
363 System.out.println("finish file:" + cf.getCanonicalPath()); | |
364 FileWriter fw = new FileWriter(cf, true); | |
365 | |
366 fw.write("</document>"); | |
367 fw.close(); | |
368 | |
369 } catch (IOException e) { | |
370 // TODO Auto-generated catch block | |
371 e.printStackTrace(); | |
372 } | |
373 | |
374 } | |
375 | |
376 private boolean deleteComposedFile(File folder) { | |
377 File cf = getComposedFile(folder); | |
378 try { | |
379 System.out.println("file deleted, because empty:" + cf.getCanonicalPath()); | |
380 } catch (IOException e) { | |
381 // TODO Auto-generated catch block | |
382 e.printStackTrace(); | |
383 } | |
384 return cf.delete(); | |
385 } | |
386 | |
387 | |
388 private void processCompleteFile(File folder) { | |
389 System.out.println("Completed File:"+String.valueOf(completedFiles++)); | |
390 File cf = getComposedFile(folder); | |
391 try { | |
392 processFile(cf); | |
393 } catch (CorruptIndexException e) { | |
394 // TODO Auto-generated catch block | |
395 e.printStackTrace(); | |
396 } catch (LockObtainFailedException e) { | |
397 // TODO Auto-generated catch block | |
398 e.printStackTrace(); | |
399 } catch (IOException e) { | |
400 // TODO Auto-generated catch block | |
401 e.printStackTrace(); | |
402 } | |
403 | |
404 } | |
405 | |
406 private boolean checkFileExists(File folder) { | |
407 File cf = getComposedFile(folder); | |
408 return cf.exists(); | |
409 | |
410 } | |
411 | |
412 private void clearFile(File folder) { | |
413 File cf = getComposedFile(folder); | |
414 cf.delete(); | |
415 try { | |
416 File dir = cf.getParentFile(); | |
417 if (false == dir.exists()) { | |
418 dir.mkdirs(); | |
419 } | |
420 | |
421 cf.createNewFile(); | |
422 | |
423 FileWriter fw = new FileWriter(cf); | |
424 fw.write("<document>"); | |
425 fw.close(); | |
426 | |
427 } catch (IOException e) { | |
428 // TODO Auto-generated catch block | |
429 e.printStackTrace(); | |
430 } | |
431 | |
432 } | |
433 | |
434 private void compose(File folder, File file) { | |
435 File cf = getComposedFile(folder); | |
436 try { | |
437 System.out.println("Adding" + file.getCanonicalPath()); | |
438 //FileWriter fw = new FileWriter(cf, true); | |
439 | |
440 FileOutputStream stream = new FileOutputStream(cf,true); | |
441 | |
442 OutputStreamWriter fw = new OutputStreamWriter(stream, "utf-8"); | |
443 | |
444 String filteredDocument=""; | |
445 try { | |
446 filteredDocument = getFilteredFile(file); | |
447 } catch (TransformerException e) { | |
448 filteredDocument = ""; | |
449 }catch (ParserConfigurationException e) { | |
450 // TODO Auto-generated catch block | |
451 e.printStackTrace(); | |
452 } catch (SAXException e) { | |
453 // TODO Auto-generated catch block | |
454 e.printStackTrace(); | |
455 } | |
456 | |
457 fw.append(filteredDocument); | |
458 fw.write("<pb name=\""); | |
459 fw.write(file.getName()); | |
460 fw.write("\"/>"); | |
461 fw.close(); | |
462 | |
463 } catch (IOException e) { | |
464 // TODO Auto-generated catch block | |
465 e.printStackTrace(); | |
466 } | |
467 | |
468 } | |
469 | |
470 private String getFilteredFile(File file) throws IOException, | |
471 TransformerException, ParserConfigurationException, SAXException { | |
472 | |
473 // String txt = IOUtils.toString(new FileInputStream(file)); | |
474 // get rid of the entities | |
475 TransformerFactory tf = TransformerFactory.newInstance(); | |
476 Transformer t = tf.newTransformer(); | |
477 | |
478 | |
479 //OutputStream output = new ByteArrayOutputStream(); | |
480 | |
481 //BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8")); | |
482 | |
483 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); | |
484 dbf.setNamespaceAware(true); | |
485 dbf.setValidating(false); | |
486 DocumentBuilder db = dbf.newDocumentBuilder(); | |
487 db.setEntityResolver(new MyResolver()); | |
488 org.w3c.dom.Document doc = db.parse(file); | |
489 | |
490 StringWriter sw = new StringWriter(); | |
491 StreamResult sr = new StreamResult(sw); | |
492 | |
493 org.w3c.dom.Document tgtDoc = db.newDocument(); | |
494 DocumentFragment fragment = tgtDoc.createDocumentFragment(); | |
495 DOMResult tgtDom = new DOMResult( fragment ); | |
496 | |
497 t.setOutputProperty(OutputKeys.ENCODING, "utf-8"); | |
498 t.transform(new DOMSource(doc), sr); | |
499 t.transform(new DOMSource(doc), tgtDom); | |
500 | |
501 String txt = sw.toString(); | |
502 | |
503 | |
504 | |
505 Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL); | |
506 Matcher m = p.matcher(txt); | |
507 if (m.find()) | |
508 if (m.groupCount() > 0) { | |
509 return m.group(1); | |
510 } | |
511 return ""; | |
512 } | |
513 | |
514 private File getComposedFile(File folder) { | |
515 try { | |
516 String path = folder.getCanonicalPath(); | |
517 String newPath = PREFIX + path + "/" + COMPOSEDFN; | |
518 return new File(newPath); | |
519 } catch (IOException e) { | |
520 // TODO Auto-generated catch block | |
521 e.printStackTrace(); | |
522 } | |
523 return null; | |
524 } | |
525 | |
526 protected void processFile(File file) throws CorruptIndexException, | |
527 LockObtainFailedException, IOException { | |
528 /* 348 */int freeThread = -1; | |
529 /* 349 */while (freeThread == -1) { | |
530 /* 351 */freeThread = waitForFreeThread(); | |
531 } | |
532 | |
533 /* 355 */if (this.textLanguage == null) | |
534 /* 356 */this.textLanguage = loadLanguages(); | |
535 /* 357 */this.mythreads[freeThread] = new ProcessFileThread( | |
536 this.languageAnalyzers, file, this.languageFileName, | |
537 this.textLanguage, this.mdProviderUrl, this.preferedLanguage, | |
538 this.languageToISO, this.supportedLanguageFolder); | |
539 /* 358 */this.mythreads[freeThread].start(); | |
540 /* 359 */System.out.println("New process started:" + freeThread); | |
541 } | |
542 | |
543 protected int waitForFreeThread() { | |
544 /* 367 */for (int i = 0; i < maxThread; ++i) { | |
545 /* 369 */if (this.mythreads[i] == null) | |
546 /* 370 */return i; | |
547 /* 371 */if (!this.mythreads[i].done) | |
548 continue; | |
549 /* 373 */this.filecount += 1; | |
550 /* 374 */System.out.println("filecount:" + this.filecount); | |
551 /* 375 */return i; | |
552 } | |
553 | |
554 /* 378 */return -1; | |
555 } | |
556 | |
557 private boolean isTextFile(File file) { | |
558 /* 392 */String fn = file.getName(); | |
559 | |
560 /* 394 */String[] splitted = fn.split("[.]"); | |
561 | |
562 /* 396 */String ext = ""; | |
563 | |
564 /* 398 */if (splitted.length > 1) { | |
565 /* 400 */ext = splitted[(splitted.length - 1)]; | |
566 } | |
567 boolean ret = fileTypesToIndex.contains(ext); | |
568 /* 403 */return ret; | |
569 } | |
570 | |
571 } | |
572 | |
573 /* | |
574 * Location: /private/tmp/fulltextIndexer.jar Qualified Name: | |
575 * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version: | |
576 * 0.5.4 | |
577 */ |