comparison src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/DocHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dc7622afcfea
1 package de.mpiwg.dwinter.fulltextIndexer.harvester;
2
3 /* Harvests one complete book at a time into a single index entry.
4 *
5 * */
6 import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
7
8 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
9
10 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
11
12 import java.io.BufferedReader;
13
14 import java.io.File;
15
16 import java.io.FileNotFoundException;
17
18 import java.io.BufferedInputStream;
19 import java.io.BufferedWriter;
20 import java.io.ByteArrayOutputStream;
21 import java.io.FileInputStream;
22 import java.io.FileOutputStream;
23 import java.io.FileReader;
24 import java.io.FileWriter;
25 import java.io.InputStream;
26 import java.io.OutputStream;
27 import java.io.OutputStreamWriter;
28 import java.io.StringWriter;
29
30 import java.io.IOException;
31
32 import java.io.PrintStream;
33
34 import java.util.ArrayList;
35
36 import java.util.Arrays;
37
38 import java.util.Date;
39
40 import java.util.HashMap;
41
42 import java.util.List;
43 import java.util.regex.Matcher;
44 import java.util.regex.Pattern;
45
46 import javax.xml.parsers.DocumentBuilder;
47 import javax.xml.parsers.DocumentBuilderFactory;
48 import javax.xml.parsers.ParserConfigurationException;
49 import javax.xml.parsers.SAXParser;
50 import javax.xml.transform.OutputKeys;
51 import javax.xml.transform.Transformer;
52 import javax.xml.transform.TransformerConfigurationException;
53 import javax.xml.transform.TransformerException;
54 import javax.xml.transform.TransformerFactory;
55 import javax.xml.transform.dom.DOMResult;
56 import javax.xml.transform.dom.DOMSource;
57 import javax.xml.transform.stream.StreamResult;
58 import javax.xml.transform.stream.StreamSource;
59
60 import org.apache.commons.io.IOUtils;
61 import org.apache.lucene.analysis.de.GermanAnalyzer;
62
63 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
64
65 import org.apache.lucene.analysis.standard.StandardAnalyzer;
66
67 import org.apache.lucene.index.CorruptIndexException;
68
69 import org.apache.lucene.store.LockObtainFailedException;
70
71 import org.apache.lucene.util.Version;
72 import org.apache.ws.commons.serialize.XMLWriterImpl;
73
74 import org.jdom.Document;
75
76 import org.jdom.Element;
77
78 import org.jdom.JDOMException;
79
80 import org.jdom.input.SAXBuilder;
81 import org.jdom.xpath.XPath;
82 import org.w3c.dom.DocumentFragment;
83 import org.xml.sax.SAXException;
84
85 import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl;
86
87 public class DocHarvesterThreaded {
88 private static final boolean DEBUG = false;
89 private static final int MAXFILES = 3;
90
91 //private static final String PREFIX = "/tmp/composed/files";
92 private static final String PREFIX = "/Volumes/data/composed/files";
93 private static final String COMPOSEDFN = "doc.xml";
94 private static final boolean CREATE_NEW = false;
95
96 protected static ArrayList<String> fileTypesToIndex = new ArrayList(
97 Arrays.asList(new String[] { "xml" }));
98
99 protected static ArrayList<String> excludeFolders = new ArrayList(
100 Arrays.asList(new String[] { "OCR" }));
101 protected static boolean indexMetaPriority = false;
102
103 private static String specialMode = "";
104 protected static int maxThread = 30;
105 protected File docDir;
106 protected File index_dir;
107 protected HashMap<String, String> textLanguage = null;
108 protected HashMap<String, String> languageToISO = new HashMap();
109 protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
110
111 private int counter = 0;
112 protected String languageFileName;
113 protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
114 private int filecount = 0;
115
116 protected String mdProviderUrl = null;
117 private String preferedLanguage;
118 protected HashMap<String, String> supportedLanguageFolder = new HashMap();
119 private int completedFiles = 0; // counter for all files completed and indexed
120
121 public DocHarvesterThreaded() {
122 }
123
124 public DocHarvesterThreaded(File docDir, File index_dir,
125 String languageFileName, String mdProviderUrl, String lang)
126 throws CorruptIndexException, LockObtainFailedException,
127 IOException {
128 /* 119 */this.docDir = docDir;
129 /* 120 */this.languageFileName = languageFileName;
130 /* 121 */this.preferedLanguage = lang;
131
132 /* 133 */this.mdProviderUrl = mdProviderUrl;
133
134 /* 135 */this.index_dir = index_dir;
135
136 /* 137 */for (int i = 0; i < maxThread; ++i) {
137 /* 139 */this.mythreads[i] = null;
138 }
139
140 /* 142 */init_languages();
141 }
142
143 private void init_languages() {
144 /* 146 */this.languageToISO.put("German", "de");
145 /* 147 */this.languageToISO.put("French", "fr");
146 /* 148 */this.languageToISO.put("English", "en");
147 /* 149 */this.languageToISO.put("German-f", "de-f");
148
149 /* 151 */this.supportedLanguageFolder.put("deu", "de");
150 /* 152 */this.supportedLanguageFolder.put("deu-f", "de");
151 /* 153 */this.supportedLanguageFolder.put("fra", "fr");
152 /* 154 */this.supportedLanguageFolder.put("eng", "en");
153 /* 155 */this.supportedLanguageFolder.put("lic", "la");
154 try {
155 /* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de",
156 new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
157 /* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en",
158 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
159 /* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr",
160 new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
161 /* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la",
162 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
163
164 /* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all",
165 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
166 /* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph",
167 new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
168 } catch (CorruptIndexException e) {
169 /* 167 */e.printStackTrace();
170 /* 168 */System.exit(1);
171 } catch (LockObtainFailedException e) {
172 /* 170 */e.printStackTrace();
173 /* 171 */System.exit(1);
174 } catch (IOException e) {
175 /* 173 */e.printStackTrace();
176 /* 174 */System.exit(1);
177 }
178 }
179
180 public DocHarvesterThreaded(File docDir, File index_dir,
181 String mdProviderUrl) throws CorruptIndexException,
182 LockObtainFailedException, IOException {
183 /* 180 */this(docDir, index_dir, null, mdProviderUrl, null);
184 }
185
186 protected HashMap<String, String> loadLanguages() {
187 /* 187 */File languageFile = new File(this.docDir + "/"
188 + this.languageFileName);
189 /* 188 */String languageFilePath = this.docDir + "/"
190 + this.languageFileName;
191 /* 189 */HashMap languages = new HashMap();
192 /* 190 */boolean relativ = true;
193 /* 191 */if (this.languageFileName == null)
194 /* 192 */return null;
195 /* 193 */if (!languageFile.exists()) {
196 /* 195 */languageFile = new File(this.languageFileName);
197 /* 196 */languageFilePath = this.languageFileName;
198 /* 197 */relativ = false;
199 /* 198 */if (!languageFile.exists())
200 /* 199 */return null;
201 }
202 BufferedReader in;
203 try {
204 /* 203 */in = new BufferedReader(new FileReader(languageFilePath));
205 } catch (FileNotFoundException e) {
206 /* 205 */return null;
207 }
208
209 /* 208 */String zeile = null;
210 try {
211 /* 210 */while ((zeile = in.readLine()) != null) {
212 /* 211 */String[] splitted = zeile.replace("\"", "").split(
213 "[,]");
214 /* 212 */if (splitted.length == 2)
215 /* 213 */if (relativ)
216 /* 214 */languages.put(this.docDir + "/" + splitted[0],
217 splitted[1]);
218 else
219 /* 216 */languages.put(splitted[0], splitted[1]);
220 }
221 } catch (IOException e) {
222 /* 220 */e.printStackTrace();
223 /* 221 */return null;
224 }
225
226 /* 224 */return languages;
227 }
228
229 public void harvestFromRDF(String rdffilepath) throws InterruptedException,
230 JDOMException {
231 /* 228 */Date start = new Date();
232 /* 229 */boolean create = true;
233 try {
234 /* 240 */System.out.println("Indexing to directory '"
235 + this.index_dir + "'...");
236 /* 241 */ArrayList<String> files = getFileListFromRDF(rdffilepath);
237 /* 242 */indexDocs(files);
238 /* 243 */System.out.println("Optimizing...");
239 /* 244 */this.languageAnalyzers.optimize();
240 /* 245 */this.languageAnalyzers.close();
241
242 /* 247 */Date end = new Date();
243 /* 248 */System.out.println(end.getTime() - start.getTime()
244 + " total milliseconds");
245 } catch (IOException e) {
246 /* 251 */System.out.println(" caught a " + e.getClass() +
247 /* 252 */"\n with message: " + e.getMessage());
248 }
249 }
250
251 private ArrayList<String> getFileListFromRDF(String rdffilepath)
252 throws JDOMException, IOException {
253 /* 260 */ArrayList ret = new ArrayList();
254 /* 261 */SAXBuilder builder = new SAXBuilder();
255
256 /* 263 */Document doc = builder.build(rdffilepath);
257
258 /* 265 */Element el = doc.getRootElement();
259
260 /* 267 */XPath xpath = XPath.newInstance("//MPIWG:archive-path");
261 /* 268 */xpath.addNamespace("MPIWG",
262 "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
263 /* 269 */List<Element> paths = xpath.selectNodes(el);
264 /* 270 */for (Element path : paths) {
265 /* 271 */ret.add(path.getText());
266 }
267
268 /* 274 */return ret;
269 }
270
271 public void harvestFolder() throws InterruptedException {
272 /* 278 */Date start = new Date();
273 /* 279 */boolean create = true;
274 try {
275 /* 290 */System.out.println("Indexing to directory '"
276 + this.index_dir + "'...");
277 /* 291 */indexDocs(this.docDir);
278 /* 292 */System.out.println("Optimizing...");
279 /* 293 */this.languageAnalyzers.optimize();
280 /* 294 */this.languageAnalyzers.close();
281
282 /* 296 */Date end = new Date();
283 /* 297 */System.out.println(end.getTime() - start.getTime()
284 + " total milliseconds");
285 } catch (IOException e) {
286 /* 300 */System.out.println(" caught a " + e.getClass() +
287 /* 301 */"\n with message: " + e.getMessage());
288 }
289 }
290
291 private void indexDocs(ArrayList<String> files) throws IOException,
292 InterruptedException {
293 /* 308 */for (String filename : files) {
294 /* 310 */indexDocs(new File(this.docDir.getAbsolutePath()
295 + filename));
296 if ((DEBUG == true) & (this.filecount > MAXFILES))
297 break;
298 }
299 }
300
301 void indexDocs(File file) throws IOException, InterruptedException {
302 /* 317 */if (!file.canRead())
303 return;
304 /* 319 */
305 /* 321 */if ((DEBUG == true) && (this.filecount > MAXFILES))
306 return;
307 /* 325 */String[] files = file.list();
308
309 /* 327 */String folderName = file.getName();
310
311 boolean notExists = !checkFileExists(file);
312 boolean createNew = CREATE_NEW || notExists;
313 // boolean createNew = true;
314
315 boolean fileStillEmpty = true;
316 if (createNew) {
317 clearFile(file); // loesche das gesamtfile
318 } else {
319 fileStillEmpty = false; // assume that file is not empty, if it already exists
320 }
321
322
323 if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0
324 : 1)) != 0) {
325 for (int i = 0; i < files.length; ++i) {
326 File nextFile = new File(file, files[i]);
327
328 if (nextFile.isDirectory()) // directory dann gehe in die
329 // naechste ebene
330 indexDocs(nextFile);
331
332 else if (isTextFile(nextFile)) {
333
334 if (createNew) {
335 fileStillEmpty = false; //datei hat jetzt einen Inhalt
336 compose(file, nextFile); // fuege das file an das
337 // gesamtfilean
338 }
339
340 }
341 if ((DEBUG == true) && (this.filecount > MAXFILES))
342 break;
343 }
344 if (createNew) {
345 if (fileStillEmpty){
346 deleteComposedFile(file); // file hat keinen inhalt dann loeschen
347 } else {
348 finishFile(file);
349 }
350 }
351
352 if (!fileStillEmpty)
353 processCompleteFile(file);
354 /* 335 */} else {
355 /* 342 */System.out.println("not adding " + file);
356 }
357 }
358
359 private void finishFile(File folder) {
360 File cf = getComposedFile(folder);
361 System.out.println();
362 try {
363 System.out.println("finish file:" + cf.getCanonicalPath());
364 FileWriter fw = new FileWriter(cf, true);
365
366 fw.write("</document>");
367 fw.close();
368
369 } catch (IOException e) {
370 // TODO Auto-generated catch block
371 e.printStackTrace();
372 }
373
374 }
375
376 private boolean deleteComposedFile(File folder) {
377 File cf = getComposedFile(folder);
378 try {
379 System.out.println("file deleted, because empty:" + cf.getCanonicalPath());
380 } catch (IOException e) {
381 // TODO Auto-generated catch block
382 e.printStackTrace();
383 }
384 return cf.delete();
385 }
386
387
388 private void processCompleteFile(File folder) {
389 System.out.println("Completed File:"+String.valueOf(completedFiles++));
390 File cf = getComposedFile(folder);
391 try {
392 processFile(cf);
393 } catch (CorruptIndexException e) {
394 // TODO Auto-generated catch block
395 e.printStackTrace();
396 } catch (LockObtainFailedException e) {
397 // TODO Auto-generated catch block
398 e.printStackTrace();
399 } catch (IOException e) {
400 // TODO Auto-generated catch block
401 e.printStackTrace();
402 }
403
404 }
405
406 private boolean checkFileExists(File folder) {
407 File cf = getComposedFile(folder);
408 return cf.exists();
409
410 }
411
412 private void clearFile(File folder) {
413 File cf = getComposedFile(folder);
414 cf.delete();
415 try {
416 File dir = cf.getParentFile();
417 if (false == dir.exists()) {
418 dir.mkdirs();
419 }
420
421 cf.createNewFile();
422
423 FileWriter fw = new FileWriter(cf);
424 fw.write("<document>");
425 fw.close();
426
427 } catch (IOException e) {
428 // TODO Auto-generated catch block
429 e.printStackTrace();
430 }
431
432 }
433
434 private void compose(File folder, File file) {
435 File cf = getComposedFile(folder);
436 try {
437 System.out.println("Adding" + file.getCanonicalPath());
438 //FileWriter fw = new FileWriter(cf, true);
439
440 FileOutputStream stream = new FileOutputStream(cf,true);
441
442 OutputStreamWriter fw = new OutputStreamWriter(stream, "utf-8");
443
444 String filteredDocument="";
445 try {
446 filteredDocument = getFilteredFile(file);
447 } catch (TransformerException e) {
448 filteredDocument = "";
449 }catch (ParserConfigurationException e) {
450 // TODO Auto-generated catch block
451 e.printStackTrace();
452 } catch (SAXException e) {
453 // TODO Auto-generated catch block
454 e.printStackTrace();
455 }
456
457 fw.append(filteredDocument);
458 fw.write("<pb name=\"");
459 fw.write(file.getName());
460 fw.write("\"/>");
461 fw.close();
462
463 } catch (IOException e) {
464 // TODO Auto-generated catch block
465 e.printStackTrace();
466 }
467
468 }
469
470 private String getFilteredFile(File file) throws IOException,
471 TransformerException, ParserConfigurationException, SAXException {
472
473 // String txt = IOUtils.toString(new FileInputStream(file));
474 // get rid of the entities
475 TransformerFactory tf = TransformerFactory.newInstance();
476 Transformer t = tf.newTransformer();
477
478
479 //OutputStream output = new ByteArrayOutputStream();
480
481 //BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8"));
482
483 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
484 dbf.setNamespaceAware(true);
485 dbf.setValidating(false);
486 DocumentBuilder db = dbf.newDocumentBuilder();
487 db.setEntityResolver(new MyResolver());
488 org.w3c.dom.Document doc = db.parse(file);
489
490 StringWriter sw = new StringWriter();
491 StreamResult sr = new StreamResult(sw);
492
493 org.w3c.dom.Document tgtDoc = db.newDocument();
494 DocumentFragment fragment = tgtDoc.createDocumentFragment();
495 DOMResult tgtDom = new DOMResult( fragment );
496
497 t.setOutputProperty(OutputKeys.ENCODING, "utf-8");
498 t.transform(new DOMSource(doc), sr);
499 t.transform(new DOMSource(doc), tgtDom);
500
501 String txt = sw.toString();
502
503
504
505 Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL);
506 Matcher m = p.matcher(txt);
507 if (m.find())
508 if (m.groupCount() > 0) {
509 return m.group(1);
510 }
511 return "";
512 }
513
514 private File getComposedFile(File folder) {
515 try {
516 String path = folder.getCanonicalPath();
517 String newPath = PREFIX + path + "/" + COMPOSEDFN;
518 return new File(newPath);
519 } catch (IOException e) {
520 // TODO Auto-generated catch block
521 e.printStackTrace();
522 }
523 return null;
524 }
525
526 protected void processFile(File file) throws CorruptIndexException,
527 LockObtainFailedException, IOException {
528 /* 348 */int freeThread = -1;
529 /* 349 */while (freeThread == -1) {
530 /* 351 */freeThread = waitForFreeThread();
531 }
532
533 /* 355 */if (this.textLanguage == null)
534 /* 356 */this.textLanguage = loadLanguages();
535 /* 357 */this.mythreads[freeThread] = new ProcessFileThread(
536 this.languageAnalyzers, file, this.languageFileName,
537 this.textLanguage, this.mdProviderUrl, this.preferedLanguage,
538 this.languageToISO, this.supportedLanguageFolder);
539 /* 358 */this.mythreads[freeThread].start();
540 /* 359 */System.out.println("New process started:" + freeThread);
541 }
542
543 protected int waitForFreeThread() {
544 /* 367 */for (int i = 0; i < maxThread; ++i) {
545 /* 369 */if (this.mythreads[i] == null)
546 /* 370 */return i;
547 /* 371 */if (!this.mythreads[i].done)
548 continue;
549 /* 373 */this.filecount += 1;
550 /* 374 */System.out.println("filecount:" + this.filecount);
551 /* 375 */return i;
552 }
553
554 /* 378 */return -1;
555 }
556
557 private boolean isTextFile(File file) {
558 /* 392 */String fn = file.getName();
559
560 /* 394 */String[] splitted = fn.split("[.]");
561
562 /* 396 */String ext = "";
563
564 /* 398 */if (splitted.length > 1) {
565 /* 400 */ext = splitted[(splitted.length - 1)];
566 }
567 boolean ret = fileTypesToIndex.contains(ext);
568 /* 403 */return ret;
569 }
570
571 }
572
573 /*
574 * Location: /private/tmp/fulltextIndexer.jar Qualified Name:
575 * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version:
576 * 0.5.4
577 */