0
|
1 package de.mpiwg.dwinter.fulltextIndexer.harvester.CLI;
|
|
2
|
|
3 import java.io.File;
|
|
4 import java.io.IOException;
|
|
5
|
|
6 import org.apache.lucene.index.CorruptIndexException;
|
|
7 import org.apache.lucene.store.LockObtainFailedException;
|
|
8 import org.jdom.JDOMException;
|
|
9
|
|
10 import de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded;
|
|
11
|
|
12
|
|
13
|
|
14 public class OCRHarvesterCLIRDFMD {
|
|
15
|
|
16 /** Index all text files under a directory.
|
|
17 * @throws IOException
|
|
18 * @throws LockObtainFailedException
|
|
19 * @throws CorruptIndexException
|
|
20 * @throws InterruptedException */
|
|
21 public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
|
|
22 String usage = "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathToRDF> <root_directory> <index_dir> <mdProviderURL> --lang=<language>\n" +
|
|
23 "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathtoRDF> <root_directory> <index_dir> <mdProviderURL> --langfile=<languageFile>\n";
|
|
24
|
|
25
|
|
26 if (args.length != 4 & args.length != 5) {
|
|
27 System.err.println("Usage: " + usage);
|
|
28 System.exit(1);
|
|
29 }
|
|
30
|
|
31 // if (INDEX_DIR.exists()) {
|
|
32 // System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
|
|
33 // System.exit(1);
|
|
34 // }
|
|
35
|
|
36 File docDir = new File(args[1]);
|
|
37 if ((!docDir.exists()) || (!docDir.canRead())) {
|
|
38 System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path");
|
|
39 System.exit(1);
|
|
40 }
|
|
41
|
|
42 final File rdfFile = new File(args[0]);
|
|
43 if (!rdfFile.exists() || !rdfFile.canRead()) {
|
|
44 System.out.println("RDFFile directory '" +rdfFile.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
|
|
45 System.exit(1);
|
|
46 }
|
|
47
|
|
48 final File index_dir= new File(args[2]);
|
|
49 final String mdProviderUrl= args[3];
|
|
50
|
|
51 if (args.length == 4){
|
|
52 doTheHarvest(rdfFile, docDir, index_dir, mdProviderUrl,null);
|
|
53 }
|
|
54 else {
|
|
55 String[] options = args[4].split("=");
|
|
56 if (options.length != 3) {
|
|
57 System.err.println("wrong options:" + args[4]);
|
|
58 System.exit(1);
|
|
59 }
|
|
60
|
|
61 if (options[0].equals("--lang")) {
|
|
62 doTheHarvestLanguage(rdfFile,docDir, index_dir, args[3], options[1]);
|
|
63 } else if (options[0].equals("--langfile")) {
|
|
64 doTheHarvest(rdfFile,docDir, index_dir, args[3], options[1]);
|
|
65 } else {
|
|
66 System.err.println("wrong options:" + options[0]);
|
|
67 System.exit(1);
|
|
68 }
|
|
69 }
|
|
70 }
|
|
71
|
|
72 private static void doTheHarvest(File rdfFile, File docDir,File index_dir,
|
|
73 String mdProviderUrl, String languageFile) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
|
|
74 OCRHarvesterThreaded harvester = new OCRHarvesterThreaded(docDir,index_dir,languageFile,mdProviderUrl,null);
|
|
75 try {
|
|
76 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
|
|
77 } catch (JDOMException e) {
|
|
78 e.printStackTrace();
|
|
79 }
|
|
80
|
|
81 }
|
|
82
|
|
83 private static void doTheHarvestLanguage(File rdfFile, File docDir,File index_dir,
|
|
84 String mdProviderUrl, String lang) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
|
|
85 OCRHarvesterThreaded harvester = new OCRHarvesterThreaded(docDir,index_dir,null,mdProviderUrl,lang);
|
|
86 try {
|
|
87 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
|
|
88 } catch (JDOMException e) {
|
|
89 // TODO Auto-generated catch block
|
|
90 e.printStackTrace();
|
|
91 }
|
|
92
|
|
93 }
|
|
94
|
|
95
|
|
96
|
|
97 }
|