annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/CLI/HarvesterCLIRDFMD.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 package de.mpiwg.dwinter.fulltextIndexer.harvester.CLI;
dc7622afcfea initial
dwinter
parents:
diff changeset
2
dc7622afcfea initial
dwinter
parents:
diff changeset
3 import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
5
dc7622afcfea initial
dwinter
parents:
diff changeset
6 import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 import org.jdom.JDOMException;
dc7622afcfea initial
dwinter
parents:
diff changeset
9
dc7622afcfea initial
dwinter
parents:
diff changeset
10 import de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 import de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded;
dc7622afcfea initial
dwinter
parents:
diff changeset
12
dc7622afcfea initial
dwinter
parents:
diff changeset
13
dc7622afcfea initial
dwinter
parents:
diff changeset
14
dc7622afcfea initial
dwinter
parents:
diff changeset
15 public class HarvesterCLIRDFMD {
dc7622afcfea initial
dwinter
parents:
diff changeset
16
dc7622afcfea initial
dwinter
parents:
diff changeset
17 /** Index all text files under a directory.
dc7622afcfea initial
dwinter
parents:
diff changeset
18 * @throws IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
19 * @throws LockObtainFailedException
dc7622afcfea initial
dwinter
parents:
diff changeset
20 * @throws CorruptIndexException
dc7622afcfea initial
dwinter
parents:
diff changeset
21 * @throws InterruptedException */
dc7622afcfea initial
dwinter
parents:
diff changeset
22 public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
23 String usage = "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathToRDF> <root_directory> <index_dir> <mdProviderURL> --lang=<language>\n" +
dc7622afcfea initial
dwinter
parents:
diff changeset
24 "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathtoRDF> <root_directory> <index_dir> <mdProviderURL> --langfile=<languageFile>\n";
dc7622afcfea initial
dwinter
parents:
diff changeset
25
dc7622afcfea initial
dwinter
parents:
diff changeset
26
dc7622afcfea initial
dwinter
parents:
diff changeset
27 if (args.length != 4 & args.length != 5) {
dc7622afcfea initial
dwinter
parents:
diff changeset
28 System.err.println("Usage: " + usage);
dc7622afcfea initial
dwinter
parents:
diff changeset
29 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
30 }
dc7622afcfea initial
dwinter
parents:
diff changeset
31
dc7622afcfea initial
dwinter
parents:
diff changeset
32 // if (INDEX_DIR.exists()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
33 // System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
dc7622afcfea initial
dwinter
parents:
diff changeset
34 // System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
35 // }
dc7622afcfea initial
dwinter
parents:
diff changeset
36
dc7622afcfea initial
dwinter
parents:
diff changeset
37 File docDir = new File(args[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
38 if ((!docDir.exists()) || (!docDir.canRead())) {
dc7622afcfea initial
dwinter
parents:
diff changeset
39 System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path");
dc7622afcfea initial
dwinter
parents:
diff changeset
40 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
41 }
dc7622afcfea initial
dwinter
parents:
diff changeset
42
dc7622afcfea initial
dwinter
parents:
diff changeset
43 final File rdfFile = new File(args[0]);
dc7622afcfea initial
dwinter
parents:
diff changeset
44 if (!rdfFile.exists() || !rdfFile.canRead()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
45 System.out.println("RDFFile directory '" +rdfFile.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
dc7622afcfea initial
dwinter
parents:
diff changeset
46 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
47 }
dc7622afcfea initial
dwinter
parents:
diff changeset
48
dc7622afcfea initial
dwinter
parents:
diff changeset
49 final File index_dir= new File(args[2]);
dc7622afcfea initial
dwinter
parents:
diff changeset
50 final String mdProviderUrl= args[3];
dc7622afcfea initial
dwinter
parents:
diff changeset
51
dc7622afcfea initial
dwinter
parents:
diff changeset
52 if (args.length == 4){
dc7622afcfea initial
dwinter
parents:
diff changeset
53 doTheHarvest(rdfFile, docDir, index_dir, mdProviderUrl,null);
dc7622afcfea initial
dwinter
parents:
diff changeset
54 }
dc7622afcfea initial
dwinter
parents:
diff changeset
55 else {
dc7622afcfea initial
dwinter
parents:
diff changeset
56 String[] options = args[4].split("=");
dc7622afcfea initial
dwinter
parents:
diff changeset
57 if (options.length != 3) {
dc7622afcfea initial
dwinter
parents:
diff changeset
58 System.err.println("wrong options:" + args[4]);
dc7622afcfea initial
dwinter
parents:
diff changeset
59 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
60 }
dc7622afcfea initial
dwinter
parents:
diff changeset
61
dc7622afcfea initial
dwinter
parents:
diff changeset
62 if (options[0].equals("--lang")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
63 doTheHarvestLanguage(rdfFile,docDir, index_dir, args[3], options[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
64 } else if (options[0].equals("--langfile")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
65 doTheHarvest(rdfFile,docDir, index_dir, args[3], options[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
66 } else {
dc7622afcfea initial
dwinter
parents:
diff changeset
67 System.err.println("wrong options:" + options[0]);
dc7622afcfea initial
dwinter
parents:
diff changeset
68 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
69 }
dc7622afcfea initial
dwinter
parents:
diff changeset
70 }
dc7622afcfea initial
dwinter
parents:
diff changeset
71 }
dc7622afcfea initial
dwinter
parents:
diff changeset
72
dc7622afcfea initial
dwinter
parents:
diff changeset
73 private static void doTheHarvest(File rdfFile, File docDir,File index_dir,
dc7622afcfea initial
dwinter
parents:
diff changeset
74 String mdProviderUrl, String languageFile) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
75 HarvesterThreaded harvester = new HarvesterThreaded(docDir,index_dir,languageFile,mdProviderUrl,null);
dc7622afcfea initial
dwinter
parents:
diff changeset
76 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
77 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
78 } catch (JDOMException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
79 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
80 }
dc7622afcfea initial
dwinter
parents:
diff changeset
81
dc7622afcfea initial
dwinter
parents:
diff changeset
82 }
dc7622afcfea initial
dwinter
parents:
diff changeset
83
dc7622afcfea initial
dwinter
parents:
diff changeset
84 private static void doTheHarvestLanguage(File rdfFile, File docDir,File index_dir,
dc7622afcfea initial
dwinter
parents:
diff changeset
85 String mdProviderUrl, String lang) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
86 HarvesterThreaded harvester = new HarvesterThreaded(docDir,index_dir,null,mdProviderUrl,lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
87 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
88 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
89 } catch (JDOMException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
90 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
91 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
92 }
dc7622afcfea initial
dwinter
parents:
diff changeset
93
dc7622afcfea initial
dwinter
parents:
diff changeset
94 }
dc7622afcfea initial
dwinter
parents:
diff changeset
95
dc7622afcfea initial
dwinter
parents:
diff changeset
96
dc7622afcfea initial
dwinter
parents:
diff changeset
97
dc7622afcfea initial
dwinter
parents:
diff changeset
98 }