annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/CLI/.svn/text-base/DocHarvesterCLIRDFMD.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 package de.mpiwg.dwinter.fulltextIndexer.harvester.CLI;
dc7622afcfea initial
dwinter
parents:
diff changeset
2
dc7622afcfea initial
dwinter
parents:
diff changeset
3 import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
5
dc7622afcfea initial
dwinter
parents:
diff changeset
6 import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 import org.jdom.JDOMException;
dc7622afcfea initial
dwinter
parents:
diff changeset
9
dc7622afcfea initial
dwinter
parents:
diff changeset
10 import de.mpiwg.dwinter.fulltextIndexer.harvester.DocHarvesterThreaded;
dc7622afcfea initial
dwinter
parents:
diff changeset
11
dc7622afcfea initial
dwinter
parents:
diff changeset
12
dc7622afcfea initial
dwinter
parents:
diff changeset
13
dc7622afcfea initial
dwinter
parents:
diff changeset
14 public class DocHarvesterCLIRDFMD {
dc7622afcfea initial
dwinter
parents:
diff changeset
15
dc7622afcfea initial
dwinter
parents:
diff changeset
16 /** Index all text files under a directory.
dc7622afcfea initial
dwinter
parents:
diff changeset
17 * @throws IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
18 * @throws LockObtainFailedException
dc7622afcfea initial
dwinter
parents:
diff changeset
19 * @throws CorruptIndexException
dc7622afcfea initial
dwinter
parents:
diff changeset
20 * @throws InterruptedException */
dc7622afcfea initial
dwinter
parents:
diff changeset
21 public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
22 String usage = "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathToRDF> <root_directory> <index_dir> <mdProviderURL> --lang=<language>\n" +
dc7622afcfea initial
dwinter
parents:
diff changeset
23 "java de.mpiwg.itgroup.fulltext.harvester.HarvesterCLIRDF <pathtoRDF> <root_directory> <index_dir> <mdProviderURL> --langfile=<languageFile>\n";
dc7622afcfea initial
dwinter
parents:
diff changeset
24
dc7622afcfea initial
dwinter
parents:
diff changeset
25
dc7622afcfea initial
dwinter
parents:
diff changeset
26 if (args.length != 4 & args.length != 5) {
dc7622afcfea initial
dwinter
parents:
diff changeset
27 System.err.println("Usage: " + usage);
dc7622afcfea initial
dwinter
parents:
diff changeset
28 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
29 }
dc7622afcfea initial
dwinter
parents:
diff changeset
30
dc7622afcfea initial
dwinter
parents:
diff changeset
31 // if (INDEX_DIR.exists()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
32 // System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
dc7622afcfea initial
dwinter
parents:
diff changeset
33 // System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
34 // }
dc7622afcfea initial
dwinter
parents:
diff changeset
35
dc7622afcfea initial
dwinter
parents:
diff changeset
36 File docDir = new File(args[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
37 if ((!docDir.exists()) || (!docDir.canRead())) {
dc7622afcfea initial
dwinter
parents:
diff changeset
38 System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path");
dc7622afcfea initial
dwinter
parents:
diff changeset
39 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
40 }
dc7622afcfea initial
dwinter
parents:
diff changeset
41
dc7622afcfea initial
dwinter
parents:
diff changeset
42 final File rdfFile = new File(args[0]);
dc7622afcfea initial
dwinter
parents:
diff changeset
43 if (!rdfFile.exists() || !rdfFile.canRead()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
44 System.out.println("RDFFile directory '" +rdfFile.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
dc7622afcfea initial
dwinter
parents:
diff changeset
45 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
46 }
dc7622afcfea initial
dwinter
parents:
diff changeset
47
dc7622afcfea initial
dwinter
parents:
diff changeset
48 final File index_dir= new File(args[2]);
dc7622afcfea initial
dwinter
parents:
diff changeset
49 final String mdProviderUrl= args[3];
dc7622afcfea initial
dwinter
parents:
diff changeset
50
dc7622afcfea initial
dwinter
parents:
diff changeset
51 if (args.length == 4){
dc7622afcfea initial
dwinter
parents:
diff changeset
52 doTheHarvest(rdfFile, docDir, index_dir, mdProviderUrl,null);
dc7622afcfea initial
dwinter
parents:
diff changeset
53 }
dc7622afcfea initial
dwinter
parents:
diff changeset
54 else {
dc7622afcfea initial
dwinter
parents:
diff changeset
55 String[] options = args[4].split("=");
dc7622afcfea initial
dwinter
parents:
diff changeset
56 if (options.length != 3) {
dc7622afcfea initial
dwinter
parents:
diff changeset
57 System.err.println("wrong options:" + args[4]);
dc7622afcfea initial
dwinter
parents:
diff changeset
58 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
59 }
dc7622afcfea initial
dwinter
parents:
diff changeset
60
dc7622afcfea initial
dwinter
parents:
diff changeset
61 if (options[0].equals("--lang")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
62 doTheHarvestLanguage(rdfFile,docDir, index_dir, args[3], options[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
63 } else if (options[0].equals("--langfile")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
64 doTheHarvest(rdfFile,docDir, index_dir, args[3], options[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
65 } else {
dc7622afcfea initial
dwinter
parents:
diff changeset
66 System.err.println("wrong options:" + options[0]);
dc7622afcfea initial
dwinter
parents:
diff changeset
67 System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
68 }
dc7622afcfea initial
dwinter
parents:
diff changeset
69 }
dc7622afcfea initial
dwinter
parents:
diff changeset
70 }
dc7622afcfea initial
dwinter
parents:
diff changeset
71
dc7622afcfea initial
dwinter
parents:
diff changeset
72 /**
dc7622afcfea initial
dwinter
parents:
diff changeset
73 * @param rdfFile Pfad to the RDF file (ECHO-rdf-Format)
dc7622afcfea initial
dwinter
parents:
diff changeset
74 * @param docDir
dc7622afcfea initial
dwinter
parents:
diff changeset
75 * @param index_dir
dc7622afcfea initial
dwinter
parents:
diff changeset
76 * @param mdProviderUrl
dc7622afcfea initial
dwinter
parents:
diff changeset
77 * @param languageFile
dc7622afcfea initial
dwinter
parents:
diff changeset
78 * @throws CorruptIndexException
dc7622afcfea initial
dwinter
parents:
diff changeset
79 * @throws LockObtainFailedException
dc7622afcfea initial
dwinter
parents:
diff changeset
80 * @throws IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
81 * @throws InterruptedException
dc7622afcfea initial
dwinter
parents:
diff changeset
82 */
dc7622afcfea initial
dwinter
parents:
diff changeset
83 private static void doTheHarvest(File rdfFile, File docDir,File index_dir,
dc7622afcfea initial
dwinter
parents:
diff changeset
84 String mdProviderUrl, String languageFile) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
85 DocHarvesterThreaded harvester = new DocHarvesterThreaded(docDir,index_dir,languageFile,mdProviderUrl,null);
dc7622afcfea initial
dwinter
parents:
diff changeset
86 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
87 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
88 } catch (JDOMException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
89 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
90 }
dc7622afcfea initial
dwinter
parents:
diff changeset
91
dc7622afcfea initial
dwinter
parents:
diff changeset
92 }
dc7622afcfea initial
dwinter
parents:
diff changeset
93
dc7622afcfea initial
dwinter
parents:
diff changeset
94 private static void doTheHarvestLanguage(File rdfFile, File docDir,File index_dir,
dc7622afcfea initial
dwinter
parents:
diff changeset
95 String mdProviderUrl, String lang) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
96 DocHarvesterThreaded harvester = new DocHarvesterThreaded(docDir,index_dir,null,mdProviderUrl,lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
97 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
98 harvester.harvestFromRDF(rdfFile.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
99 } catch (JDOMException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
100 // TODO Auto-generated catch block
dc7622afcfea initial
dwinter
parents:
diff changeset
101 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
102 }
dc7622afcfea initial
dwinter
parents:
diff changeset
103
dc7622afcfea initial
dwinter
parents:
diff changeset
104 }
dc7622afcfea initial
dwinter
parents:
diff changeset
105
dc7622afcfea initial
dwinter
parents:
diff changeset
106
dc7622afcfea initial
dwinter
parents:
diff changeset
107
dc7622afcfea initial
dwinter
parents:
diff changeset
108 }