diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/CLI/.svn/text-base/OCRHarvesterCLIRDFMD.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/CLI/.svn/text-base/OCRHarvesterCLIRDFMD.java.svn-base	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,97 @@
+package de.mpiwg.dwinter.fulltextIndexer.harvester.CLI;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.jdom.JDOMException;
+
+import de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded;
+
+ 
+
+/** Command line front end that builds an {@code OCRHarvesterThreaded} and calls its {@code harvestFromRDF} method. */
+public class OCRHarvesterCLIRDFMD {
+	/**
+	 * Command line entry point: validates the arguments and starts a harvest from an RDF file.
+	 * <p>
+	 * Expected arguments: {@code <pathToRDF> <root_directory> <index_dir> <mdProviderURL>},
+	 * optionally followed by {@code --lang=<language>} or {@code --langfile=<languageFile>}.
+	 * Any invalid argument prints a message and terminates the JVM with exit status 1.
+	 *
+	 * @param args the command line arguments described above
+	 * @throws IOException propagated from the harvest helpers
+	 * @throws LockObtainFailedException propagated from the harvest helpers
+	 * @throws CorruptIndexException propagated from the harvest helpers
+	 * @throws InterruptedException propagated from the harvest helpers
+	 */
+	public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
+		final String className = OCRHarvesterCLIRDFMD.class.getName(); // keeps the usage text in sync with the real class name
+		String usage = "java " + className + " <pathToRDF> <root_directory> <index_dir> <mdProviderURL> --lang=<language>\n" +
+				"java " + className + " <pathtoRDF>  <root_directory> <index_dir>  <mdProviderURL> --langfile=<languageFile>\n";
+
+		if (args.length != 4 && args.length != 5) {
+			System.err.println("Usage: " + usage);
+			System.exit(1);
+		}
+
+		final File docDir = new File(args[1]);
+		if (!docDir.exists() || !docDir.canRead()) {
+			System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path");
+			System.exit(1);
+		}
+
+		final File rdfFile = new File(args[0]);
+		if (!rdfFile.exists() || !rdfFile.canRead()) {
+			System.out.println("RDFFile directory '" +rdfFile.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
+			System.exit(1);
+		}
+
+		final File index_dir = new File(args[2]);
+		final String mdProviderUrl = args[3];
+
+		if (args.length == 4) {
+			doTheHarvest(rdfFile, docDir, index_dir, mdProviderUrl, null);
+			return;
+		}
+
+		// Split on the first '=' only: "--lang=de" yields exactly {"--lang", "de"},
+		// and a value that itself contains '=' is kept intact.
+		final String[] options = args[4].split("=", 2);
+		if (options.length != 2 || options[1].length() == 0) {
+			System.err.println("wrong options:" + args[4]);
+			System.exit(1);
+		}
+
+		if (options[0].equals("--lang")) {
+			doTheHarvestLanguage(rdfFile, docDir, index_dir, mdProviderUrl, options[1]);
+		} else if (options[0].equals("--langfile")) {
+			doTheHarvest(rdfFile, docDir, index_dir, mdProviderUrl, options[1]);
+		} else {
+			System.err.println("wrong options:" + options[0]);
+			System.exit(1);
+		}
+	}
+
+	/** Harvests {@code rdfFile}, handing {@code languageFile} (may be null) and a null language to the harvester. */
+	private static void doTheHarvest(File rdfFile, File docDir,File index_dir,
+			String mdProviderUrl, String languageFile) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
+		runHarvest(new OCRHarvesterThreaded(docDir, index_dir, languageFile, mdProviderUrl, null), rdfFile);
+	}
+
+	/** Harvests {@code rdfFile}, handing the language {@code lang} and a null language file to the harvester. */
+	private static void doTheHarvestLanguage(File rdfFile, File docDir,File index_dir,
+			String mdProviderUrl, String lang) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
+		runHarvest(new OCRHarvesterThreaded(docDir, index_dir, null, mdProviderUrl, lang), rdfFile);
+	}
+
+	/** Shared harvest step: a JDOMException is reported as a stack trace only and is not rethrown. */
+	private static void runHarvest(OCRHarvesterThreaded harvester, File rdfFile) throws CorruptIndexException, LockObtainFailedException, IOException, InterruptedException {
+		try {
+			harvester.harvestFromRDF(rdfFile.getAbsolutePath());
+		} catch (JDOMException e) {
+			e.printStackTrace();
+		}
+	}
+}