Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/HarvesterThreaded.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/HarvesterThreaded.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,311 @@ +/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester; +/* */ +/* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; +/* */ import java.io.BufferedReader; +/* */ import java.io.File; +/* */ import java.io.FileNotFoundException; +/* */ import java.io.FileReader; +/* */ import java.io.IOException; +/* */ import java.io.PrintStream; +/* */ import java.util.ArrayList; +/* */ import java.util.Arrays; +/* */ import java.util.Date; +/* */ import java.util.HashMap; +/* */ import java.util.List; +/* */ import org.apache.lucene.analysis.de.GermanAnalyzer; +/* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer; +/* */ import org.apache.lucene.analysis.standard.StandardAnalyzer; +/* */ import org.apache.lucene.index.CorruptIndexException; +/* */ import org.apache.lucene.store.LockObtainFailedException; +/* */ import org.apache.lucene.util.Version; +/* */ import org.jdom.Document; +/* */ import org.jdom.Element; +/* */ import org.jdom.JDOMException; +/* */ import org.jdom.input.SAXBuilder; +/* */ import org.jdom.xpath.XPath; +/* */ +/* */ public class HarvesterThreaded +/* */ { +/* */ private static final boolean DEBUG = false; + private static final int MAXFILES = 100; // only used if DEBUG is true +/* 75 */ protected static ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" })); +/* */ +/* 77 */ protected static ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" })); +/* 78 */ protected static boolean indexMetaPriority = false; +/* */ +/* 81 */ private static String specialMode = ""; +/* 82 */ protected static int maxThread = 30; +/* */ protected File docDir; +/* */ protected File index_dir; +/* 88 */ protected HashMap<String, String> textLanguage = null; +/* 89 */ protected HashMap<String, String> languageToISO = new HashMap(); +/* 90 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); +/* */ +/* 92 */ private int counter = 0; +/* */ protected String languageFileName; +/* 99 */ protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread]; +/* 100 */ private int filecount = 0; +/* */ +/* 102 */ protected String mdProviderUrl = null; +/* */ private String preferedLanguage; +/* 106 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap(); +/* */ +/* */ public HarvesterThreaded() +/* */ { +/* */ } +/* */ +/* */ public HarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang) +/* */ throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 119 */ this.docDir = docDir; +/* 120 */ this.languageFileName = languageFileName; +/* 121 */ this.preferedLanguage = lang; +/* */ +/* 133 */ this.mdProviderUrl = mdProviderUrl; +/* */ +/* 135 */ this.index_dir = index_dir; +/* */ +/* 137 */ for (int i = 0; i < maxThread; ++i) +/* */ { +/* 139 */ this.mythreads[i] = null; +/* */ } +/* */ +/* 142 */ init_languages(); +/* */ } +/* */ +/* */ private void init_languages() { +/* 146 */ this.languageToISO.put("German", "de"); +/* 147 */ this.languageToISO.put("French", "fr"); +/* 148 */ this.languageToISO.put("English", "en"); +/* 149 */ this.languageToISO.put("German-f", "de-f"); +/* */ +/* 151 */ this.supportedLanguageFolder.put("deu", "de"); +/* 152 */ this.supportedLanguageFolder.put("deu-f", "de"); +/* 153 */ this.supportedLanguageFolder.put("fra", "fr"); +/* 154 */ this.supportedLanguageFolder.put("eng", "en"); +/* 155 */ this.supportedLanguageFolder.put("lic", "la"); +/* */ try +/* */ { +/* 158 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 159 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 160 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 161 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 162 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* */ +/* 164 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 165 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* */ } catch (CorruptIndexException e) { +/* 167 */ e.printStackTrace(); +/* 168 */ System.exit(1); +/* */ } catch (LockObtainFailedException e) { +/* 170 */ e.printStackTrace(); +/* 171 */ System.exit(1); +/* */ } catch (IOException e) { +/* 173 */ e.printStackTrace(); +/* 174 */ System.exit(1); +/* */ } +/* */ } +/* */ +/* */ public HarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 180 */ this(docDir, index_dir, null, mdProviderUrl, null); +/* */ } +/* */ +/* */ protected HashMap<String, String> loadLanguages() +/* */ { +/* 187 */ File languageFile = new File(this.docDir + "/" + this.languageFileName); +/* 188 */ String languageFilePath = this.docDir + "/" + this.languageFileName; +/* 189 */ HashMap languages = new HashMap(); +/* 190 */ boolean relativ = true; +/* 191 */ if (this.languageFileName == null) +/* 192 */ return null; +/* 193 */ if (!languageFile.exists()) +/* */ { +/* 195 */ languageFile = new File(this.languageFileName); +/* 196 */ languageFilePath = this.languageFileName; +/* 197 */ relativ = false; +/* 198 */ if (!languageFile.exists()) +/* 199 */ return null; +/* */ } +/* */ BufferedReader in; +/* */ try { +/* 203 */ in = new BufferedReader(new FileReader(languageFilePath)); +/* */ } catch (FileNotFoundException e) { +/* 205 */ return null; +/* */ } +/* */ +/* 208 */ String zeile = null; +/* */ try { +/* 210 */ while ((zeile = in.readLine()) != null) { +/* 211 */ String[] splitted = zeile.replace("\"", "").split("[,]"); +/* 212 */ if (splitted.length == 2) +/* 213 */ if (relativ) +/* 214 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]); +/* */ else +/* 216 */ languages.put(splitted[0], splitted[1]); +/* */ } +/* */ } +/* */ catch (IOException e) { +/* 220 */ e.printStackTrace(); +/* 221 */ return null; +/* */ } +/* */ +/* 224 */ return languages; +/* */ } +/* */ +/* */ public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException { +/* 228 */ Date start = new Date(); +/* 229 */ boolean create = true; +/* */ try +/* */ { +/* 240 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); +/* 241 */ ArrayList files = getFileListFromRDF(rdffilepath); +/* 242 */ indexDocs(files); +/* 243 */ System.out.println("Optimizing..."); +/* 244 */ this.languageAnalyzers.optimize(); +/* 245 */ this.languageAnalyzers.close(); +/* */ +/* 247 */ Date end = new Date(); +/* 248 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); +/* */ } +/* */ catch (IOException e) { +/* 251 */ System.out.println(" caught a " + e.getClass() + +/* 252 */ "\n with message: " + e.getMessage()); +/* */ } +/* */ } +/* */ +/* */ private ArrayList<String> getFileListFromRDF(String rdffilepath) +/* */ throws JDOMException, IOException +/* */ { +/* 260 */ ArrayList ret = new ArrayList(); +/* 261 */ SAXBuilder builder = new SAXBuilder(); +/* */ +/* 263 */ Document doc = builder.build(rdffilepath); +/* */ +/* 265 */ Element el = doc.getRootElement(); +/* */ +/* 267 */ XPath xpath = XPath.newInstance("//MPIWG:archive-path"); +/* 268 */ xpath.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"); +/* 269 */ List<Element> paths = xpath.selectNodes(el); +/* 270 */ for (Element path : paths) { +/* 271 */ ret.add(path.getText()); +/* */ } +/* */ +/* 274 */ return ret; +/* */ } +/* */ +/* */ public void harvestFolder() throws InterruptedException { +/* 278 */ Date start = new Date(); +/* 279 */ boolean create = true; +/* */ try +/* */ { +/* 290 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); +/* 291 */ indexDocs(this.docDir); +/* 292 */ System.out.println("Optimizing..."); +/* 293 */ this.languageAnalyzers.optimize(); +/* 294 */ this.languageAnalyzers.close(); +/* */ +/* 296 */ Date end = new Date(); +/* 297 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); +/* */ } +/* */ catch (IOException e) { +/* 300 */ System.out.println(" caught a " + e.getClass() + +/* 301 */ "\n with message: " + e.getMessage()); +/* */ } +/* */ } +/* */ +/* */ private void indexDocs(ArrayList<String> files) +/* */ throws IOException, InterruptedException +/* */ { +/* 308 */ for (String filename : files) +/* */ { +/* 310 */ indexDocs(new File(this.docDir.getAbsolutePath() + filename)); + if ((DEBUG==true) & (this.filecount>MAXFILES)) + break; +/* */ } +/* */ } +/* */ +/* */ void indexDocs(File file) +/* */ throws IOException, InterruptedException +/* */ { +/* 317 */ if (!file.canRead()) +/* */ return; +/* 319 */ if (file.isDirectory()) +/* */ { +/* 321 */ if ((DEBUG==true) && (this.filecount>MAXFILES)) + return; +/* 325 */ String[] files = file.list(); +/* */ +/* 327 */ String folderName = file.getName(); +/* 328 */ if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 : 1)) != 0) +/* 329 */ for (int i = 0; i < files.length; ++i) +/* */ { +/* 332 */ indexDocs(new File(file, files[i])); + if ((DEBUG==true) && (this.filecount>MAXFILES)) + break; +/* */ } +/* */ } +/* 335 */ else if (isTextFile(file)) +/* */ { +/* 338 */ processFile(file); +/* */ } +/* */ else +/* */ { +/* 342 */ System.out.println("not adding " + file); +/* */ } +/* */ } +/* */ +/* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 348 */ int freeThread = -1; +/* 349 */ while (freeThread == -1) +/* */ { +/* 351 */ freeThread = waitForFreeThread(); +/* */ } +/* */ +/* 355 */ if (this.textLanguage == null) +/* 356 */ this.textLanguage = loadLanguages(); +/* 357 */ this.mythreads[freeThread] = new ProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder); +/* 358 */ this.mythreads[freeThread].start(); +/* 359 */ System.out.println("New process started:" + freeThread); +/* */ } +/* */ +/* */ protected int waitForFreeThread() +/* */ { +/* 367 */ for (int i = 0; i < maxThread; ++i) +/* */ { +/* 369 */ if (this.mythreads[i] == null) +/* 370 */ return i; +/* 371 */ if (!this.mythreads[i].done) +/* */ continue; +/* 373 */ this.filecount += 1; +/* 374 */ System.out.println("filecount:" + this.filecount); +/* 375 */ return i; +/* */ } +/* */ +/* 378 */ return -1; +/* */ } +/* */ +/* */ private boolean isTextFile(File file) +/* */ { +/* 392 */ String fn = file.getName(); +/* */ +/* 394 */ String[] splitted = fn.split("[.]"); +/* */ +/* 396 */ String ext = ""; +/* */ +/* 398 */ if (splitted.length > 1) +/* */ { +/* 400 */ ext = splitted[(splitted.length - 1)]; +/* */ } +/* */ +/* 403 */ return fileTypesToIndex.contains(ext); +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded + * JD-Core Version: 0.5.4 + */ \ No newline at end of file