diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/DocHarvesterThreaded.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/DocHarvesterThreaded.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,577 @@
+package de.mpiwg.dwinter.fulltextIndexer.harvester;
+
+/*   Harvests each complete book into a single entry.
+ * 
+ *   */
+import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
+
+import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
+
+import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
+
+import java.io.BufferedReader;
+
+import java.io.File;
+
+import java.io.FileNotFoundException;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+
+import java.io.IOException;
+
+import java.io.PrintStream;
+
+import java.util.ArrayList;
+
+import java.util.Arrays;
+
+import java.util.Date;
+
+import java.util.HashMap;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import org.apache.lucene.index.CorruptIndexException;
+
+import org.apache.lucene.store.LockObtainFailedException;
+
+import org.apache.lucene.util.Version;
+import org.apache.ws.commons.serialize.XMLWriterImpl;
+
+import org.jdom.Document;
+
+import org.jdom.Element;
+
+import org.jdom.JDOMException;
+
+import org.jdom.input.SAXBuilder;
+import org.jdom.xpath.XPath;
+import org.w3c.dom.DocumentFragment;
+import org.xml.sax.SAXException;
+
+import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl;
+
+public class DocHarvesterThreaded {
+	/** Debug switch: when true, harvesting stops once {@code filecount} exceeds {@link #MAXFILES}. */
+	private static final boolean DEBUG = false;
+	/** Limit compared against {@code filecount}; only consulted while {@link #DEBUG} is true. */
+	private static final int MAXFILES = 3;
+
+	//private static final String PREFIX = "/tmp/composed/files";
+	/** Root directory below which the composed per-folder documents are written. */
+	private static final String PREFIX = "/Volumes/data/composed/files";
+	/** File name of the composed document that is created for each folder. */
+	private static final String COMPOSEDFN = "doc.xml";
+	/** When true, an already existing composed document is rebuilt instead of being reused. */
+	private static final boolean CREATE_NEW = false;
+
+	/** File extensions (without the dot) of the files that are merged into a composed document. */
+	protected static ArrayList<String> fileTypesToIndex = new ArrayList<String>(
+			Arrays.asList("xml"));
+
+	/** Names of folders that are skipped while walking the document tree. */
+	protected static ArrayList<String> excludeFolders = new ArrayList<String>(
+			Arrays.asList("OCR"));
+	// Not read anywhere in this class.
+	protected static boolean indexMetaPriority = false;
+
+	// Not read anywhere in this class.
+	private static String specialMode = "";
+	/** Number of worker slots in {@link #mythreads}. */
+	protected static int maxThread = 30;
+	/** Root directory of the documents to harvest. */
+	protected File docDir;
+	/** Directory handed to every {@code LanguageAnalyzer} created in {@code init_languages()}. */
+	protected File index_dir;
+	/** Result of {@code loadLanguages()}; loaded lazily by {@code processFile(File)}. */
+	protected HashMap<String, String> textLanguage = null;
+	/** Maps language names ("German") to short codes ("de"); filled in {@code init_languages()}. */
+	protected HashMap<String, String> languageToISO = new HashMap<String, String>();
+	/** The analyzers registered in {@code init_languages()}. */
+	protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
+
+	// Not read anywhere in this class.
+	private int counter = 0;
+	/** Name or path of the language file read by {@code loadLanguages()}; may be null. */
+	protected String languageFileName;
+	/** Worker slots; an entry is null until a worker has been started in that slot. */
+	protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
+	/** Incremented in {@code waitForFreeThread()} each time the slot of a finished worker is handed out. */
+	private int filecount = 0;
+
+	/** Passed unchanged to every {@code ProcessFileThread}. */
+	protected String mdProviderUrl = null;
+	/** Passed unchanged to every {@code ProcessFileThread}. */
+	private String preferedLanguage;
+	/** Maps folder language codes ("deu") to short codes ("de"); filled in {@code init_languages()}. */
+	protected HashMap<String, String> supportedLanguageFolder = new HashMap<String, String>();
+	private int completedFiles = 0; // counter for all files completed and indexed
+
+	/**
+	 * Creates a harvester without any configuration: no directories are set
+	 * and {@code init_languages()} is not run.
+	 */
+	public DocHarvesterThreaded() {
+		super();
+	}
+
+	/**
+	 * Creates a fully configured harvester and registers the language
+	 * analyzers.
+	 *
+	 * @param docDir           root directory of the documents to harvest
+	 * @param index_dir        directory handed to the language analyzers
+	 * @param languageFileName name or path of the language file, may be null
+	 * @param mdProviderUrl    forwarded to every worker thread
+	 * @param lang             preferred language, forwarded to every worker thread
+	 */
+	public DocHarvesterThreaded(File docDir, File index_dir,
+			String languageFileName, String mdProviderUrl, String lang)
+			throws CorruptIndexException, LockObtainFailedException,
+			IOException {
+		this.docDir = docDir;
+		this.index_dir = index_dir;
+		this.languageFileName = languageFileName;
+		this.mdProviderUrl = mdProviderUrl;
+		this.preferedLanguage = lang;
+
+		// Mark every worker slot as unused.
+		int slot = 0;
+		while (slot < maxThread) {
+			this.mythreads[slot] = null;
+			slot++;
+		}
+
+		// Must run after index_dir is set: the analyzers are created with it.
+		init_languages();
+	}
+
+	private void init_languages() {
+		/* 146 */this.languageToISO.put("German", "de");
+		/* 147 */this.languageToISO.put("French", "fr");
+		/* 148 */this.languageToISO.put("English", "en");
+		/* 149 */this.languageToISO.put("German-f", "de-f");
+
+		/* 151 */this.supportedLanguageFolder.put("deu", "de");
+		/* 152 */this.supportedLanguageFolder.put("deu-f", "de");
+		/* 153 */this.supportedLanguageFolder.put("fra", "fr");
+		/* 154 */this.supportedLanguageFolder.put("eng", "en");
+		/* 155 */this.supportedLanguageFolder.put("lic", "la");
+		try {
+			/* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de",
+					new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
+			/* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en",
+					new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+			/* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr",
+					new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
+			/* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la",
+					new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+
+			/* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all",
+					new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+			/* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph",
+					new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+		} catch (CorruptIndexException e) {
+			/* 167 */e.printStackTrace();
+			/* 168 */System.exit(1);
+		} catch (LockObtainFailedException e) {
+			/* 170 */e.printStackTrace();
+			/* 171 */System.exit(1);
+		} catch (IOException e) {
+			/* 173 */e.printStackTrace();
+			/* 174 */System.exit(1);
+		}
+	}
+
+	/**
+	 * Convenience constructor: same as the five-argument constructor with
+	 * neither a language file nor a preferred language.
+	 *
+	 * @param docDir        root directory of the documents to harvest
+	 * @param index_dir     directory handed to the language analyzers
+	 * @param mdProviderUrl forwarded to every worker thread
+	 */
+	public DocHarvesterThreaded(File docDir, File index_dir,
+			String mdProviderUrl)
+			throws CorruptIndexException, LockObtainFailedException, IOException {
+		this(docDir, index_dir, (String) null, mdProviderUrl, (String) null);
+	}
+
+	/**
+	 * Reads the language file into a map.
+	 *
+	 * The file is looked up first relative to {@code docDir} and then under
+	 * {@code languageFileName} as given. Double quotes are stripped from each
+	 * line and the line is split at commas; only lines that yield exactly two
+	 * fields are used. If the file was found relative to {@code docDir}, the
+	 * first field is prefixed with {@code docDir + "/"} before it is used as key.
+	 *
+	 * @return the map from first field to second field, or null if no language
+	 *         file name is configured, the file does not exist, or reading fails
+	 */
+	protected HashMap<String, String> loadLanguages() {
+		if (this.languageFileName == null) {
+			return null;
+		}
+
+		String languageFilePath = this.docDir + "/" + this.languageFileName;
+		boolean relativ = true;
+		if (!new File(languageFilePath).exists()) {
+			// Fall back to the name exactly as configured.
+			languageFilePath = this.languageFileName;
+			relativ = false;
+			if (!new File(languageFilePath).exists()) {
+				return null;
+			}
+		}
+
+		HashMap<String, String> languages = new HashMap<String, String>();
+		BufferedReader in = null;
+		try {
+			in = new BufferedReader(new FileReader(languageFilePath));
+			String zeile;
+			while ((zeile = in.readLine()) != null) {
+				String[] splitted = zeile.replace("\"", "").split("[,]");
+				if (splitted.length != 2) {
+					continue;
+				}
+				String key = relativ ? this.docDir + "/" + splitted[0]
+						: splitted[0];
+				languages.put(key, splitted[1]);
+			}
+			return languages;
+		} catch (FileNotFoundException e) {
+			// The file vanished between the exists() check and opening it.
+			return null;
+		} catch (IOException e) {
+			e.printStackTrace();
+			return null;
+		} finally {
+			// The reader was previously never closed.
+			if (in != null) {
+				try {
+					in.close();
+				} catch (IOException ignored) {
+					// Nothing useful can be done; the content was only read.
+				}
+			}
+		}
+	}
+
+	/**
+	 * Harvests every archive path listed in an RDF file, then optimizes and
+	 * closes the language analyzers and prints the elapsed time.
+	 *
+	 * An {@link IOException} raised on the way is reported on standard output
+	 * and not rethrown.
+	 *
+	 * @param rdffilepath location of the RDF file, handed to the JDOM builder
+	 * @throws JDOMException if the RDF file cannot be parsed or queried
+	 */
+	public void harvestFromRDF(String rdffilepath) throws InterruptedException,
+			JDOMException {
+		long startMillis = new Date().getTime();
+		try {
+			System.out.println("Indexing to directory '"
+					+ this.index_dir + "'...");
+			indexDocs(getFileListFromRDF(rdffilepath));
+			System.out.println("Optimizing...");
+			this.languageAnalyzers.optimize();
+			this.languageAnalyzers.close();
+
+			long endMillis = new Date().getTime();
+			System.out.println(endMillis - startMillis
+					+ " total milliseconds");
+		} catch (IOException e) {
+			System.out.println(" caught a " + e.getClass()
+					+ "\n with message: " + e.getMessage());
+		}
+	}
+
+	/**
+	 * Collects the text of every {@code archive-path} element (namespace
+	 * {@code http://www.mpiwg-berlin.mpg.de/ns/mpiwg}) found in the RDF file.
+	 *
+	 * @param rdffilepath location of the RDF file, handed to the JDOM builder
+	 * @return the element texts in the order the XPath query returns them
+	 * @throws JDOMException if the file cannot be parsed or the query fails
+	 * @throws IOException   if the file cannot be read
+	 */
+	private ArrayList<String> getFileListFromRDF(String rdffilepath)
+			throws JDOMException, IOException {
+		ArrayList<String> ret = new ArrayList<String>();
+
+		Document doc = new SAXBuilder().build(rdffilepath);
+		Element root = doc.getRootElement();
+
+		XPath xpath = XPath.newInstance("//MPIWG:archive-path");
+		xpath.addNamespace("MPIWG",
+				"http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
+
+		// selectNodes returns an untyped list; the query selects elements only.
+		@SuppressWarnings("unchecked")
+		List<Element> paths = xpath.selectNodes(root);
+		for (Element path : paths) {
+			ret.add(path.getText());
+		}
+
+		return ret;
+	}
+
+	/**
+	 * Harvests the complete tree below {@code docDir}, then optimizes and
+	 * closes the language analyzers and prints the elapsed time.
+	 *
+	 * An {@link IOException} raised on the way is reported on standard output
+	 * and not rethrown.
+	 */
+	public void harvestFolder() throws InterruptedException {
+		long startMillis = new Date().getTime();
+		try {
+			System.out.println("Indexing to directory '"
+					+ this.index_dir + "'...");
+			indexDocs(this.docDir);
+			System.out.println("Optimizing...");
+			this.languageAnalyzers.optimize();
+			this.languageAnalyzers.close();
+
+			long endMillis = new Date().getTime();
+			System.out.println(endMillis - startMillis
+					+ " total milliseconds");
+		} catch (IOException e) {
+			System.out.println(" caught a " + e.getClass()
+					+ "\n with message: " + e.getMessage());
+		}
+	}
+
+	private void indexDocs(ArrayList<String> files) throws IOException,
+			InterruptedException {
+		/* 308 */for (String filename : files) {
+			/* 310 */indexDocs(new File(this.docDir.getAbsolutePath()
+					+ filename));
+			if ((DEBUG == true) & (this.filecount > MAXFILES))
+				break;
+		}
+	}
+
+	/**
+	 * Harvests one folder: descends into sub-folders, merges the folder's own
+	 * text files into one composed document and hands that document to a
+	 * worker thread.
+	 *
+	 * The composed document is rebuilt when {@code CREATE_NEW} is set or when
+	 * it does not exist yet; otherwise the existing one is processed as it is.
+	 * A rebuilt document that received no text file is deleted again and not
+	 * processed.
+	 *
+	 * Paths that cannot be listed (plain files) and folders named in
+	 * {@code excludeFolders} are reported as "not adding" and left untouched.
+	 * This check now happens before the composed document is created, so no
+	 * unfinished document stub is left behind for them.
+	 *
+	 * @param file the folder to harvest
+	 */
+	void indexDocs(File file) throws IOException, InterruptedException {
+		if (!file.canRead()) {
+			return;
+		}
+		if (DEBUG && this.filecount > MAXFILES) {
+			return;
+		}
+
+		String[] files = file.list();
+		if (files == null || excludeFolders.contains(file.getName())) {
+			System.out.println("not adding " + file);
+			return;
+		}
+
+		boolean createNew = CREATE_NEW || !checkFileExists(file);
+		// An existing composed document is assumed to have content.
+		boolean fileStillEmpty = createNew;
+		if (createNew) {
+			clearFile(file); // start the composed document from scratch
+		}
+
+		for (String name : files) {
+			File nextFile = new File(file, name);
+
+			if (nextFile.isDirectory()) {
+				// Sub-folder: descend to the next level.
+				indexDocs(nextFile);
+			} else if (createNew && isTextFile(nextFile)) {
+				fileStillEmpty = false; // the composed document now has content
+				compose(file, nextFile); // append the file to the composed document
+			}
+
+			if (DEBUG && this.filecount > MAXFILES) {
+				break;
+			}
+		}
+
+		if (createNew) {
+			if (fileStillEmpty) {
+				deleteComposedFile(file); // nothing was added, so remove it
+			} else {
+				finishFile(file);
+			}
+		}
+
+		if (!fileStillEmpty) {
+			processCompleteFile(file);
+		}
+	}
+
+	/**
+	 * Appends the closing {@code </document>} tag to the composed document of
+	 * the given folder. I/O errors are printed and otherwise ignored.
+	 *
+	 * @param folder the source folder whose composed document is finished
+	 */
+	private void finishFile(File folder) {
+		File cf = getComposedFile(folder);
+		System.out.println();
+		FileWriter fw = null;
+		try {
+			System.out.println("finish file:" + cf.getCanonicalPath());
+			fw = new FileWriter(cf, true);
+			fw.write("</document>");
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			// Close even when the write failed, so the file handle is not leaked.
+			if (fw != null) {
+				try {
+					fw.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+	
+	private boolean deleteComposedFile(File folder) {
+		File cf = getComposedFile(folder);
+		try {
+			System.out.println("file deleted, because empty:" + cf.getCanonicalPath());
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+		return cf.delete();
+	}
+
+
+	/**
+	 * Logs the running count of completed documents and passes the composed
+	 * document of the given folder to {@code processFile(File)}. Failures are
+	 * printed and otherwise ignored.
+	 *
+	 * @param folder the source folder whose composed document is processed
+	 */
+	private void processCompleteFile(File folder) {
+		int number = completedFiles;
+		completedFiles = number + 1;
+		System.out.println("Completed File:" + number);
+
+		File composed = getComposedFile(folder);
+		try {
+			processFile(composed);
+		} catch (IOException e) {
+			// Also covers CorruptIndexException and LockObtainFailedException,
+			// which are IOException subclasses and were handled identically.
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Tells whether a composed document already exists for the given folder.
+	 *
+	 * @param folder the source folder
+	 * @return true if the composed document exists
+	 */
+	private boolean checkFileExists(File folder) {
+		return getComposedFile(folder).exists();
+	}
+
+	/**
+	 * Replaces the composed document of the given folder with a fresh one
+	 * that contains only the opening {@code <document>} tag. Missing parent
+	 * directories are created. I/O errors are printed and otherwise ignored.
+	 *
+	 * @param folder the source folder whose composed document is reset
+	 */
+	private void clearFile(File folder) {
+		File cf = getComposedFile(folder);
+		cf.delete();
+		FileWriter fw = null;
+		try {
+			File dir = cf.getParentFile();
+			if (!dir.exists()) {
+				dir.mkdirs();
+			}
+
+			cf.createNewFile();
+
+			fw = new FileWriter(cf);
+			fw.write("<document>");
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			// Close even when the write failed, so the file handle is not leaked.
+			if (fw != null) {
+				try {
+					fw.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+
+	/**
+	 * Appends the filtered content of one file, followed by a
+	 * {@code <pb name="..."/>} marker carrying the file name, to the composed
+	 * document of the folder. The output is written as UTF-8.
+	 *
+	 * If the file cannot be filtered because of a transformer, parser
+	 * configuration or SAX error, only the marker is appended. I/O errors are
+	 * printed and otherwise ignored.
+	 *
+	 * @param folder the source folder whose composed document is extended
+	 * @param file   the file to append
+	 */
+	private void compose(File folder, File file) {
+		File cf = getComposedFile(folder);
+		OutputStreamWriter fw = null;
+		try {
+			System.out.println("Adding" + file.getCanonicalPath());
+
+			// Filter first, so the composed document is only opened once there
+			// is something to write.
+			String filteredDocument = "";
+			try {
+				filteredDocument = getFilteredFile(file);
+			} catch (TransformerException e) {
+				filteredDocument = "";
+			} catch (ParserConfigurationException e) {
+				e.printStackTrace();
+			} catch (SAXException e) {
+				e.printStackTrace();
+			}
+
+			fw = new OutputStreamWriter(new FileOutputStream(cf, true),
+					"utf-8");
+			fw.append(filteredDocument);
+			fw.write("<pb name=\"");
+			fw.write(file.getName());
+			fw.write("\"/>");
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			// Close (and flush) even after a failure, so the handle is not leaked.
+			if (fw != null) {
+				try {
+					fw.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+
+	private String getFilteredFile(File file) throws IOException,
+			TransformerException, ParserConfigurationException, SAXException {
+
+		// String txt = IOUtils.toString(new FileInputStream(file));
+		// get rid of the entities
+		TransformerFactory tf = TransformerFactory.newInstance();
+		Transformer t = tf.newTransformer();
+
+		
+		//OutputStream output = new ByteArrayOutputStream();
+		
+		//BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8"));
+
+		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+		dbf.setNamespaceAware(true);
+		dbf.setValidating(false);
+		DocumentBuilder db = dbf.newDocumentBuilder();
+		db.setEntityResolver(new MyResolver());
+		org.w3c.dom.Document doc = db.parse(file);
+		
+		StringWriter sw = new StringWriter();
+		StreamResult sr = new StreamResult(sw);
+		
+		org.w3c.dom.Document tgtDoc = db.newDocument(); 
+		DocumentFragment fragment = tgtDoc.createDocumentFragment(); 
+		DOMResult tgtDom = new DOMResult( fragment ); 
+		
+		t.setOutputProperty(OutputKeys.ENCODING, "utf-8");
+		t.transform(new DOMSource(doc), sr);
+		t.transform(new DOMSource(doc), tgtDom);
+		
+		String txt = sw.toString();
+		
+	
+
+		Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL);
+		Matcher m = p.matcher(txt);
+		if (m.find())
+			if (m.groupCount() > 0) {
+				return m.group(1);
+			}
+		return "";
+	}
+
+	/**
+	 * Maps a source folder to the location of its composed document:
+	 * {@code PREFIX}, then the folder's canonical path, then {@code "/"} and
+	 * {@code COMPOSEDFN}.
+	 *
+	 * @param folder the source folder
+	 * @return the composed document location, or null if the canonical path
+	 *         of the folder cannot be determined
+	 */
+	private File getComposedFile(File folder) {
+		String canonical;
+		try {
+			canonical = folder.getCanonicalPath();
+		} catch (IOException e) {
+			e.printStackTrace();
+			return null;
+		}
+		StringBuilder target = new StringBuilder(PREFIX);
+		target.append(canonical).append("/").append(COMPOSEDFN);
+		return new File(target.toString());
+	}
+
+	protected void processFile(File file) throws CorruptIndexException,
+			LockObtainFailedException, IOException {
+		/* 348 */int freeThread = -1;
+		/* 349 */while (freeThread == -1) {
+			/* 351 */freeThread = waitForFreeThread();
+		}
+
+		/* 355 */if (this.textLanguage == null)
+			/* 356 */this.textLanguage = loadLanguages();
+		/* 357 */this.mythreads[freeThread] = new ProcessFileThread(
+				this.languageAnalyzers, file, this.languageFileName,
+				this.textLanguage, this.mdProviderUrl, this.preferedLanguage,
+				this.languageToISO, this.supportedLanguageFolder);
+		/* 358 */this.mythreads[freeThread].start();
+		/* 359 */System.out.println("New process started:" + freeThread);
+	}
+
+	/**
+	 * Scans the worker slots once for one that can be used. Despite its name
+	 * this method does not block.
+	 *
+	 * A slot that was never used is returned right away. A slot whose worker
+	 * reports {@code done} is returned after {@code filecount} has been
+	 * incremented and logged.
+	 *
+	 * @return the index of a usable slot, or -1 if every slot is busy
+	 */
+	protected int waitForFreeThread() {
+		int slot = 0;
+		while (slot < maxThread) {
+			ProcessFileThread worker = this.mythreads[slot];
+			if (worker == null) {
+				return slot;
+			}
+			if (worker.done) {
+				this.filecount += 1;
+				System.out.println("filecount:" + this.filecount);
+				return slot;
+			}
+			slot++;
+		}
+		return -1;
+	}
+
+	/**
+	 * Tells whether the file's extension is listed in
+	 * {@code fileTypesToIndex}. The extension is the last piece of the name
+	 * after splitting it at dots; a name that yields fewer than two pieces
+	 * has the empty extension.
+	 *
+	 * @param file the file to test
+	 * @return true if the extension is listed
+	 */
+	private boolean isTextFile(File file) {
+		String[] pieces = file.getName().split("[.]");
+		String ext = (pieces.length > 1) ? pieces[pieces.length - 1] : "";
+		return fileTypesToIndex.contains(ext);
+	}
+
+}
+
+/*
+ * Location: /private/tmp/fulltextIndexer.jar Qualified Name:
+ * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version:
+ * 0.5.4
+ */
\ No newline at end of file