view src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/DocHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line source

package de.mpiwg.dwinter.fulltextIndexer.harvester;

/*   Harvests one complete book at a time into a single entry.
 * 
 *   */
import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;

import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;

import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;

import java.io.IOException;

import java.io.PrintStream;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Date;

import java.util.HashMap;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.de.GermanAnalyzer;

import org.apache.lucene.analysis.fr.FrenchAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.CorruptIndexException;

import org.apache.lucene.store.LockObtainFailedException;

import org.apache.lucene.util.Version;
import org.apache.ws.commons.serialize.XMLWriterImpl;

import org.jdom.Document;

import org.jdom.Element;

import org.jdom.JDOMException;

import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.SAXException;

import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl;

public class DocHarvesterThreaded {
	/** When true, harvesting stops early once filecount exceeds MAXFILES. */
	private static final boolean DEBUG = false;
	/** Threshold compared against filecount when DEBUG is enabled. */
	private static final int MAXFILES = 3;

	//private static final String PREFIX = "/tmp/composed/files";
	/** Path prefix under which the composed per-folder documents are written. */
	private static final String PREFIX = "/Volumes/data/composed/files";
	/** File name of the composed document created for each harvested folder. */
	private static final String COMPOSEDFN = "doc.xml";
	/** When true, composed documents are rebuilt even if they already exist. */
	private static final boolean CREATE_NEW = false;

	/** File extensions (without the dot, case-sensitive) that are composed. */
	protected static ArrayList<String> fileTypesToIndex = new ArrayList<String>(
			Arrays.asList("xml"));

	/** Folder names that are skipped during harvesting. */
	protected static ArrayList<String> excludeFolders = new ArrayList<String>(
			Arrays.asList("OCR"));
	// not referenced within this class
	protected static boolean indexMetaPriority = false;

	// not referenced within this class
	private static String specialMode = "";
	/** Number of worker slots, i.e. the size of the mythreads array. */
	protected static int maxThread = 30;
	/** Root directory of the documents to harvest. */
	protected File docDir;
	/** Index directory handed to every LanguageAnalyzer. */
	protected File index_dir;
	/** Lazily loaded result of loadLanguages(); null until first needed. */
	protected HashMap<String, String> textLanguage = null;
	/** Language name to ISO code, filled by init_languages(). */
	protected HashMap<String, String> languageToISO = new HashMap<String, String>();
	protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();

	// not referenced within this class
	private int counter = 0;
	/** Name (or path) of the language mapping file; may be null. */
	protected String languageFileName;
	/** Worker slots; a slot is reusable when it is null or its thread is done. */
	protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
	/** Number of finished worker slots that have been reclaimed so far. */
	private int filecount = 0;

	protected String mdProviderUrl = null;
	private String preferedLanguage;
	/** Language folder name to ISO code, filled by init_languages(). */
	protected HashMap<String, String> supportedLanguageFolder = new HashMap<String, String>();
	private int completedFiles = 0; // counter for all files completed and indexed

	/**
	 * Creates a harvester without document directory, index directory or
	 * metadata provider. This constructor does not call init_languages(), so
	 * no language analyzers are registered.
	 */
	public DocHarvesterThreaded() {
		super();
	}

	public DocHarvesterThreaded(File docDir, File index_dir,
			String languageFileName, String mdProviderUrl, String lang)
			throws CorruptIndexException, LockObtainFailedException,
			IOException {
		/* 119 */this.docDir = docDir;
		/* 120 */this.languageFileName = languageFileName;
		/* 121 */this.preferedLanguage = lang;

		/* 133 */this.mdProviderUrl = mdProviderUrl;

		/* 135 */this.index_dir = index_dir;

		/* 137 */for (int i = 0; i < maxThread; ++i) {
			/* 139 */this.mythreads[i] = null;
		}

		/* 142 */init_languages();
	}

	/**
	 * Fills the language lookup tables and registers one LanguageAnalyzer per
	 * supported index ("de", "en", "fr", "la", "all", "morph"). If an analyzer
	 * cannot be created the stack trace is printed and the JVM exits with
	 * status 1.
	 */
	private void init_languages() {
		// language folder name -> ISO code
		this.supportedLanguageFolder.put("deu", "de");
		this.supportedLanguageFolder.put("deu-f", "de");
		this.supportedLanguageFolder.put("fra", "fr");
		this.supportedLanguageFolder.put("eng", "en");
		this.supportedLanguageFolder.put("lic", "la");

		// language name -> ISO code
		this.languageToISO.put("German", "de");
		this.languageToISO.put("French", "fr");
		this.languageToISO.put("English", "en");
		this.languageToISO.put("German-f", "de-f");

		try {
			final File indexRoot = this.index_dir;
			this.languageAnalyzers.add(new LanguageAnalyzer("de",
					new GermanAnalyzer(Version.LUCENE_30), indexRoot));
			this.languageAnalyzers.add(new LanguageAnalyzer("en",
					new StandardAnalyzer(Version.LUCENE_30), indexRoot));
			this.languageAnalyzers.add(new LanguageAnalyzer("fr",
					new FrenchAnalyzer(Version.LUCENE_30), indexRoot));
			this.languageAnalyzers.add(new LanguageAnalyzer("la",
					new StandardAnalyzer(Version.LUCENE_30), indexRoot));

			this.languageAnalyzers.add(new LanguageAnalyzer("all",
					new StandardAnalyzer(Version.LUCENE_30), indexRoot));
			this.languageAnalyzers.add(new LanguageAnalyzer("morph",
					new StandardAnalyzer(Version.LUCENE_30), indexRoot));
		} catch (IOException e) {
			// CorruptIndexException and LockObtainFailedException are
			// IOException subclasses, so this single clause covers them too
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Convenience constructor: same as the five-argument constructor with no
	 * language file and no preferred language.
	 *
	 * @param docDir        root directory of the documents to harvest
	 * @param index_dir     index directory handed to every LanguageAnalyzer
	 * @param mdProviderUrl metadata provider URL passed to each worker
	 */
	public DocHarvesterThreaded(File docDir, File index_dir,
			String mdProviderUrl) throws CorruptIndexException,
			LockObtainFailedException, IOException {
		this(docDir, index_dir, (String) null, mdProviderUrl, (String) null);
	}

	/**
	 * Reads the language mapping file into a map.
	 *
	 * The file is looked up first as docDir + "/" + languageFileName and, if
	 * that does not exist, as languageFileName itself. Every line is stripped
	 * of double quotes and split at commas; lines that do not yield exactly
	 * two fields are ignored. When the file was found relative to docDir the
	 * key is prefixed with docDir + "/", otherwise the first field is used
	 * unchanged.
	 *
	 * @return the mapping from first field to second field, or null if no
	 *         language file is configured, it cannot be found, or reading it
	 *         fails
	 */
	protected HashMap<String, String> loadLanguages() {
		if (this.languageFileName == null) {
			return null;
		}

		boolean relativ = true;
		File languageFile = new File(this.docDir + "/" + this.languageFileName);
		if (!languageFile.exists()) {
			// fall back to interpreting the name as a path of its own
			languageFile = new File(this.languageFileName);
			relativ = false;
			if (!languageFile.exists()) {
				return null;
			}
		}

		BufferedReader in;
		try {
			// NOTE(review): FileReader decodes with the platform default charset
			in = new BufferedReader(new FileReader(languageFile));
		} catch (FileNotFoundException e) {
			return null;
		}

		HashMap<String, String> languages = new HashMap<String, String>();
		try {
			String zeile;
			while ((zeile = in.readLine()) != null) {
				String[] splitted = zeile.replace("\"", "").split("[,]");
				if (splitted.length != 2) {
					continue;
				}
				if (relativ) {
					languages.put(this.docDir + "/" + splitted[0], splitted[1]);
				} else {
					languages.put(splitted[0], splitted[1]);
				}
			}
			return languages;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			// the reader was previously never closed
			try {
				in.close();
			} catch (IOException ignored) {
				// nothing useful can be done if closing a reader fails
			}
		}
	}

	/**
	 * Harvests every archive path listed in the given RDF file, then
	 * optimizes and closes the indexes and prints the elapsed time.
	 *
	 * An IOException during harvesting is reported on standard output and
	 * ends the run; in that case the indexes are not optimized or closed.
	 *
	 * @param rdffilepath location of the RDF file, as accepted by
	 *                    SAXBuilder.build(String)
	 * @throws InterruptedException if interrupted while waiting for the
	 *                              worker threads to finish
	 * @throws JDOMException        if the RDF file cannot be parsed
	 */
	public void harvestFromRDF(String rdffilepath) throws InterruptedException,
			JDOMException {
		Date start = new Date();
		try {
			System.out.println("Indexing to directory '"
					+ this.index_dir + "'...");
			ArrayList<String> files = getFileListFromRDF(rdffilepath);
			indexDocs(files);

			// Workers started by processFile() may still be running here; the
			// indexes must not be optimized or closed underneath them.
			// NOTE(review): relies on ProcessFileThread being a
			// java.lang.Thread (it is launched with start()) - confirm.
			for (int i = 0; i < this.mythreads.length; ++i) {
				ProcessFileThread worker = this.mythreads[i];
				if (worker != null) {
					worker.join();
				}
			}

			System.out.println("Optimizing...");
			this.languageAnalyzers.optimize();
			this.languageAnalyzers.close();

			Date end = new Date();
			System.out.println(end.getTime() - start.getTime()
					+ " total milliseconds");
		} catch (IOException e) {
			System.out.println(" caught a " + e.getClass()
					+ "\n with message: " + e.getMessage());
		}
	}

	/**
	 * Collects the text of every MPIWG:archive-path element in an RDF file.
	 *
	 * @param rdffilepath location of the RDF file, as accepted by
	 *                    SAXBuilder.build(String)
	 * @return the element texts in document order
	 * @throws JDOMException if the file cannot be parsed or the XPath fails
	 * @throws IOException   if the file cannot be read
	 */
	private ArrayList<String> getFileListFromRDF(String rdffilepath)
			throws JDOMException, IOException {
		Document rdf = new SAXBuilder().build(rdffilepath);

		XPath archivePaths = XPath.newInstance("//MPIWG:archive-path");
		archivePaths.addNamespace("MPIWG",
				"http://www.mpiwg-berlin.mpg.de/ns/mpiwg");

		List<Element> hits = archivePaths.selectNodes(rdf.getRootElement());
		ArrayList<String> result = new ArrayList<String>();
		for (int i = 0; i < hits.size(); i++) {
			result.add(hits.get(i).getText());
		}
		return result;
	}

	/**
	 * Harvests the whole tree below docDir, then optimizes and closes the
	 * indexes and prints the elapsed time.
	 *
	 * An IOException during harvesting is reported on standard output and
	 * ends the run; in that case the indexes are not optimized or closed.
	 *
	 * @throws InterruptedException if interrupted while waiting for the
	 *                              worker threads to finish
	 */
	public void harvestFolder() throws InterruptedException {
		Date start = new Date();
		try {
			System.out.println("Indexing to directory '"
					+ this.index_dir + "'...");
			indexDocs(this.docDir);

			// Workers started by processFile() may still be running here; the
			// indexes must not be optimized or closed underneath them.
			// NOTE(review): relies on ProcessFileThread being a
			// java.lang.Thread (it is launched with start()) - confirm.
			for (int i = 0; i < this.mythreads.length; ++i) {
				ProcessFileThread worker = this.mythreads[i];
				if (worker != null) {
					worker.join();
				}
			}

			System.out.println("Optimizing...");
			this.languageAnalyzers.optimize();
			this.languageAnalyzers.close();

			Date end = new Date();
			System.out.println(end.getTime() - start.getTime()
					+ " total milliseconds");
		} catch (IOException e) {
			System.out.println(" caught a " + e.getClass()
					+ "\n with message: " + e.getMessage());
		}
	}

	/**
	 * Harvests each listed path. Every entry is appended directly (without an
	 * extra separator) to the absolute path of docDir.
	 *
	 * @param files paths to harvest, relative to docDir
	 */
	private void indexDocs(ArrayList<String> files) throws IOException,
			InterruptedException {
		for (int i = 0; i < files.size(); i++) {
			File target = new File(this.docDir.getAbsolutePath()
					+ files.get(i));
			indexDocs(target);

			// debug runs stop after a handful of files
			if (DEBUG && this.filecount > MAXFILES) {
				break;
			}
		}
	}

	/**
	 * Recursively harvests a folder.
	 *
	 * Sub-directories are harvested first, each into its own composed
	 * document. The text files directly inside the folder are appended to the
	 * folder's composed document, which is then handed to a worker thread.
	 * If the composed document already exists (and CREATE_NEW is false) it is
	 * not rebuilt but still handed to a worker. A freshly created composed
	 * document that received no content is deleted again.
	 *
	 * Unreadable paths are skipped silently. Paths that cannot be listed and
	 * folders named in excludeFolders are reported as "not adding" and
	 * skipped before any composed document is created for them.
	 *
	 * @param file folder to harvest
	 */
	void indexDocs(File file) throws IOException, InterruptedException {
		if (!file.canRead()) {
			return;
		}
		if (DEBUG && this.filecount > MAXFILES) {
			return;
		}

		String[] files = file.list();

		// Decide about skipping BEFORE touching the composed file; otherwise
		// an unterminated "<document>" stub is left behind for every excluded
		// folder and non-directory path.
		if (files == null || excludeFolders.contains(file.getName())) {
			System.out.println("not adding " + file);
			return;
		}

		boolean createNew = CREATE_NEW || !checkFileExists(file);
		// an already existing composed file is assumed to have content
		boolean fileStillEmpty = createNew;
		if (createNew) {
			clearFile(file); // start the composed file from scratch
		}

		for (int i = 0; i < files.length; ++i) {
			File nextFile = new File(file, files[i]);

			if (nextFile.isDirectory()) {
				// descend into the next level
				indexDocs(nextFile);
			} else if (createNew && isTextFile(nextFile)) {
				fileStillEmpty = false; // the composed file now has content
				compose(file, nextFile); // append this file to the composed file
			}

			if (DEBUG && this.filecount > MAXFILES) {
				break;
			}
		}

		if (createNew) {
			if (fileStillEmpty) {
				deleteComposedFile(file); // nothing was added, so remove it
			} else {
				finishFile(file);
			}
		}

		if (!fileStillEmpty) {
			processCompleteFile(file);
		}
	}

	/**
	 * Terminates the composed document of a folder by appending the closing
	 * document tag. I/O errors are printed and otherwise ignored.
	 *
	 * @param folder folder whose composed document is finished
	 */
	private void finishFile(File folder) {
		File cf = getComposedFile(folder);
		System.out.println();
		FileWriter fw = null;
		try {
			System.out.println("finish file:" + cf.getCanonicalPath());
			fw = new FileWriter(cf, true); // append mode

			fw.write("</document>");
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// close even when write() failed, so the file handle is not leaked
			if (fw != null) {
				try {
					fw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}
	
	/**
	 * Removes the composed document of a folder that received no content.
	 * The message is printed before the deletion is attempted.
	 *
	 * @param folder folder whose composed document is removed
	 * @return the result of File.delete() on the composed document
	 */
	private boolean deleteComposedFile(File folder) {
		File composed = getComposedFile(folder);
		try {
			String where = composed.getCanonicalPath();
			System.out.println("file deleted, because empty:" + where);
		} catch (IOException e) {
			// only the log line is affected; the deletion still happens
			e.printStackTrace();
		}
		boolean removed = composed.delete();
		return removed;
	}


	/**
	 * Prints and increments the completed-file counter, then hands the
	 * composed document of the folder to processFile(). Exceptions thrown by
	 * processFile() are printed and otherwise ignored.
	 *
	 * @param folder folder whose composed document is processed
	 */
	private void processCompleteFile(File folder) {
		int sequence = completedFiles++;
		System.out.println("Completed File:" + sequence);

		File composed = getComposedFile(folder);
		try {
			processFile(composed);
		} catch (IOException e) {
			// also covers CorruptIndexException and LockObtainFailedException,
			// which are IOException subclasses
			e.printStackTrace();
		}

	}

	/**
	 * Tells whether the composed document for a folder is already on disk.
	 *
	 * @param folder folder to check
	 * @return true if the composed document exists
	 */
	private boolean checkFileExists(File folder) {
		return getComposedFile(folder).exists();
	}

	/**
	 * Recreates the composed document of a folder: deletes any existing file,
	 * creates missing parent directories and writes the opening document tag.
	 * I/O errors are printed and otherwise ignored.
	 *
	 * @param folder folder whose composed document is reset
	 */
	private void clearFile(File folder) {
		File cf = getComposedFile(folder);
		cf.delete();
		FileWriter fw = null;
		try {
			File dir = cf.getParentFile();
			if (!dir.exists()) {
				dir.mkdirs();
			}

			cf.createNewFile();

			fw = new FileWriter(cf);
			fw.write("<document>");
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// close even when write() failed, so the file handle is not leaked
			if (fw != null) {
				try {
					fw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

	/**
	 * Appends the filtered content of one file, followed by a pb marker
	 * element carrying the file name, to the composed document of a folder.
	 *
	 * If the file cannot be filtered only the marker is written. I/O errors
	 * are printed and otherwise ignored.
	 *
	 * @param folder folder whose composed document is extended
	 * @param file   file whose content is appended
	 */
	private void compose(File folder, File file) {
		File cf = getComposedFile(folder);
		FileOutputStream stream = null;
		OutputStreamWriter fw = null;
		try {
			System.out.println("Adding" + file.getCanonicalPath());

			String filteredDocument = "";
			try {
				filteredDocument = getFilteredFile(file);
			} catch (TransformerException e) {
				// best effort: a file that cannot be serialized contributes
				// no text, only its pb marker
				filteredDocument = "";
			} catch (ParserConfigurationException e) {
				e.printStackTrace();
			} catch (SAXException e) {
				e.printStackTrace();
			}

			// open the target only after filtering, so no handle is held
			// while the source file is being parsed
			stream = new FileOutputStream(cf, true);
			fw = new OutputStreamWriter(stream, "utf-8");

			fw.append(filteredDocument);
			// NOTE(review): the file name is written unescaped into the
			// attribute value; names containing '"', '&' or '<' would make
			// the composed document malformed.
			fw.write("<pb name=\"");
			fw.write(file.getName());
			fw.write("\"/>");
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// close on every path; closing the writer also closes the stream
			try {
				if (fw != null) {
					fw.close();
				} else if (stream != null) {
					stream.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}

	}

	/** Captures everything between the first body start tag and the last body end tag. */
	private static final Pattern BODY_PATTERN = Pattern.compile(
			"<body>(.*)</body>", Pattern.DOTALL);

	/**
	 * Parses an XML file, serializes it again with an identity transformation
	 * and returns what lies between the body tags of the serialized text.
	 *
	 * The file is parsed namespace-aware and non-validating, with MyResolver
	 * installed as entity resolver. Only a literal, attribute-free
	 * &lt;body&gt; start tag is recognised.
	 *
	 * @param file XML file to read
	 * @return the text between the body tags, or "" if there is no such pair
	 * @throws IOException                  if the file cannot be read
	 * @throws TransformerException         if serialization fails
	 * @throws ParserConfigurationException if no parser can be created
	 * @throws SAXException                 if the file is not well-formed
	 */
	private String getFilteredFile(File file) throws IOException,
			TransformerException, ParserConfigurationException, SAXException {

		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
		dbf.setNamespaceAware(true);
		dbf.setValidating(false);
		DocumentBuilder db = dbf.newDocumentBuilder();
		db.setEntityResolver(new MyResolver());
		org.w3c.dom.Document doc = db.parse(file);

		// Serialize once. The previous second transformation into a DOM
		// fragment produced a result that was never used.
		Transformer t = TransformerFactory.newInstance().newTransformer();
		t.setOutputProperty(OutputKeys.ENCODING, "utf-8");
		StringWriter sw = new StringWriter();
		t.transform(new DOMSource(doc), new StreamResult(sw));

		Matcher m = BODY_PATTERN.matcher(sw.toString());
		return m.find() ? m.group(1) : "";
	}

	/**
	 * Maps a source folder to the location of its composed document:
	 * PREFIX, followed by the folder's canonical path, "/" and COMPOSEDFN.
	 *
	 * @param folder source folder
	 * @return the composed document location, or null if the canonical path
	 *         cannot be determined
	 */
	private File getComposedFile(File folder) {
		String canonical;
		try {
			canonical = folder.getCanonicalPath();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
		return new File(PREFIX + canonical + "/" + COMPOSEDFN);
	}

	/**
	 * Starts a worker thread for one composed document, waiting until a
	 * worker slot is free. The language mapping is loaded on first use.
	 *
	 * @param file composed document to process
	 * @throws IOException if the calling thread is interrupted while waiting
	 *                     for a free slot (as java.io.InterruptedIOException;
	 *                     the interrupt flag is restored)
	 */
	protected void processFile(File file) throws CorruptIndexException,
			LockObtainFailedException, IOException {
		final long pollMillis = 50L;

		int freeThread = waitForFreeThread();
		while (freeThread == -1) {
			// Pause between polls instead of spinning at full CPU while all
			// slots are busy.
			try {
				Thread.sleep(pollMillis);
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				java.io.InterruptedIOException interrupted =
						new java.io.InterruptedIOException(
								"interrupted while waiting for a free worker slot for "
										+ file);
				interrupted.initCause(e);
				throw interrupted;
			}
			freeThread = waitForFreeThread();
		}

		if (this.textLanguage == null) {
			this.textLanguage = loadLanguages();
		}
		this.mythreads[freeThread] = new ProcessFileThread(
				this.languageAnalyzers, file, this.languageFileName,
				this.textLanguage, this.mdProviderUrl, this.preferedLanguage,
				this.languageToISO, this.supportedLanguageFolder);
		this.mythreads[freeThread].start();
		System.out.println("New process started:" + freeThread);
	}

	/**
	 * Scans the worker slots once for one that can be reused. Despite its
	 * name this method does not block.
	 *
	 * A slot is reusable when it is empty or when its thread reports done;
	 * in the latter case filecount is incremented and printed.
	 *
	 * @return index of the first reusable slot, or -1 if all are busy
	 */
	protected int waitForFreeThread() {
		for (int slot = 0; slot < maxThread; slot++) {
			ProcessFileThread worker = this.mythreads[slot];
			if (worker == null) {
				return slot;
			}
			if (worker.done) {
				this.filecount++;
				System.out.println("filecount:" + this.filecount);
				return slot;
			}
		}
		return -1;
	}

	/**
	 * Decides by file extension whether a file is composed. The extension is
	 * the last dot-separated part of the file name, compared case-sensitively
	 * against fileTypesToIndex; a name without a dot has the empty extension.
	 *
	 * @param file file to test
	 * @return true if the extension is listed in fileTypesToIndex
	 */
	private boolean isTextFile(File file) {
		String[] parts = file.getName().split("[.]");
		String extension = parts.length > 1 ? parts[parts.length - 1] : "";
		return fileTypesToIndex.contains(extension);
	}

}

/*
 * Location: /private/tmp/fulltextIndexer.jar Qualified Name:
 * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version:
 * 0.5.4
 */