Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/DocHarvesterThreaded.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/DocHarvesterThreaded.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,577 @@ +package de.mpiwg.dwinter.fulltextIndexer.harvester; + +/* Harveste jeweils ein komplettes Buch in einen Eintrag + * + * */ +import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; + +import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; + +import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; + +import java.io.BufferedReader; + +import java.io.File; + +import java.io.FileNotFoundException; + +import java.io.BufferedInputStream; +import java.io.BufferedWriter; +import java.io.ByteArrayOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.StringWriter; + +import java.io.IOException; + +import java.io.PrintStream; + +import java.util.ArrayList; + +import java.util.Arrays; + +import java.util.Date; + +import java.util.HashMap; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMResult; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.analysis.de.GermanAnalyzer; + +import org.apache.lucene.analysis.fr.FrenchAnalyzer; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import org.apache.lucene.index.CorruptIndexException; + +import org.apache.lucene.store.LockObtainFailedException; + +import org.apache.lucene.util.Version; +import org.apache.ws.commons.serialize.XMLWriterImpl; + +import org.jdom.Document; + +import org.jdom.Element; + +import org.jdom.JDOMException; + +import org.jdom.input.SAXBuilder; +import org.jdom.xpath.XPath; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.SAXException; + +import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl; + +public class DocHarvesterThreaded { + private static final boolean DEBUG = false; + private static final int MAXFILES = 3; + + //private static final String PREFIX = "/tmp/composed/files"; + private static final String PREFIX = "/Volumes/data/composed/files"; + private static final String COMPOSEDFN = "doc.xml"; + private static final boolean CREATE_NEW = false; + + protected static ArrayList<String> fileTypesToIndex = new ArrayList( + Arrays.asList(new String[] { "xml" })); + + protected static ArrayList<String> excludeFolders = new ArrayList( + Arrays.asList(new String[] { "OCR" })); + protected static boolean indexMetaPriority = false; + + private static String specialMode = ""; + protected static int maxThread = 30; + protected File docDir; + protected File index_dir; + protected HashMap<String, String> textLanguage = null; + protected HashMap<String, String> languageToISO = new HashMap(); + protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); + + private int counter = 0; + protected String languageFileName; + protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread]; + private int filecount = 0; + + protected String mdProviderUrl = null; + private String preferedLanguage; + protected HashMap<String, String> supportedLanguageFolder = new HashMap(); + private int completedFiles = 0; // counter for all files completed and indexed + + public DocHarvesterThreaded() { + } + + public DocHarvesterThreaded(File docDir, File index_dir, + String languageFileName, String mdProviderUrl, String lang) + throws CorruptIndexException, LockObtainFailedException, + IOException { + /* 119 */this.docDir = docDir; + /* 120 */this.languageFileName = languageFileName; + /* 121 */this.preferedLanguage = lang; + + /* 133 */this.mdProviderUrl = mdProviderUrl; + + /* 135 */this.index_dir = index_dir; + + /* 137 */for (int i = 0; i < maxThread; ++i) { + /* 139 */this.mythreads[i] = null; + } + + /* 142 */init_languages(); + } + + private void init_languages() { + /* 146 */this.languageToISO.put("German", "de"); + /* 147 */this.languageToISO.put("French", "fr"); + /* 148 */this.languageToISO.put("English", "en"); + /* 149 */this.languageToISO.put("German-f", "de-f"); + + /* 151 */this.supportedLanguageFolder.put("deu", "de"); + /* 152 */this.supportedLanguageFolder.put("deu-f", "de"); + /* 153 */this.supportedLanguageFolder.put("fra", "fr"); + /* 154 */this.supportedLanguageFolder.put("eng", "en"); + /* 155 */this.supportedLanguageFolder.put("lic", "la"); + try { + /* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de", + new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); + /* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en", + new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); + /* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr", + new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); + /* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la", + new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); + + /* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all", + new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); + /* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph", + new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); + } catch (CorruptIndexException e) { + /* 167 */e.printStackTrace(); + /* 168 */System.exit(1); + } catch (LockObtainFailedException e) { + /* 170 */e.printStackTrace(); + /* 171 */System.exit(1); + } catch (IOException e) { + /* 173 */e.printStackTrace(); + /* 174 */System.exit(1); + } + } + + public DocHarvesterThreaded(File docDir, File index_dir, + String mdProviderUrl) throws CorruptIndexException, + LockObtainFailedException, IOException { + /* 180 */this(docDir, index_dir, null, mdProviderUrl, null); + } + + protected HashMap<String, String> loadLanguages() { + /* 187 */File languageFile = new File(this.docDir + "/" + + this.languageFileName); + /* 188 */String languageFilePath = this.docDir + "/" + + this.languageFileName; + /* 189 */HashMap languages = new HashMap(); + /* 190 */boolean relativ = true; + /* 191 */if (this.languageFileName == null) + /* 192 */return null; + /* 193 */if (!languageFile.exists()) { + /* 195 */languageFile = new File(this.languageFileName); + /* 196 */languageFilePath = this.languageFileName; + /* 197 */relativ = false; + /* 198 */if (!languageFile.exists()) + /* 199 */return null; + } + BufferedReader in; + try { + /* 203 */in = new BufferedReader(new FileReader(languageFilePath)); + } catch (FileNotFoundException e) { + /* 205 */return null; + } + + /* 208 */String zeile = null; + try { + /* 210 */while ((zeile = in.readLine()) != null) { + /* 211 */String[] splitted = zeile.replace("\"", "").split( + "[,]"); + /* 212 */if (splitted.length == 2) + /* 213 */if (relativ) + /* 214 */languages.put(this.docDir + "/" + splitted[0], + splitted[1]); + else + /* 216 */languages.put(splitted[0], splitted[1]); + } + } catch (IOException e) { + /* 220 */e.printStackTrace(); + /* 221 */return null; + } + + /* 224 */return languages; + } + + public void harvestFromRDF(String rdffilepath) throws InterruptedException, + JDOMException { + /* 228 */Date start = new Date(); + /* 229 */boolean create = true; + try { + /* 240 */System.out.println("Indexing to directory '" + + this.index_dir + "'..."); + /* 241 */ArrayList<String> files = getFileListFromRDF(rdffilepath); + /* 242 */indexDocs(files); + /* 243 */System.out.println("Optimizing..."); + /* 244 */this.languageAnalyzers.optimize(); + /* 245 */this.languageAnalyzers.close(); + + /* 247 */Date end = new Date(); + /* 248 */System.out.println(end.getTime() - start.getTime() + + " total milliseconds"); + } catch (IOException e) { + /* 251 */System.out.println(" caught a " + e.getClass() + + /* 252 */"\n with message: " + e.getMessage()); + } + } + + private ArrayList<String> getFileListFromRDF(String rdffilepath) + throws JDOMException, IOException { + /* 260 */ArrayList ret = new ArrayList(); + /* 261 */SAXBuilder builder = new SAXBuilder(); + + /* 263 */Document doc = builder.build(rdffilepath); + + /* 265 */Element el = doc.getRootElement(); + + /* 267 */XPath xpath = XPath.newInstance("//MPIWG:archive-path"); + /* 268 */xpath.addNamespace("MPIWG", + "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"); + /* 269 */List<Element> paths = xpath.selectNodes(el); + /* 270 */for (Element path : paths) { + /* 271 */ret.add(path.getText()); + } + + /* 274 */return ret; + } + + public void harvestFolder() throws InterruptedException { + /* 278 */Date start = new Date(); + /* 279 */boolean create = true; + try { + /* 290 */System.out.println("Indexing to directory '" + + this.index_dir + "'..."); + /* 291 */indexDocs(this.docDir); + /* 292 */System.out.println("Optimizing..."); + /* 293 */this.languageAnalyzers.optimize(); + /* 294 */this.languageAnalyzers.close(); + + /* 296 */Date end = new Date(); + /* 297 */System.out.println(end.getTime() - start.getTime() + + " total milliseconds"); + } catch (IOException e) { + /* 300 */System.out.println(" caught a " + e.getClass() + + /* 301 */"\n with message: " + e.getMessage()); + } + } + + private void indexDocs(ArrayList<String> files) throws IOException, + InterruptedException { + /* 308 */for (String filename : files) { + /* 310 */indexDocs(new File(this.docDir.getAbsolutePath() + + filename)); + if ((DEBUG == true) & (this.filecount > MAXFILES)) + break; + } + } + + void indexDocs(File file) throws IOException, InterruptedException { + /* 317 */if (!file.canRead()) + return; + /* 319 */ + /* 321 */if ((DEBUG == true) && (this.filecount > MAXFILES)) + return; + /* 325 */String[] files = file.list(); + + /* 327 */String folderName = file.getName(); + + boolean notExists = !checkFileExists(file); + boolean createNew = CREATE_NEW || notExists; + // boolean createNew = true; + + boolean fileStillEmpty = true; + if (createNew) { + clearFile(file); // loesche das gesamtfile + } else { + fileStillEmpty = false; // assume that file is not empty, if it already exists + } + + + if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 + : 1)) != 0) { + for (int i = 0; i < files.length; ++i) { + File nextFile = new File(file, files[i]); + + if (nextFile.isDirectory()) // directory dann gehe in die + // naechste ebene + indexDocs(nextFile); + + else if (isTextFile(nextFile)) { + + if (createNew) { + fileStillEmpty = false; //datei hat jetzt einen Inhalt + compose(file, nextFile); // fuege das file an das + // gesamtfilean + } + + } + if ((DEBUG == true) && (this.filecount > MAXFILES)) + break; + } + if (createNew) { + if (fileStillEmpty){ + deleteComposedFile(file); // file hat keinen inhalt dann loeschen + } else { + finishFile(file); + } + } + + if (!fileStillEmpty) + processCompleteFile(file); + /* 335 */} else { + /* 342 */System.out.println("not adding " + file); + } + } + + private void finishFile(File folder) { + File cf = getComposedFile(folder); + System.out.println(); + try { + System.out.println("finish file:" + cf.getCanonicalPath()); + FileWriter fw = new FileWriter(cf, true); + + fw.write("</document>"); + fw.close(); + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + private boolean deleteComposedFile(File folder) { + File cf = getComposedFile(folder); + try { + System.out.println("file deleted, because empty:" + cf.getCanonicalPath()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return cf.delete(); + } + + + private void processCompleteFile(File folder) { + System.out.println("Completed File:"+String.valueOf(completedFiles++)); + File cf = getComposedFile(folder); + try { + processFile(cf); + } catch (CorruptIndexException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (LockObtainFailedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + private boolean checkFileExists(File folder) { + File cf = getComposedFile(folder); + return cf.exists(); + + } + + private void clearFile(File folder) { + File cf = getComposedFile(folder); + cf.delete(); + try { + File dir = cf.getParentFile(); + if (false == dir.exists()) { + dir.mkdirs(); + } + + cf.createNewFile(); + + FileWriter fw = new FileWriter(cf); + fw.write("<document>"); + fw.close(); + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + private void compose(File folder, File file) { + File cf = getComposedFile(folder); + try { + System.out.println("Adding" + file.getCanonicalPath()); + //FileWriter fw = new FileWriter(cf, true); + + FileOutputStream stream = new FileOutputStream(cf,true); + + OutputStreamWriter fw = new OutputStreamWriter(stream, "utf-8"); + + String filteredDocument=""; + try { + filteredDocument = getFilteredFile(file); + } catch (TransformerException e) { + filteredDocument = ""; + }catch (ParserConfigurationException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (SAXException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + fw.append(filteredDocument); + fw.write("<pb name=\""); + fw.write(file.getName()); + fw.write("\"/>"); + fw.close(); + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + private String getFilteredFile(File file) throws IOException, + TransformerException, ParserConfigurationException, SAXException { + + // String txt = IOUtils.toString(new FileInputStream(file)); + // get rid of the entities + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer t = tf.newTransformer(); + + + //OutputStream output = new ByteArrayOutputStream(); + + //BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8")); + + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + dbf.setNamespaceAware(true); + dbf.setValidating(false); + DocumentBuilder db = dbf.newDocumentBuilder(); + db.setEntityResolver(new MyResolver()); + org.w3c.dom.Document doc = db.parse(file); + + StringWriter sw = new StringWriter(); + StreamResult sr = new StreamResult(sw); + + org.w3c.dom.Document tgtDoc = db.newDocument(); + DocumentFragment fragment = tgtDoc.createDocumentFragment(); + DOMResult tgtDom = new DOMResult( fragment ); + + t.setOutputProperty(OutputKeys.ENCODING, "utf-8"); + t.transform(new DOMSource(doc), sr); + t.transform(new DOMSource(doc), tgtDom); + + String txt = sw.toString(); + + + + Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL); + Matcher m = p.matcher(txt); + if (m.find()) + if (m.groupCount() > 0) { + return m.group(1); + } + return ""; + } + + private File getComposedFile(File folder) { + try { + String path = folder.getCanonicalPath(); + String newPath = PREFIX + path + "/" + COMPOSEDFN; + return new File(newPath); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return null; + } + + protected void processFile(File file) throws CorruptIndexException, + LockObtainFailedException, IOException { + /* 348 */int freeThread = -1; + /* 349 */while (freeThread == -1) { + /* 351 */freeThread = waitForFreeThread(); + } + + /* 355 */if (this.textLanguage == null) + /* 356 */this.textLanguage = loadLanguages(); + /* 357 */this.mythreads[freeThread] = new ProcessFileThread( + this.languageAnalyzers, file, this.languageFileName, + this.textLanguage, this.mdProviderUrl, this.preferedLanguage, + this.languageToISO, this.supportedLanguageFolder); + /* 358 */this.mythreads[freeThread].start(); + /* 359 */System.out.println("New process started:" + freeThread); + } + + protected int waitForFreeThread() { + /* 367 */for (int i = 0; i < maxThread; ++i) { + /* 369 */if (this.mythreads[i] == null) + /* 370 */return i; + /* 371 */if (!this.mythreads[i].done) + continue; + /* 373 */this.filecount += 1; + /* 374 */System.out.println("filecount:" + this.filecount); + /* 375 */return i; + } + + /* 378 */return -1; + } + + private boolean isTextFile(File file) { + /* 392 */String fn = file.getName(); + + /* 394 */String[] splitted = fn.split("[.]"); + + /* 396 */String ext = ""; + + /* 398 */if (splitted.length > 1) { + /* 400 */ext = splitted[(splitted.length - 1)]; + } + boolean ret = fileTypesToIndex.contains(ext); + /* 403 */return ret; + } + +} + +/* + * Location: /private/tmp/fulltextIndexer.jar Qualified Name: + * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version: + * 0.5.4 + */ \ No newline at end of file