Mercurial > hg > fulltextIndexer
view src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/DocHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.dwinter.fulltextIndexer.harvester; /* Harveste jeweils ein komplettes Buch in einen Eintrag * * */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.BufferedInputStream; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.StringWriter; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMResult; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.apache.ws.commons.serialize.XMLWriterImpl; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.xpath.XPath; import org.w3c.dom.DocumentFragment; import org.xml.sax.SAXException; import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl; public class DocHarvesterThreaded { private static final boolean DEBUG = false; private static final int MAXFILES = 3; //private static final String PREFIX = "/tmp/composed/files"; private static final String PREFIX = "/Volumes/data/composed/files"; private static final String COMPOSEDFN = "doc.xml"; private static final boolean CREATE_NEW = false; protected static ArrayList<String> fileTypesToIndex = new ArrayList( Arrays.asList(new String[] { "xml" })); protected static ArrayList<String> excludeFolders = new ArrayList( Arrays.asList(new String[] { "OCR" })); protected static boolean indexMetaPriority = false; private static String specialMode = ""; protected static int maxThread = 30; protected File docDir; protected File index_dir; protected HashMap<String, String> textLanguage = null; protected HashMap<String, String> languageToISO = new HashMap(); protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); private int counter = 0; protected String languageFileName; protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread]; private int filecount = 0; protected String mdProviderUrl = null; private String preferedLanguage; protected HashMap<String, String> supportedLanguageFolder = new HashMap(); private int completedFiles = 0; // counter for all files completed and indexed public DocHarvesterThreaded() { } public DocHarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang) throws CorruptIndexException, LockObtainFailedException, IOException { /* 119 */this.docDir = docDir; /* 120 */this.languageFileName = languageFileName; /* 121 */this.preferedLanguage = lang; /* 133 */this.mdProviderUrl = mdProviderUrl; /* 135 */this.index_dir = index_dir; /* 137 */for (int i = 0; i < maxThread; ++i) { /* 139 */this.mythreads[i] = null; } /* 142 */init_languages(); } private void init_languages() { /* 146 */this.languageToISO.put("German", "de"); /* 147 */this.languageToISO.put("French", "fr"); /* 148 */this.languageToISO.put("English", "en"); /* 149 */this.languageToISO.put("German-f", "de-f"); /* 151 */this.supportedLanguageFolder.put("deu", "de"); /* 152 */this.supportedLanguageFolder.put("deu-f", "de"); /* 153 */this.supportedLanguageFolder.put("fra", "fr"); /* 154 */this.supportedLanguageFolder.put("eng", "en"); /* 155 */this.supportedLanguageFolder.put("lic", "la"); try { /* 158 */this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); /* 160 */this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); /* 161 */this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); /* 162 */this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); /* 164 */this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); /* 165 */this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); } catch (CorruptIndexException e) { /* 167 */e.printStackTrace(); /* 168 */System.exit(1); } catch (LockObtainFailedException e) { /* 170 */e.printStackTrace(); /* 171 */System.exit(1); } catch (IOException e) { /* 173 */e.printStackTrace(); /* 174 */System.exit(1); } } public DocHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException { /* 180 */this(docDir, index_dir, null, mdProviderUrl, null); } protected HashMap<String, String> loadLanguages() { /* 187 */File languageFile = new File(this.docDir + "/" + this.languageFileName); /* 188 */String languageFilePath = this.docDir + "/" + this.languageFileName; /* 189 */HashMap languages = new HashMap(); /* 190 */boolean relativ = true; /* 191 */if (this.languageFileName == null) /* 192 */return null; /* 193 */if (!languageFile.exists()) { /* 195 */languageFile = new File(this.languageFileName); /* 196 */languageFilePath = this.languageFileName; /* 197 */relativ = false; /* 198 */if (!languageFile.exists()) /* 199 */return null; } BufferedReader in; try { /* 203 */in = new BufferedReader(new FileReader(languageFilePath)); } catch (FileNotFoundException e) { /* 205 */return null; } /* 208 */String zeile = null; try { /* 210 */while ((zeile = in.readLine()) != null) { /* 211 */String[] splitted = zeile.replace("\"", "").split( "[,]"); /* 212 */if (splitted.length == 2) /* 213 */if (relativ) /* 214 */languages.put(this.docDir + "/" + splitted[0], splitted[1]); else /* 216 */languages.put(splitted[0], splitted[1]); } } catch (IOException e) { /* 220 */e.printStackTrace(); /* 221 */return null; } /* 224 */return languages; } public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException { /* 228 */Date start = new Date(); /* 229 */boolean create = true; try { /* 240 */System.out.println("Indexing to directory '" + this.index_dir + "'..."); /* 241 */ArrayList<String> files = getFileListFromRDF(rdffilepath); /* 242 */indexDocs(files); /* 243 */System.out.println("Optimizing..."); /* 244 */this.languageAnalyzers.optimize(); /* 245 */this.languageAnalyzers.close(); /* 247 */Date end = new Date(); /* 248 */System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { /* 251 */System.out.println(" caught a " + e.getClass() + /* 252 */"\n with message: " + e.getMessage()); } } private ArrayList<String> getFileListFromRDF(String rdffilepath) throws JDOMException, IOException { /* 260 */ArrayList ret = new ArrayList(); /* 261 */SAXBuilder builder = new SAXBuilder(); /* 263 */Document doc = builder.build(rdffilepath); /* 265 */Element el = doc.getRootElement(); /* 267 */XPath xpath = XPath.newInstance("//MPIWG:archive-path"); /* 268 */xpath.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"); /* 269 */List<Element> paths = xpath.selectNodes(el); /* 270 */for (Element path : paths) { /* 271 */ret.add(path.getText()); } /* 274 */return ret; } public void harvestFolder() throws InterruptedException { /* 278 */Date start = new Date(); /* 279 */boolean create = true; try { /* 290 */System.out.println("Indexing to directory '" + this.index_dir + "'..."); /* 291 */indexDocs(this.docDir); /* 292 */System.out.println("Optimizing..."); /* 293 */this.languageAnalyzers.optimize(); /* 294 */this.languageAnalyzers.close(); /* 296 */Date end = new Date(); /* 297 */System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { /* 300 */System.out.println(" caught a " + e.getClass() + /* 301 */"\n with message: " + e.getMessage()); } } private void indexDocs(ArrayList<String> files) throws IOException, InterruptedException { /* 308 */for (String filename : files) { /* 310 */indexDocs(new File(this.docDir.getAbsolutePath() + filename)); if ((DEBUG == true) & (this.filecount > MAXFILES)) break; } } void indexDocs(File file) throws IOException, InterruptedException { /* 317 */if (!file.canRead()) return; /* 319 */ /* 321 */if ((DEBUG == true) && (this.filecount > MAXFILES)) return; /* 325 */String[] files = file.list(); /* 327 */String folderName = file.getName(); boolean notExists = !checkFileExists(file); boolean createNew = CREATE_NEW || notExists; // boolean createNew = true; boolean fileStillEmpty = true; if (createNew) { clearFile(file); // loesche das gesamtfile } else { fileStillEmpty = false; // assume that file is not empty, if it already exists } if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 : 1)) != 0) { for (int i = 0; i < files.length; ++i) { File nextFile = new File(file, files[i]); if (nextFile.isDirectory()) // directory dann gehe in die // naechste ebene indexDocs(nextFile); else if (isTextFile(nextFile)) { if (createNew) { fileStillEmpty = false; //datei hat jetzt einen Inhalt compose(file, nextFile); // fuege das file an das // gesamtfilean } } if ((DEBUG == true) && (this.filecount > MAXFILES)) break; } if (createNew) { if (fileStillEmpty){ deleteComposedFile(file); // file hat keinen inhalt dann loeschen } else { finishFile(file); } } if (!fileStillEmpty) processCompleteFile(file); /* 335 */} else { /* 342 */System.out.println("not adding " + file); } } private void finishFile(File folder) { File cf = getComposedFile(folder); System.out.println(); try { System.out.println("finish file:" + cf.getCanonicalPath()); FileWriter fw = new FileWriter(cf, true); fw.write("</document>"); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private boolean deleteComposedFile(File folder) { File cf = getComposedFile(folder); try { System.out.println("file deleted, because empty:" + cf.getCanonicalPath()); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return cf.delete(); } private void processCompleteFile(File folder) { System.out.println("Completed File:"+String.valueOf(completedFiles++)); File cf = getComposedFile(folder); try { processFile(cf); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private boolean checkFileExists(File folder) { File cf = getComposedFile(folder); return cf.exists(); } private void clearFile(File folder) { File cf = getComposedFile(folder); cf.delete(); try { File dir = cf.getParentFile(); if (false == dir.exists()) { dir.mkdirs(); } cf.createNewFile(); FileWriter fw = new FileWriter(cf); fw.write("<document>"); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private void compose(File folder, File file) { File cf = getComposedFile(folder); try { System.out.println("Adding" + file.getCanonicalPath()); //FileWriter fw = new FileWriter(cf, true); FileOutputStream stream = new FileOutputStream(cf,true); OutputStreamWriter fw = new OutputStreamWriter(stream, "utf-8"); String filteredDocument=""; try { filteredDocument = getFilteredFile(file); } catch (TransformerException e) { filteredDocument = ""; }catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } fw.append(filteredDocument); fw.write("<pb name=\""); fw.write(file.getName()); fw.write("\"/>"); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private String getFilteredFile(File file) throws IOException, TransformerException, ParserConfigurationException, SAXException { // String txt = IOUtils.toString(new FileInputStream(file)); // get rid of the entities TransformerFactory tf = TransformerFactory.newInstance(); Transformer t = tf.newTransformer(); //OutputStream output = new ByteArrayOutputStream(); //BufferedWriter sw = new BufferedWriter(new OutputStreamWriter(output, "utf-8")); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); dbf.setValidating(false); DocumentBuilder db = dbf.newDocumentBuilder(); db.setEntityResolver(new MyResolver()); org.w3c.dom.Document doc = db.parse(file); StringWriter sw = new StringWriter(); StreamResult sr = new StreamResult(sw); org.w3c.dom.Document tgtDoc = db.newDocument(); DocumentFragment fragment = tgtDoc.createDocumentFragment(); DOMResult tgtDom = new DOMResult( fragment ); t.setOutputProperty(OutputKeys.ENCODING, "utf-8"); t.transform(new DOMSource(doc), sr); t.transform(new DOMSource(doc), tgtDom); String txt = sw.toString(); Pattern p = Pattern.compile("<body>(.*)</body>", Pattern.DOTALL); Matcher m = p.matcher(txt); if (m.find()) if (m.groupCount() > 0) { return m.group(1); } return ""; } private File getComposedFile(File folder) { try { String path = folder.getCanonicalPath(); String newPath = PREFIX + path + "/" + COMPOSEDFN; return new File(newPath); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException { /* 348 */int freeThread = -1; /* 349 */while (freeThread == -1) { /* 351 */freeThread = waitForFreeThread(); } /* 355 */if (this.textLanguage == null) /* 356 */this.textLanguage = loadLanguages(); /* 357 */this.mythreads[freeThread] = new ProcessFileThread( this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder); /* 358 */this.mythreads[freeThread].start(); /* 359 */System.out.println("New process started:" + freeThread); } protected int waitForFreeThread() { /* 367 */for (int i = 0; i < maxThread; ++i) { /* 369 */if (this.mythreads[i] == null) /* 370 */return i; /* 371 */if (!this.mythreads[i].done) continue; /* 373 */this.filecount += 1; /* 374 */System.out.println("filecount:" + this.filecount); /* 375 */return i; } /* 378 */return -1; } private boolean isTextFile(File file) { /* 392 */String fn = file.getName(); /* 394 */String[] splitted = fn.split("[.]"); /* 396 */String ext = ""; /* 398 */if (splitted.length > 1) { /* 400 */ext = splitted[(splitted.length - 1)]; } boolean ret = fileTypesToIndex.contains(ext); /* 403 */return ret; } } /* * Location: /private/tmp/fulltextIndexer.jar Qualified Name: * de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded JD-Core Version: * 0.5.4 */