diff src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 0:1384a0d382fa

first input
author dwinter
date Thu, 30 Jun 2011 11:44:24 +0200
parents
children b8333fab0d95
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java	Thu Jun 30 11:44:24 2011 +0200
@@ -0,0 +1,146 @@
+/*
+ * Imports the values of triples into a Lucene index to allow faster searching.
+ */
+package de.mpiwg.itgroup.nimanager.luceneIndices;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.log4j.BasicConfigurator;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.openrdf.model.Statement;
+import org.openrdf.model.Value;
+import org.openrdf.repository.RepositoryException;
+import org.openrdf.repository.RepositoryResult;
+
+import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
+import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
+
+/**
+ * Copies statements from a triple store into a Lucene index.
+ *
+ * For each configured predicate the statements of one context are fetched
+ * through the {@link TripleStoreHandler}, and every statement becomes one
+ * Lucene document with two fields: a stored, non-analyzed "identifier" field
+ * holding the subject, and a stored, analyzed field named after the predicate
+ * URI holding the object value.
+ *
+ * Typical use: construct, call {@link #initializeIndexer(Boolean)}, call one
+ * of the writeStatementsToIndex methods, then {@link #close()}.
+ */
+public class Importer {
+
+	private Logger logger = Logger.getRootLogger();
+	/** Context (named graph) the statements are read from. */
+	private String context;
+	/** Predicate URIs whose statements are indexed. */
+	private String[] pred;
+	private TripleStoreHandler th;
+	/** File system path of the Lucene index directory. */
+	private String indexPath;
+	/** Open index writer, or null before initializeIndexer / after close. */
+	private IndexWriter writer;
+	/** Number of statements written by the most recent import run. */
+	private int counter=0;
+
+	/**
+	 * @param context     context the statements are read from
+	 * @param indexFields predicate URIs to index; each one also becomes the
+	 *                    name of a Lucene field
+	 * @param th          handler used to query the triple store
+	 * @param indexPath   directory of the Lucene index
+	 */
+	public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){
+		this.context= context;
+		this.pred= indexFields;
+		this.th=th;
+		this.indexPath = indexPath;
+	}
+
+	/**
+	 * Indexes the statements of all configured predicates without restricting
+	 * the subject (a null subject is handed to the triple store handler).
+	 *
+	 * @return number of statements written
+	 * @throws IllegalStateException if initializeIndexer has not been called
+	 */
+	public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{
+		return importStatements(null);
+	}
+
+	/**
+	 * Indexes the statements of all configured predicates for one subject.
+	 *
+	 * @param subj subject handed to the triple store handler
+	 * @return number of statements written
+	 * @throws IllegalStateException if initializeIndexer has not been called
+	 */
+	public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{
+		return importStatements(subj);
+	}
+
+	/**
+	 * Shared implementation of both writeStatementsToIndex variants: resets the
+	 * counter, then writes every matching statement of every predicate.
+	 */
+	private int importStatements(String subj) throws RepositoryException, CorruptIndexException, IOException{
+		counter=0;
+		for (int i=0;i<pred.length;i++) {
+			RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context);
+			try {
+				while (statements.hasNext()){
+					writeStatementToIndex(statements.next());
+					counter++;
+				}
+			} finally {
+				// Always release the result, even when indexing fails half way,
+				// so the underlying repository resources are not leaked.
+				statements.close();
+			}
+		}
+		return counter;
+	}
+
+	/**
+	 * Opens the index writer on the configured index path. A writer that is
+	 * still open from an earlier call is closed first so that its write lock
+	 * is released.
+	 *
+	 * @param create true replaces any existing index (OpenMode.CREATE); false
+	 *               or null appends to an existing index or creates a new one
+	 *               (OpenMode.CREATE_OR_APPEND)
+	 */
+	public void initializeIndexer(Boolean create) throws IOException{
+		close();
+
+		Directory dir = FSDirectory.open(new File(indexPath));
+		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
+		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
+
+		// Boolean.TRUE.equals avoids the NullPointerException that unboxing a
+		// null Boolean would cause.
+		if (Boolean.TRUE.equals(create)) {
+			// Create a new index in the directory, removing any
+			// previously indexed documents:
+			iwc.setOpenMode(OpenMode.CREATE);
+		} else {
+			// Add new documents to an existing index:
+			iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
+		}
+
+		writer = new IndexWriter(dir, iwc);
+	}
+
+	/**
+	 * Writes one statement as one Lucene document.
+	 *
+	 * @throws IllegalStateException if the index writer has not been opened
+	 */
+	private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
+		if (writer == null) {
+			throw new IllegalStateException("initializeIndexer must be called before statements can be written");
+		}
+
+		String subject = statement.getSubject().stringValue();
+		String object = statement.getObject().stringValue();
+		if (logger.isDebugEnabled()) {
+			// Only build the message when it is actually going to be logged.
+			logger.debug("Adding:"+String.format("%s :%s",subject,object));
+		}
+
+		Document doc = new Document();
+
+		// The subject is stored verbatim (not analyzed) as the identifier.
+		Field pathField = new Field("identifier",subject, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+		pathField.setOmitTermFreqAndPositions(true);
+		doc.add(pathField);
+
+		// The object value is analyzed under a field named after the predicate.
+		Field modifiedField = new Field(statement.getPredicate().stringValue(),object,Field.Store.YES, Field.Index.ANALYZED);
+		doc.add(modifiedField);
+
+		writer.addDocument(doc);
+	}
+
+	/**
+	 * Closes the index writer. Does nothing when no writer is open, so it is
+	 * safe to call more than once.
+	 */
+	public void close() throws CorruptIndexException, IOException{
+		if (writer != null) {
+			writer.close();
+			// Only forget the writer after a successful close, so a failed
+			// close can be retried by calling close() again.
+			writer = null;
+		}
+	}
+
+	/** Command line entry point; runs the MPIWG persons import. */
+	static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{
+		importFromMPIWG_persons(args);
+	}
+
+	/**
+	 * Rebuilds the index under /tmp/tripleIndex from the FOAF name predicates
+	 * of the "file://mpiwg_persons.rdf" context and prints the statement count.
+	 *
+	 * @param args unused
+	 */
+	static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
+		Logger.getRootLogger().setLevel(Level.DEBUG);
+		BasicConfigurator.configure();
+		// NOTE(review): connection credentials are hard-coded in source; they
+		// should be moved to external configuration.
+		TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "wa55er");
+
+		String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"};
+
+		Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex");
+		int written = rebuildIndex(im);
+		System.out.println("Number of Statements:"+String.valueOf(written));
+	}
+
+	/**
+	 * Rebuilds the index under /tmp/tripleIndex from the rdfs:label and
+	 * dbpedia alternativeNames predicates of the "file://personendataWikipedia"
+	 * context and prints the statement count.
+	 *
+	 * @param args unused
+	 */
+	static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
+		Logger.getRootLogger().setLevel(Level.DEBUG);
+		BasicConfigurator.configure();
+		// NOTE(review): connection credentials are hard-coded in source; they
+		// should be moved to external configuration.
+		TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba");
+
+		String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"};
+
+		Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex");
+		int written = rebuildIndex(im);
+		System.out.println("Number of Statemens:"+String.valueOf(written));
+	}
+
+	/**
+	 * Creates a fresh index, writes all statements, optimizes the index and
+	 * closes the writer. The writer is closed even when the import fails, so
+	 * the index write lock is not left behind.
+	 *
+	 * @return number of statements written
+	 */
+	private static int rebuildIndex(Importer im) throws IOException, RepositoryException {
+		im.initializeIndexer(true);
+		try {
+			im.writeStatementsToIndex();
+			im.writer.optimize();
+		} finally {
+			im.close();
+		}
+		return im.counter;
+	}
+
+}