view src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 4:f986e74583eb

removed triplestorehandler componentes
author dwinter
date Tue, 13 Dec 2011 17:46:51 +0100
parents b8333fab0d95
children cdc4c12262b1
line wrap: on
line source

/*
 * This class imports values from triples into a Lucene index to allow faster searching.
 */
package de.mpiwg.itgroup.nimanager.luceneIndices;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import de.mpiwg.itgroup.triplestoremanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.triplestoremanager.owl.TripleStoreHandler;

/**
 * Copies RDF statements from a triple store into a Lucene index.
 *
 * <p>For every configured predicate the matching statements of one context are
 * read through the {@link TripleStoreHandler}. Each statement becomes one
 * Lucene document with two fields:
 * <ul>
 * <li>{@code identifier}: the subject of the statement (stored, not analyzed),</li>
 * <li>a field named after the predicate URI: the object of the statement
 * (stored, analyzed).</li>
 * </ul>
 *
 * <p>Typical use: construct, call {@link #initializeIndexer(Boolean)}, call one
 * of the {@code writeStatementsToIndex} methods, then {@link #close()}.
 */
public class Importer {

	private Logger logger = Logger.getRootLogger();
	private final String context;
	private final String[] pred;
	private final TripleStoreHandler th;
	private final String indexPath;
	// Null until initializeIndexer() has been called and again after close().
	private IndexWriter writer;
	// Number of statements written by the most recent writeStatementsToIndex call.
	private int counter=0;

	/**
	 * @param context     context that is passed to the triple store when querying
	 * @param indexFields predicate URIs whose statements are indexed
	 * @param th          handler used to query the triple store
	 * @param indexPath   file system directory of the Lucene index
	 */
	public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){
		this.context = context;
		this.pred = indexFields;
		this.th = th;
		this.indexPath = indexPath;
	}

	/**
	 * Indexes all statements of the context that use one of the configured
	 * predicates, regardless of their subject.
	 *
	 * @return number of statements written to the index
	 * @throws RepositoryException   if querying the triple store fails
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException           if writing to the index fails
	 * @throws IllegalStateException if a statement has to be written before
	 *                               {@link #initializeIndexer(Boolean)} was called
	 */
	public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{
		counter = 0;
		for (int i = 0; i < pred.length; i++) {
			indexAndClose(th.getStatements(null, pred[i], null, context));
		}
		return counter;
	}

	/**
	 * Indexes the statements of the context that have the given subject and
	 * use one of the configured predicates.
	 *
	 * @param subj subject that is passed to the triple store as filter
	 * @return number of statements written to the index
	 * @throws RepositoryException   if querying the triple store fails
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException           if writing to the index fails
	 * @throws IllegalStateException if a statement has to be written before
	 *                               {@link #initializeIndexer(Boolean)} was called
	 */
	public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{
		counter = 0;
		for (int i = 0; i < pred.length; i++) {
			indexAndClose(th.getStatements(subj, pred[i], null, context));
		}
		return counter;
	}

	/**
	 * Writes every statement of the result to the index, increments the
	 * counter per statement and always closes the result afterwards so that
	 * the underlying repository resources are released.
	 */
	private void indexAndClose(RepositoryResult<Statement> statements) throws RepositoryException, CorruptIndexException, IOException {
		boolean finished = false;
		try {
			while (statements.hasNext()) {
				writeStatementToIndex(statements.next());
				counter++;
			}
			finished = true;
		} finally {
			if (finished) {
				statements.close();
			} else {
				// An exception is already propagating; do not let a failure
				// while closing hide it.
				try {
					statements.close();
				} catch (RepositoryException e) {
					logger.warn("Could not close statement result", e);
				}
			}
		}
	}

	/**
	 * Opens the index writer on the configured index path.
	 *
	 * @param create {@code true} to create a new index, removing any previously
	 *               indexed documents; {@code false} or {@code null} to add to
	 *               an existing index (creating it if it does not exist yet)
	 * @throws IOException if the index directory cannot be opened
	 */
	public void initializeIndexer(Boolean create) throws IOException{
		Directory dir = FSDirectory.open(new File(indexPath));
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);

		// Boolean.TRUE.equals avoids a NullPointerException on unboxing; a
		// missing flag falls back to the non-destructive mode.
		if (Boolean.TRUE.equals(create)) {
			// Create a new index in the directory, removing any
			// previously indexed documents:
			iwc.setOpenMode(OpenMode.CREATE);
		} else {
			// Add new documents to an existing index:
			iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
		}

		writer = new IndexWriter(dir, iwc);
	}

	/**
	 * Adds one document for the given statement to the index.
	 */
	private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
		if (writer == null) {
			throw new IllegalStateException("Index writer is not open; call initializeIndexer() first");
		}

		String subject = statement.getSubject().stringValue();
		String object = statement.getObject().stringValue();

		// Only build the message when it is actually going to be logged.
		if (logger.isDebugEnabled()) {
			logger.debug("Adding:"+String.format("%s :%s", subject, object));
		}

		Document doc = new Document();

		Field pathField = new Field("identifier", subject, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
		pathField.setOmitTermFreqAndPositions(true);
		doc.add(pathField);

		Field modifiedField = new Field(statement.getPredicate().stringValue(), object, Field.Store.YES, Field.Index.ANALYZED);
		doc.add(modifiedField);

		writer.addDocument(doc);
	}

	/**
	 * Closes the index writer. Does nothing if the writer was never opened or
	 * has already been closed through this method.
	 *
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException           if closing the writer fails
	 */
	public void close() throws CorruptIndexException, IOException{
		if (writer == null) {
			return;
		}
		writer.close();
		// Only forget the writer after a successful close so that a caller can
		// still roll it back if close() failed.
		writer = null;
	}

	/**
	 * Command line entry point; runs {@link #importFromMPIWG_persons(String[])}.
	 */
	static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{
		importFromMPIWG_persons(args);
	}

	/**
	 * Rebuilds the index in /tmp/tripleIndex from the FOAF name statements of
	 * the context file://mpiwg_persons.rdf.
	 *
	 * @param args {@code args[0]} is the triple store user, {@code args[1]} the
	 *             password; the process exits with status 1 if fewer than two
	 *             arguments are given
	 */
	static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
		Logger.getRootLogger().setLevel(Level.DEBUG);
		BasicConfigurator.configure();
		if (args.length < 2) {
			// NOTE(review): the message mentions offset and limit, but only
			// user and pw are read below.
			System.out.println("usage: import user pw offset limit ");
			System.exit(1);
		}
		TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111",args[0], args[1]);

		String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"};

		Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex");
		int written = rebuildIndex(im);
		System.out.println("Number of Statements:"+String.valueOf(written));
	}

	/**
	 * Rebuilds the index in /tmp/tripleIndex from the label and alternative
	 * name statements of the context file://personendataWikipedia.
	 *
	 * @param args not used
	 */
	static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
		Logger.getRootLogger().setLevel(Level.DEBUG);
		BasicConfigurator.configure();
		// NOTE(review): credentials are hard-coded here; they should be
		// supplied by the caller as in importFromMPIWG_persons.
		TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba");

		String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"};

		Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex");
		int written = rebuildIndex(im);
		System.out.println("Number of Statemens:"+String.valueOf(written));
	}

	/**
	 * Creates a fresh index, writes all statements, optimizes and closes the
	 * writer. If any step fails the writer is rolled back, which discards the
	 * uncommitted changes and releases the writer, instead of being left open.
	 *
	 * @return number of statements written to the index
	 */
	private static int rebuildIndex(Importer im) throws RepositoryException, IOException {
		im.initializeIndexer(true);
		boolean committed = false;
		try {
			int written = im.writeStatementsToIndex();
			im.writer.optimize();
			im.close();
			committed = true;
			return written;
		} finally {
			if (!committed && im.writer != null) {
				try {
					im.writer.rollback();
				} catch (IOException e) {
					// Keep the original failure as the one that propagates.
					im.logger.warn("Could not roll back index writer", e);
				}
			}
		}
	}

}