Mercurial > hg > NamedIdentityManager
diff src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 0:1384a0d382fa
first input
author | dwinter |
---|---|
date | Thu, 30 Jun 2011 11:44:24 +0200 |
parents | |
children | b8333fab0d95 |
line wrap: on
line diff
/*
 * Imports the values of triples into a Lucene index to allow faster searching.
 */
package de.mpiwg.itgroup.nimanager.luceneIndices;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;

/**
 * Copies statements from a triple store into a Lucene index.
 *
 * <p>For every configured predicate the matching statements of one context are
 * fetched through the {@link TripleStoreHandler}; each statement becomes one
 * Lucene document holding the subject in the field {@code identifier} and the
 * object in a field named after the predicate.
 *
 * <p>Typical use: construct, call {@link #initializeIndexer(Boolean)}, call one
 * of the {@code writeStatementsToIndex} methods, then {@link #close()}.
 */
public class Importer {

    private final Logger logger = Logger.getRootLogger();
    private final String context;
    private final String[] pred;
    private final TripleStoreHandler th;
    private final String indexPath;
    private IndexWriter writer;
    private int counter = 0;

    /**
     * @param context     context of the triple store the statements are read from
     * @param indexFields predicates whose statements are written to the index
     * @param th          handler used to query the triple store
     * @param indexPath   file system path of the Lucene index directory
     */
    public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath) {
        this.context = context;
        this.pred = indexFields;
        this.th = th;
        this.indexPath = indexPath;
    }

    /**
     * Indexes all statements of the context that use one of the configured predicates.
     *
     * @return number of statements written during this call
     * @throws RepositoryException   if reading from the triple store fails
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if writing to the index fails
     */
    public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException {
        counter = 0;
        for (String predicate : pred) {
            indexAndClose(th.getStatements(null, predicate, null, context));
        }
        return counter;
    }

    /**
     * Indexes the statements of the given subject that use one of the configured predicates.
     *
     * @param subj subject the statements are restricted to
     * @return number of statements written during this call
     * @throws RepositoryException   if reading from the triple store fails
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if writing to the index fails
     */
    public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException {
        counter = 0;
        for (String predicate : pred) {
            indexAndClose(th.getStatements(subj, predicate, null, context));
        }
        return counter;
    }

    /**
     * Writes every statement of the result to the index, counting each one,
     * and closes the result afterwards, also when indexing fails.
     */
    private void indexAndClose(RepositoryResult<Statement> statements)
            throws RepositoryException, CorruptIndexException, IOException {
        try {
            while (statements.hasNext()) {
                writeStatementToIndex(statements.next());
                counter++;
            }
        } finally {
            statements.close();
        }
    }

    /**
     * Opens the index writer on the configured index path.
     *
     * @param create {@code true} to create a new index, replacing previously indexed
     *               documents; {@code false} or {@code null} to append to an existing
     *               index (creating it if necessary)
     * @throws IOException if the index directory cannot be opened
     */
    public void initializeIndexer(Boolean create) throws IOException {
        // Release a writer left over from an earlier call so it is not leaked.
        close();

        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);

        // Boolean.TRUE.equals avoids the NullPointerException that unboxing a null argument would raise.
        if (Boolean.TRUE.equals(create)) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        writer = new IndexWriter(dir, iwc);
    }

    /**
     * Adds one document for the statement: the subject is stored unanalyzed in the
     * field {@code identifier}, the object is stored analyzed in a field named after
     * the predicate.
     *
     * @throws IllegalStateException if {@link #initializeIndexer(Boolean)} has not been called
     */
    private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
        if (writer == null) {
            throw new IllegalStateException("initializeIndexer must be called before statements are written");
        }

        String subject = statement.getSubject().stringValue();
        String object = statement.getObject().stringValue();
        if (logger.isDebugEnabled()) {
            logger.debug("Adding:" + String.format("%s :%s", subject, object));
        }

        Document doc = new Document();

        Field identifierField = new Field("identifier", subject, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        identifierField.setOmitTermFreqAndPositions(true);
        doc.add(identifierField);

        Field valueField = new Field(statement.getPredicate().stringValue(), object, Field.Store.YES,
                Field.Index.ANALYZED);
        doc.add(valueField);

        writer.addDocument(doc);
    }

    /**
     * Closes the index writer. Does nothing if no writer is open.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if closing the writer fails
     */
    public void close() throws CorruptIndexException, IOException {
        if (writer != null) {
            writer.close();
            writer = null;
        }
    }

    /** Command line entry point; runs {@link #importFromMPIWG_persons(String[])}. */
    static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException {
        importFromMPIWG_persons(args);
    }

    /**
     * Rebuilds the index at {@code /tmp/tripleIndex} from the FOAF name statements
     * of the context {@code file://mpiwg_persons.rdf}.
     *
     * @param args not used
     */
    static public void importFromMPIWG_persons(String args[])
            throws IOException, RepositoryException, TripleStoreHandlerException {
        Logger.getRootLogger().setLevel(Level.DEBUG);
        BasicConfigurator.configure();
        // NOTE(review): credentials are hard-coded in source; they should come from configuration.
        TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba",
                "wa55er");

        String[] indexFields = new String[] {
                "http://xmlns.com/foaf/0.1/name",
                "http://xmlns.com/foaf/0.1/lastName",
                "http://xmlns.com/foaf/0.1/firstName" };

        int count = rebuildIndex(th, "file://mpiwg_persons.rdf", indexFields, "/tmp/tripleIndex");
        System.out.println("Number of Statements:" + count);
    }

    /**
     * Rebuilds the index at {@code /tmp/tripleIndex} from the label and alternative
     * name statements of the context {@code file://personendataWikipedia}.
     *
     * @param args not used
     */
    static public void importAlternativesFromWikipedia(String args[])
            throws IOException, RepositoryException, TripleStoreHandlerException {
        Logger.getRootLogger().setLevel(Level.DEBUG);
        BasicConfigurator.configure();
        // NOTE(review): credentials are hard-coded in source; they should come from configuration.
        TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba",
                "dba");

        String[] indexFields = new String[] {
                "http://www.w3.org/2000/01/rdf-schema#label",
                "http://dbpedia.org/property/alternativeNames" };

        int count = rebuildIndex(th, "file://personendataWikipedia", indexFields, "/tmp/tripleIndex");
        System.out.println("Number of Statemens:" + count);
    }

    /**
     * Creates a fresh index at indexPath, writes all matching statements of the
     * context to it and optimizes it; the writer is closed even when a step fails.
     *
     * @return number of statements written
     */
    private static int rebuildIndex(TripleStoreHandler th, String context, String[] indexFields, String indexPath)
            throws IOException, RepositoryException {
        Importer im = new Importer(context, indexFields, th, indexPath);
        try {
            im.initializeIndexer(true);
            im.writeStatementsToIndex();
            im.writer.optimize();
        } finally {
            im.close();
        }
        return im.counter;
    }

}