Mercurial > hg > NamedIdentityManager
view src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 0:1384a0d382fa
first input
author | dwinter |
---|---|
date | Thu, 30 Jun 2011 11:44:24 +0200 |
parents | |
children | b8333fab0d95 |
line wrap: on
line source
/* * Klasse importiert Werte von Triple in einen Lucene Index fŸr die schnellere Suche. */ package de.mpiwg.itgroup.nimanager.luceneIndices; import java.io.File; import java.io.IOException; import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.openrdf.model.Statement; import org.openrdf.model.Value; import org.openrdf.repository.RepositoryException; import org.openrdf.repository.RepositoryResult; import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException; import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler; public class Importer { private Logger logger = Logger.getRootLogger(); private String context; private String[] pred; private TripleStoreHandler th; private String indexPath; private IndexWriter writer; private int counter=0; public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){ this.context= context; this.pred= indexFields; this.th=th; this.indexPath = indexPath; } public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{ counter=0; for (int i=0;i<pred.length;i++) { RepositoryResult<Statement> statements = th.getStatements(null, pred[i], null,context); while (statements.hasNext()){ Statement statement = statements.next(); writeStatementToIndex(statement); counter++; } } return counter; } public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{ counter=0; for (int i=0;i<pred.length;i++) { RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context); while (statements.hasNext()){ Statement statement = statements.next(); writeStatementToIndex(statement); counter++; } } return counter; } public void initializeIndexer(Boolean create) throws IOException{ Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } writer = new IndexWriter(dir, iwc); } private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException { Document doc = new Document(); logger.debug("Adding:"+String.format("%s :%s",statement.getSubject().stringValue(),statement.getObject().stringValue())); Field pathField = new Field("identifier",statement.getSubject().stringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setOmitTermFreqAndPositions(true); doc.add(pathField); Field modifiedField = new Field(statement.getPredicate().stringValue(),statement.getObject().stringValue(),Field.Store.YES, Field.Index.ANALYZED); doc.add(modifiedField); writer.addDocument(doc); } public void close() throws CorruptIndexException, IOException{ writer.close(); } static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{ importFromMPIWG_persons(args); } static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{ Logger.getRootLogger().setLevel(Level.DEBUG); BasicConfigurator.configure(); TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "wa55er"); String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"}; Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex"); im.initializeIndexer(true); im.writeStatementsToIndex(); im.writer.optimize(); im.writer.close(); System.out.println("Number of Statements:"+String.valueOf(im.counter)); } static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{ Logger.getRootLogger().setLevel(Level.DEBUG); BasicConfigurator.configure(); TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba"); String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"}; Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex"); im.initializeIndexer(true); im.writeStatementsToIndex(); im.writer.optimize(); im.writer.close(); System.out.println("Number of Statemens:"+String.valueOf(im.counter)); } }