comparison src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 0:1384a0d382fa

first input
author dwinter
date Thu, 30 Jun 2011 11:44:24 +0200
parents
children b8333fab0d95
comparison
equal deleted inserted replaced
-1:000000000000 0:1384a0d382fa
1 /*
2 * Klasse importiert Werte von Triple in einen Lucene Index für die schnellere Suche.
3 */
4 package de.mpiwg.itgroup.nimanager.luceneIndices;
5
6 import java.io.File;
7 import java.io.IOException;
8
9 import org.apache.log4j.BasicConfigurator;
10 import org.apache.log4j.Level;
11 import org.apache.log4j.Logger;
12 import org.apache.lucene.analysis.Analyzer;
13 import org.apache.lucene.analysis.standard.StandardAnalyzer;
14 import org.apache.lucene.document.Document;
15 import org.apache.lucene.document.Field;
16 import org.apache.lucene.document.FieldSelectorResult;
17 import org.apache.lucene.index.CorruptIndexException;
18 import org.apache.lucene.index.IndexWriter;
19 import org.apache.lucene.index.IndexWriterConfig;
20 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
21 import org.apache.lucene.store.Directory;
22 import org.apache.lucene.store.FSDirectory;
23 import org.apache.lucene.util.Version;
24 import org.openrdf.model.Statement;
25 import org.openrdf.model.Value;
26 import org.openrdf.repository.RepositoryException;
27 import org.openrdf.repository.RepositoryResult;
28
29 import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
30 import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
31
32 public class Importer {
33
34 private Logger logger = Logger.getRootLogger();
35 private String context;
36 private String[] pred;
37 private TripleStoreHandler th;
38 private String indexPath;
39 private IndexWriter writer;
40 private int counter=0;
41
42 public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){
43 this.context= context;
44 this.pred= indexFields;
45 this.th=th;
46 this.indexPath = indexPath;
47
48
49 }
50
51 public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{
52 counter=0;
53 for (int i=0;i<pred.length;i++) {
54 RepositoryResult<Statement> statements = th.getStatements(null, pred[i], null,context);
55 while (statements.hasNext()){
56 Statement statement = statements.next();
57 writeStatementToIndex(statement);
58 counter++;
59 }
60
61 }
62 return counter;
63 }
64
65 public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{
66 counter=0;
67 for (int i=0;i<pred.length;i++) {
68 RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context);
69 while (statements.hasNext()){
70 Statement statement = statements.next();
71 writeStatementToIndex(statement);
72 counter++;
73 }
74
75 }
76 return counter;
77 }
78
79 public void initializeIndexer(Boolean create) throws IOException{
80 Directory dir = FSDirectory.open(new File(indexPath));
81 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
82 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
83
84 if (create) {
85 // Create a new index in the directory, removing any
86 // previously indexed documents:
87 iwc.setOpenMode(OpenMode.CREATE);
88 } else {
89 // Add new documents to an existing index:
90 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
91 }
92
93 writer = new IndexWriter(dir, iwc);
94 }
95 private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
96 Document doc = new Document();
97 logger.debug("Adding:"+String.format("%s :%s",statement.getSubject().stringValue(),statement.getObject().stringValue()));
98 Field pathField = new Field("identifier",statement.getSubject().stringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
99 pathField.setOmitTermFreqAndPositions(true);
100 doc.add(pathField);
101
102 Field modifiedField = new Field(statement.getPredicate().stringValue(),statement.getObject().stringValue(),Field.Store.YES, Field.Index.ANALYZED);
103 doc.add(modifiedField);
104
105
106 writer.addDocument(doc);
107
108 }
109
110 public void close() throws CorruptIndexException, IOException{
111 writer.close();
112 }
113 static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{
114 importFromMPIWG_persons(args);
115 }
116 static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
117 Logger.getRootLogger().setLevel(Level.DEBUG);
118 BasicConfigurator.configure();
119 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "wa55er");
120
121 String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"};
122
123 Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex");
124 im.initializeIndexer(true);
125 im.writeStatementsToIndex();
126 im.writer.optimize();
127 im.writer.close();
128 System.out.println("Number of Statements:"+String.valueOf(im.counter));
129 }
130
131 static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
132 Logger.getRootLogger().setLevel(Level.DEBUG);
133 BasicConfigurator.configure();
134 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba");
135
136 String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"};
137
138 Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex");
139 im.initializeIndexer(true);
140 im.writeStatementsToIndex();
141 im.writer.optimize();
142 im.writer.close();
143 System.out.println("Number of Statemens:"+String.valueOf(im.counter));
144 }
145
146 }