0
|
1 /*
|
|
2 * Class that imports values from triples into a Lucene index for faster searching.
|
|
3 */
|
|
4 package de.mpiwg.itgroup.nimanager.luceneIndices;
|
|
5
|
|
6 import java.io.File;
|
|
7 import java.io.IOException;
|
|
8
|
|
9 import org.apache.log4j.BasicConfigurator;
|
|
10 import org.apache.log4j.Level;
|
|
11 import org.apache.log4j.Logger;
|
|
12 import org.apache.lucene.analysis.Analyzer;
|
|
13 import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
14 import org.apache.lucene.document.Document;
|
|
15 import org.apache.lucene.document.Field;
|
|
16 import org.apache.lucene.document.FieldSelectorResult;
|
|
17 import org.apache.lucene.index.CorruptIndexException;
|
|
18 import org.apache.lucene.index.IndexWriter;
|
|
19 import org.apache.lucene.index.IndexWriterConfig;
|
|
20 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
|
21 import org.apache.lucene.store.Directory;
|
|
22 import org.apache.lucene.store.FSDirectory;
|
|
23 import org.apache.lucene.util.Version;
|
|
24 import org.openrdf.model.Statement;
|
|
25 import org.openrdf.model.Value;
|
|
26 import org.openrdf.repository.RepositoryException;
|
|
27 import org.openrdf.repository.RepositoryResult;
|
|
28
|
|
29 import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
|
|
30 import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
|
|
31
|
|
32 public class Importer {
|
|
33
|
|
34 private Logger logger = Logger.getRootLogger();
|
|
35 private String context;
|
|
36 private String[] pred;
|
|
37 private TripleStoreHandler th;
|
|
38 private String indexPath;
|
|
39 private IndexWriter writer;
|
|
40 private int counter=0;
|
|
41
|
|
42 public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){
|
|
43 this.context= context;
|
|
44 this.pred= indexFields;
|
|
45 this.th=th;
|
|
46 this.indexPath = indexPath;
|
|
47
|
|
48
|
|
49 }
|
|
50
|
|
51 public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{
|
|
52 counter=0;
|
|
53 for (int i=0;i<pred.length;i++) {
|
|
54 RepositoryResult<Statement> statements = th.getStatements(null, pred[i], null,context);
|
|
55 while (statements.hasNext()){
|
|
56 Statement statement = statements.next();
|
|
57 writeStatementToIndex(statement);
|
|
58 counter++;
|
|
59 }
|
|
60
|
|
61 }
|
|
62 return counter;
|
|
63 }
|
|
64
|
|
65 public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{
|
|
66 counter=0;
|
|
67 for (int i=0;i<pred.length;i++) {
|
|
68 RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context);
|
|
69 while (statements.hasNext()){
|
|
70 Statement statement = statements.next();
|
|
71 writeStatementToIndex(statement);
|
|
72 counter++;
|
|
73 }
|
|
74
|
|
75 }
|
|
76 return counter;
|
|
77 }
|
|
78
|
|
79 public void initializeIndexer(Boolean create) throws IOException{
|
|
80 Directory dir = FSDirectory.open(new File(indexPath));
|
|
81 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
|
|
82 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
|
|
83
|
|
84 if (create) {
|
|
85 // Create a new index in the directory, removing any
|
|
86 // previously indexed documents:
|
|
87 iwc.setOpenMode(OpenMode.CREATE);
|
|
88 } else {
|
|
89 // Add new documents to an existing index:
|
|
90 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
|
|
91 }
|
|
92
|
|
93 writer = new IndexWriter(dir, iwc);
|
|
94 }
|
|
95 private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
|
|
96 Document doc = new Document();
|
|
97 logger.debug("Adding:"+String.format("%s :%s",statement.getSubject().stringValue(),statement.getObject().stringValue()));
|
|
98 Field pathField = new Field("identifier",statement.getSubject().stringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
|
|
99 pathField.setOmitTermFreqAndPositions(true);
|
|
100 doc.add(pathField);
|
|
101
|
|
102 Field modifiedField = new Field(statement.getPredicate().stringValue(),statement.getObject().stringValue(),Field.Store.YES, Field.Index.ANALYZED);
|
|
103 doc.add(modifiedField);
|
|
104
|
|
105
|
|
106 writer.addDocument(doc);
|
|
107
|
|
108 }
|
|
109
|
|
110 public void close() throws CorruptIndexException, IOException{
|
|
111 writer.close();
|
|
112 }
|
|
113 static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{
|
|
114 importFromMPIWG_persons(args);
|
|
115 }
|
|
116 static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
|
|
117 Logger.getRootLogger().setLevel(Level.DEBUG);
|
|
118 BasicConfigurator.configure();
|
1
|
119 if (args.length < 2) {
|
|
120 System.out.println("usage: import user pw offset limit ");
|
|
121 System.exit(1);
|
|
122 }
|
|
123 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111",args[0], args[1]);
|
0
|
124
|
|
125 String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"};
|
|
126
|
|
127 Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex");
|
|
128 im.initializeIndexer(true);
|
|
129 im.writeStatementsToIndex();
|
|
130 im.writer.optimize();
|
|
131 im.writer.close();
|
|
132 System.out.println("Number of Statements:"+String.valueOf(im.counter));
|
|
133 }
|
|
134
|
|
135 static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
|
|
136 Logger.getRootLogger().setLevel(Level.DEBUG);
|
|
137 BasicConfigurator.configure();
|
|
138 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba");
|
|
139
|
|
140 String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"};
|
|
141
|
|
142 Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex");
|
|
143 im.initializeIndexer(true);
|
|
144 im.writeStatementsToIndex();
|
|
145 im.writer.optimize();
|
|
146 im.writer.close();
|
|
147 System.out.println("Number of Statemens:"+String.valueOf(im.counter));
|
|
148 }
|
|
149
|
|
150 }
|