annotate src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 1:b8333fab0d95

minor bugs
author dwinter
date Thu, 30 Jun 2011 12:43:35 +0200
parents 1384a0d382fa
children f986e74583eb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1384a0d382fa first input
dwinter
parents:
diff changeset
1 /*
1384a0d382fa first input
dwinter
parents:
diff changeset
2 * Klasse importiert Werte von Triple in einen Lucene Index für die schnellere Suche.
1384a0d382fa first input
dwinter
parents:
diff changeset
3 */
1384a0d382fa first input
dwinter
parents:
diff changeset
4 package de.mpiwg.itgroup.nimanager.luceneIndices;
1384a0d382fa first input
dwinter
parents:
diff changeset
5
1384a0d382fa first input
dwinter
parents:
diff changeset
6 import java.io.File;
1384a0d382fa first input
dwinter
parents:
diff changeset
7 import java.io.IOException;
1384a0d382fa first input
dwinter
parents:
diff changeset
8
1384a0d382fa first input
dwinter
parents:
diff changeset
9 import org.apache.log4j.BasicConfigurator;
1384a0d382fa first input
dwinter
parents:
diff changeset
10 import org.apache.log4j.Level;
1384a0d382fa first input
dwinter
parents:
diff changeset
11 import org.apache.log4j.Logger;
1384a0d382fa first input
dwinter
parents:
diff changeset
12 import org.apache.lucene.analysis.Analyzer;
1384a0d382fa first input
dwinter
parents:
diff changeset
13 import org.apache.lucene.analysis.standard.StandardAnalyzer;
1384a0d382fa first input
dwinter
parents:
diff changeset
14 import org.apache.lucene.document.Document;
1384a0d382fa first input
dwinter
parents:
diff changeset
15 import org.apache.lucene.document.Field;
1384a0d382fa first input
dwinter
parents:
diff changeset
16 import org.apache.lucene.document.FieldSelectorResult;
1384a0d382fa first input
dwinter
parents:
diff changeset
17 import org.apache.lucene.index.CorruptIndexException;
1384a0d382fa first input
dwinter
parents:
diff changeset
18 import org.apache.lucene.index.IndexWriter;
1384a0d382fa first input
dwinter
parents:
diff changeset
19 import org.apache.lucene.index.IndexWriterConfig;
1384a0d382fa first input
dwinter
parents:
diff changeset
20 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
1384a0d382fa first input
dwinter
parents:
diff changeset
21 import org.apache.lucene.store.Directory;
1384a0d382fa first input
dwinter
parents:
diff changeset
22 import org.apache.lucene.store.FSDirectory;
1384a0d382fa first input
dwinter
parents:
diff changeset
23 import org.apache.lucene.util.Version;
1384a0d382fa first input
dwinter
parents:
diff changeset
24 import org.openrdf.model.Statement;
1384a0d382fa first input
dwinter
parents:
diff changeset
25 import org.openrdf.model.Value;
1384a0d382fa first input
dwinter
parents:
diff changeset
26 import org.openrdf.repository.RepositoryException;
1384a0d382fa first input
dwinter
parents:
diff changeset
27 import org.openrdf.repository.RepositoryResult;
1384a0d382fa first input
dwinter
parents:
diff changeset
28
1384a0d382fa first input
dwinter
parents:
diff changeset
29 import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
1384a0d382fa first input
dwinter
parents:
diff changeset
30 import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
1384a0d382fa first input
dwinter
parents:
diff changeset
31
1384a0d382fa first input
dwinter
parents:
diff changeset
32 public class Importer {
1384a0d382fa first input
dwinter
parents:
diff changeset
33
1384a0d382fa first input
dwinter
parents:
diff changeset
34 private Logger logger = Logger.getRootLogger();
1384a0d382fa first input
dwinter
parents:
diff changeset
35 private String context;
1384a0d382fa first input
dwinter
parents:
diff changeset
36 private String[] pred;
1384a0d382fa first input
dwinter
parents:
diff changeset
37 private TripleStoreHandler th;
1384a0d382fa first input
dwinter
parents:
diff changeset
38 private String indexPath;
1384a0d382fa first input
dwinter
parents:
diff changeset
39 private IndexWriter writer;
1384a0d382fa first input
dwinter
parents:
diff changeset
40 private int counter=0;
1384a0d382fa first input
dwinter
parents:
diff changeset
41
1384a0d382fa first input
dwinter
parents:
diff changeset
42 public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){
1384a0d382fa first input
dwinter
parents:
diff changeset
43 this.context= context;
1384a0d382fa first input
dwinter
parents:
diff changeset
44 this.pred= indexFields;
1384a0d382fa first input
dwinter
parents:
diff changeset
45 this.th=th;
1384a0d382fa first input
dwinter
parents:
diff changeset
46 this.indexPath = indexPath;
1384a0d382fa first input
dwinter
parents:
diff changeset
47
1384a0d382fa first input
dwinter
parents:
diff changeset
48
1384a0d382fa first input
dwinter
parents:
diff changeset
49 }
1384a0d382fa first input
dwinter
parents:
diff changeset
50
1384a0d382fa first input
dwinter
parents:
diff changeset
51 public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{
1384a0d382fa first input
dwinter
parents:
diff changeset
52 counter=0;
1384a0d382fa first input
dwinter
parents:
diff changeset
53 for (int i=0;i<pred.length;i++) {
1384a0d382fa first input
dwinter
parents:
diff changeset
54 RepositoryResult<Statement> statements = th.getStatements(null, pred[i], null,context);
1384a0d382fa first input
dwinter
parents:
diff changeset
55 while (statements.hasNext()){
1384a0d382fa first input
dwinter
parents:
diff changeset
56 Statement statement = statements.next();
1384a0d382fa first input
dwinter
parents:
diff changeset
57 writeStatementToIndex(statement);
1384a0d382fa first input
dwinter
parents:
diff changeset
58 counter++;
1384a0d382fa first input
dwinter
parents:
diff changeset
59 }
1384a0d382fa first input
dwinter
parents:
diff changeset
60
1384a0d382fa first input
dwinter
parents:
diff changeset
61 }
1384a0d382fa first input
dwinter
parents:
diff changeset
62 return counter;
1384a0d382fa first input
dwinter
parents:
diff changeset
63 }
1384a0d382fa first input
dwinter
parents:
diff changeset
64
1384a0d382fa first input
dwinter
parents:
diff changeset
65 public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{
1384a0d382fa first input
dwinter
parents:
diff changeset
66 counter=0;
1384a0d382fa first input
dwinter
parents:
diff changeset
67 for (int i=0;i<pred.length;i++) {
1384a0d382fa first input
dwinter
parents:
diff changeset
68 RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context);
1384a0d382fa first input
dwinter
parents:
diff changeset
69 while (statements.hasNext()){
1384a0d382fa first input
dwinter
parents:
diff changeset
70 Statement statement = statements.next();
1384a0d382fa first input
dwinter
parents:
diff changeset
71 writeStatementToIndex(statement);
1384a0d382fa first input
dwinter
parents:
diff changeset
72 counter++;
1384a0d382fa first input
dwinter
parents:
diff changeset
73 }
1384a0d382fa first input
dwinter
parents:
diff changeset
74
1384a0d382fa first input
dwinter
parents:
diff changeset
75 }
1384a0d382fa first input
dwinter
parents:
diff changeset
76 return counter;
1384a0d382fa first input
dwinter
parents:
diff changeset
77 }
1384a0d382fa first input
dwinter
parents:
diff changeset
78
1384a0d382fa first input
dwinter
parents:
diff changeset
79 public void initializeIndexer(Boolean create) throws IOException{
1384a0d382fa first input
dwinter
parents:
diff changeset
80 Directory dir = FSDirectory.open(new File(indexPath));
1384a0d382fa first input
dwinter
parents:
diff changeset
81 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
1384a0d382fa first input
dwinter
parents:
diff changeset
82 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
1384a0d382fa first input
dwinter
parents:
diff changeset
83
1384a0d382fa first input
dwinter
parents:
diff changeset
84 if (create) {
1384a0d382fa first input
dwinter
parents:
diff changeset
85 // Create a new index in the directory, removing any
1384a0d382fa first input
dwinter
parents:
diff changeset
86 // previously indexed documents:
1384a0d382fa first input
dwinter
parents:
diff changeset
87 iwc.setOpenMode(OpenMode.CREATE);
1384a0d382fa first input
dwinter
parents:
diff changeset
88 } else {
1384a0d382fa first input
dwinter
parents:
diff changeset
89 // Add new documents to an existing index:
1384a0d382fa first input
dwinter
parents:
diff changeset
90 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
1384a0d382fa first input
dwinter
parents:
diff changeset
91 }
1384a0d382fa first input
dwinter
parents:
diff changeset
92
1384a0d382fa first input
dwinter
parents:
diff changeset
93 writer = new IndexWriter(dir, iwc);
1384a0d382fa first input
dwinter
parents:
diff changeset
94 }
1384a0d382fa first input
dwinter
parents:
diff changeset
95 private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException {
1384a0d382fa first input
dwinter
parents:
diff changeset
96 Document doc = new Document();
1384a0d382fa first input
dwinter
parents:
diff changeset
97 logger.debug("Adding:"+String.format("%s :%s",statement.getSubject().stringValue(),statement.getObject().stringValue()));
1384a0d382fa first input
dwinter
parents:
diff changeset
98 Field pathField = new Field("identifier",statement.getSubject().stringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
1384a0d382fa first input
dwinter
parents:
diff changeset
99 pathField.setOmitTermFreqAndPositions(true);
1384a0d382fa first input
dwinter
parents:
diff changeset
100 doc.add(pathField);
1384a0d382fa first input
dwinter
parents:
diff changeset
101
1384a0d382fa first input
dwinter
parents:
diff changeset
102 Field modifiedField = new Field(statement.getPredicate().stringValue(),statement.getObject().stringValue(),Field.Store.YES, Field.Index.ANALYZED);
1384a0d382fa first input
dwinter
parents:
diff changeset
103 doc.add(modifiedField);
1384a0d382fa first input
dwinter
parents:
diff changeset
104
1384a0d382fa first input
dwinter
parents:
diff changeset
105
1384a0d382fa first input
dwinter
parents:
diff changeset
106 writer.addDocument(doc);
1384a0d382fa first input
dwinter
parents:
diff changeset
107
1384a0d382fa first input
dwinter
parents:
diff changeset
108 }
1384a0d382fa first input
dwinter
parents:
diff changeset
109
1384a0d382fa first input
dwinter
parents:
diff changeset
110 public void close() throws CorruptIndexException, IOException{
1384a0d382fa first input
dwinter
parents:
diff changeset
111 writer.close();
1384a0d382fa first input
dwinter
parents:
diff changeset
112 }
1384a0d382fa first input
dwinter
parents:
diff changeset
113 static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{
1384a0d382fa first input
dwinter
parents:
diff changeset
114 importFromMPIWG_persons(args);
1384a0d382fa first input
dwinter
parents:
diff changeset
115 }
1384a0d382fa first input
dwinter
parents:
diff changeset
116 static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
1384a0d382fa first input
dwinter
parents:
diff changeset
117 Logger.getRootLogger().setLevel(Level.DEBUG);
1384a0d382fa first input
dwinter
parents:
diff changeset
118 BasicConfigurator.configure();
1
b8333fab0d95 minor bugs
dwinter
parents: 0
diff changeset
119 if (args.length < 2) {
b8333fab0d95 minor bugs
dwinter
parents: 0
diff changeset
120 System.out.println("usage: import user pw offset limit ");
b8333fab0d95 minor bugs
dwinter
parents: 0
diff changeset
121 System.exit(1);
b8333fab0d95 minor bugs
dwinter
parents: 0
diff changeset
122 }
b8333fab0d95 minor bugs
dwinter
parents: 0
diff changeset
123 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111",args[0], args[1]);
0
1384a0d382fa first input
dwinter
parents:
diff changeset
124
1384a0d382fa first input
dwinter
parents:
diff changeset
125 String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"};
1384a0d382fa first input
dwinter
parents:
diff changeset
126
1384a0d382fa first input
dwinter
parents:
diff changeset
127 Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex");
1384a0d382fa first input
dwinter
parents:
diff changeset
128 im.initializeIndexer(true);
1384a0d382fa first input
dwinter
parents:
diff changeset
129 im.writeStatementsToIndex();
1384a0d382fa first input
dwinter
parents:
diff changeset
130 im.writer.optimize();
1384a0d382fa first input
dwinter
parents:
diff changeset
131 im.writer.close();
1384a0d382fa first input
dwinter
parents:
diff changeset
132 System.out.println("Number of Statements:"+String.valueOf(im.counter));
1384a0d382fa first input
dwinter
parents:
diff changeset
133 }
1384a0d382fa first input
dwinter
parents:
diff changeset
134
1384a0d382fa first input
dwinter
parents:
diff changeset
135 static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{
1384a0d382fa first input
dwinter
parents:
diff changeset
136 Logger.getRootLogger().setLevel(Level.DEBUG);
1384a0d382fa first input
dwinter
parents:
diff changeset
137 BasicConfigurator.configure();
1384a0d382fa first input
dwinter
parents:
diff changeset
138 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba");
1384a0d382fa first input
dwinter
parents:
diff changeset
139
1384a0d382fa first input
dwinter
parents:
diff changeset
140 String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"};
1384a0d382fa first input
dwinter
parents:
diff changeset
141
1384a0d382fa first input
dwinter
parents:
diff changeset
142 Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex");
1384a0d382fa first input
dwinter
parents:
diff changeset
143 im.initializeIndexer(true);
1384a0d382fa first input
dwinter
parents:
diff changeset
144 im.writeStatementsToIndex();
1384a0d382fa first input
dwinter
parents:
diff changeset
145 im.writer.optimize();
1384a0d382fa first input
dwinter
parents:
diff changeset
146 im.writer.close();
1384a0d382fa first input
dwinter
parents:
diff changeset
147 System.out.println("Number of Statemens:"+String.valueOf(im.counter));
1384a0d382fa first input
dwinter
parents:
diff changeset
148 }
1384a0d382fa first input
dwinter
parents:
diff changeset
149
1384a0d382fa first input
dwinter
parents:
diff changeset
150 }