Mercurial > hg > NamedIdentityManager
comparison src/de/mpiwg/itgroup/nimanager/luceneIndices/Importer.java @ 0:1384a0d382fa
first input
author | dwinter |
---|---|
date | Thu, 30 Jun 2011 11:44:24 +0200 |
parents | |
children | b8333fab0d95 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1384a0d382fa |
---|---|
1 /* | |
2 * Klasse importiert Werte von Triple in einen Lucene Index für die schnellere Suche. | |
3 */ | |
4 package de.mpiwg.itgroup.nimanager.luceneIndices; | |
5 | |
6 import java.io.File; | |
7 import java.io.IOException; | |
8 | |
9 import org.apache.log4j.BasicConfigurator; | |
10 import org.apache.log4j.Level; | |
11 import org.apache.log4j.Logger; | |
12 import org.apache.lucene.analysis.Analyzer; | |
13 import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
14 import org.apache.lucene.document.Document; | |
15 import org.apache.lucene.document.Field; | |
16 import org.apache.lucene.document.FieldSelectorResult; | |
17 import org.apache.lucene.index.CorruptIndexException; | |
18 import org.apache.lucene.index.IndexWriter; | |
19 import org.apache.lucene.index.IndexWriterConfig; | |
20 import org.apache.lucene.index.IndexWriterConfig.OpenMode; | |
21 import org.apache.lucene.store.Directory; | |
22 import org.apache.lucene.store.FSDirectory; | |
23 import org.apache.lucene.util.Version; | |
24 import org.openrdf.model.Statement; | |
25 import org.openrdf.model.Value; | |
26 import org.openrdf.repository.RepositoryException; | |
27 import org.openrdf.repository.RepositoryResult; | |
28 | |
29 import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException; | |
30 import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler; | |
31 | |
32 public class Importer { | |
33 | |
34 private Logger logger = Logger.getRootLogger(); | |
35 private String context; | |
36 private String[] pred; | |
37 private TripleStoreHandler th; | |
38 private String indexPath; | |
39 private IndexWriter writer; | |
40 private int counter=0; | |
41 | |
42 public Importer(String context, String[] indexFields, TripleStoreHandler th, String indexPath){ | |
43 this.context= context; | |
44 this.pred= indexFields; | |
45 this.th=th; | |
46 this.indexPath = indexPath; | |
47 | |
48 | |
49 } | |
50 | |
51 public int writeStatementsToIndex() throws RepositoryException, CorruptIndexException, IOException{ | |
52 counter=0; | |
53 for (int i=0;i<pred.length;i++) { | |
54 RepositoryResult<Statement> statements = th.getStatements(null, pred[i], null,context); | |
55 while (statements.hasNext()){ | |
56 Statement statement = statements.next(); | |
57 writeStatementToIndex(statement); | |
58 counter++; | |
59 } | |
60 | |
61 } | |
62 return counter; | |
63 } | |
64 | |
65 public int writeStatementsToIndex(String subj) throws RepositoryException, CorruptIndexException, IOException{ | |
66 counter=0; | |
67 for (int i=0;i<pred.length;i++) { | |
68 RepositoryResult<Statement> statements = th.getStatements(subj, pred[i], null,context); | |
69 while (statements.hasNext()){ | |
70 Statement statement = statements.next(); | |
71 writeStatementToIndex(statement); | |
72 counter++; | |
73 } | |
74 | |
75 } | |
76 return counter; | |
77 } | |
78 | |
79 public void initializeIndexer(Boolean create) throws IOException{ | |
80 Directory dir = FSDirectory.open(new File(indexPath)); | |
81 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); | |
82 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); | |
83 | |
84 if (create) { | |
85 // Create a new index in the directory, removing any | |
86 // previously indexed documents: | |
87 iwc.setOpenMode(OpenMode.CREATE); | |
88 } else { | |
89 // Add new documents to an existing index: | |
90 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); | |
91 } | |
92 | |
93 writer = new IndexWriter(dir, iwc); | |
94 } | |
95 private void writeStatementToIndex(Statement statement) throws CorruptIndexException, IOException { | |
96 Document doc = new Document(); | |
97 logger.debug("Adding:"+String.format("%s :%s",statement.getSubject().stringValue(),statement.getObject().stringValue())); | |
98 Field pathField = new Field("identifier",statement.getSubject().stringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); | |
99 pathField.setOmitTermFreqAndPositions(true); | |
100 doc.add(pathField); | |
101 | |
102 Field modifiedField = new Field(statement.getPredicate().stringValue(),statement.getObject().stringValue(),Field.Store.YES, Field.Index.ANALYZED); | |
103 doc.add(modifiedField); | |
104 | |
105 | |
106 writer.addDocument(doc); | |
107 | |
108 } | |
109 | |
110 public void close() throws CorruptIndexException, IOException{ | |
111 writer.close(); | |
112 } | |
113 static public void main(String args[]) throws RepositoryException, IOException, TripleStoreHandlerException{ | |
114 importFromMPIWG_persons(args); | |
115 } | |
116 static public void importFromMPIWG_persons(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{ | |
117 Logger.getRootLogger().setLevel(Level.DEBUG); | |
118 BasicConfigurator.configure(); | |
119 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "wa55er"); | |
120 | |
121 String[] indexFields=new String[]{"http://xmlns.com/foaf/0.1/name","http://xmlns.com/foaf/0.1/lastName","http://xmlns.com/foaf/0.1/firstName"}; | |
122 | |
123 Importer im = new Importer("file://mpiwg_persons.rdf", indexFields, th, "/tmp/tripleIndex"); | |
124 im.initializeIndexer(true); | |
125 im.writeStatementsToIndex(); | |
126 im.writer.optimize(); | |
127 im.writer.close(); | |
128 System.out.println("Number of Statements:"+String.valueOf(im.counter)); | |
129 } | |
130 | |
131 static public void importAlternativesFromWikipedia(String args[]) throws IOException, RepositoryException, TripleStoreHandlerException{ | |
132 Logger.getRootLogger().setLevel(Level.DEBUG); | |
133 BasicConfigurator.configure(); | |
134 TripleStoreHandler th = new TripleStoreHandler("jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "dba"); | |
135 | |
136 String[] indexFields=new String[]{"http://www.w3.org/2000/01/rdf-schema#label","http://dbpedia.org/property/alternativeNames"}; | |
137 | |
138 Importer im = new Importer("file://personendataWikipedia", indexFields, th, "/tmp/tripleIndex"); | |
139 im.initializeIndexer(true); | |
140 im.writeStatementsToIndex(); | |
141 im.writer.optimize(); | |
142 im.writer.close(); | |
143 System.out.println("Number of Statemens:"+String.valueOf(im.counter)); | |
144 } | |
145 | |
146 } |