Mercurial > hg > NamedIdentityManager
view src/de/mpiwg/itgroup/nimanager/importer/Import.java @ 0:1384a0d382fa
first input
author | dwinter |
---|---|
date | Thu, 30 Jun 2011 11:44:24 +0200 |
parents | |
children | b8333fab0d95 |
line wrap: on
line source
package de.mpiwg.itgroup.nimanager.importer; import java.net.URISyntaxException; import java.net.URL; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.SQLException; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.openrdf.model.Resource; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; import org.openrdf.model.impl.LiteralImpl; import org.openrdf.query.BindingSet; import org.openrdf.query.MalformedQueryException; import org.openrdf.query.QueryEvaluationException; import org.openrdf.query.QueryLanguage; import org.openrdf.query.TupleQuery; import org.openrdf.query.TupleQueryResult; import org.openrdf.repository.RepositoryConnection; import org.openrdf.repository.RepositoryException; import org.openrdf.repository.RepositoryResult; import com.hp.hpl.jena.graph.impl.LiteralLabel; import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException; import de.mpiwg.itgroup.nimanager.owl.MetaDataHandler; import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler; import edu.stanford.smi.protegex.owl.jena.JenaOWLModel; import edu.stanford.smi.protegex.owl.model.OWLIndividual; import edu.stanford.smi.protegex.owl.model.RDFProperty; import edu.stanford.smi.protegex.owl.model.RDFSLiteral; public class Import { private MetaDataHandler mh; private TripleStoreHandler th; private Logger logger = Logger.getRootLogger(); // private Connection con; private String offset; private String limit; public Import(MetaDataHandler mh, TripleStoreHandler th, String offset, String limit) throws SQLException, ClassNotFoundException { this.mh = mh; this.th = th; // Class.forName("virtuoso.jdbc4.Driver"); // String // connectString="jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111/charset=UTF-8"; // this.con = // DriverManager.getConnection(connectString,"dwinter","weikiki7"); this.offset = offset; this.limit = limit; } private void createMPIWGIdentifiers(String clsName, HashMap<String, String> mapping, String inCtx, String outCtx) throws RepositoryException, MalformedQueryException, QueryEvaluationException, URISyntaxException, TripleStoreHandlerException, SQLException { int newPersonID = 0; int namedEntityIdentifierID= 0; int namedEntityIdentifierCreationID= 0; List<String> classes = mh.getEquivalentClasses(clsName); // suche alle // aequivalenten // Klassen classes.add(clsName); // add the classname it self; // RepositoryResult<Statement> objects= new // RepositoryResult<Statement>(null); OWLIndividual providerMPIWG = th.getProvider(mh, TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG"); if (providerMPIWG == null) { providerMPIWG = createMPIWFProvider(outCtx); } OWLIndividual providerDbPedia = th.getProvider(mh, TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia"); if (providerDbPedia == null) { providerDbPedia = createDbPediaProvider(outCtx); } for (String cl : classes) { // gehe durch die klassen RepositoryConnection con = th.getRepository().getConnection(); // String queryString = "SELECT DISTINCT ?s FROM <" + // inCtx+"> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" // + cl + "> .} OFFSET 199999 LIMIT 100000"; String queryString = "SELECT DISTINCT ?s FROM <" + inCtx + "> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + cl + "> .} OFFSET " + offset + " LIMIT " + limit; // TupleQuery tupleQuery = // con.prepareTupleQuery(QueryLanguage.SPARQL, queryString); TupleQueryResult result = th.querySPARQL(queryString); // RepositoryResult<Statement> objects = th.getStatements(null, // "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", cl); // while(objects.hasNext()){ // Statement st=objects.next(); // Resource subj = st.getSubject(); // while (result.hasNext()) { BindingSet bindingSet = result.next(); Value subjValue = bindingSet.getValue("s"); URI subj = th.getRepository().getValueFactory() .createURI(subjValue.stringValue()); // Value p = bindingSet.getValue("p"); // Value o = bindingSet.getValue("o"); HashMap<String, Object> newValues = new HashMap<String, Object>();// werte // die // neu // eingetragen // werden // muessen for (String key : mapping.keySet()) { RepositoryResult<Statement> namesStatements = con .getStatements(subj, th.getRepository() .getValueFactory().createURI(key), null, false); Statement firstStatement = TripleStoreHandler .getFirstStatement(namesStatements); if (firstStatement != null) { Object newValue; Value val = firstStatement.getObject(); if (LiteralImpl.class.isInstance(val)) { // wenn ein // string // literal, // dann // uebersetze // in jena // string // literal LiteralImpl li = (LiteralImpl) val; newValue = mh.getOwlModel() .createRDFSLiteralOrString(li.getLabel(), li.getLanguage()); } else { // anderfalls dern string wert = uri newValue = val.stringValue(); } newValues.put(mapping.get(key), newValue); } } // first create the new person Boolean ex = checkExistance(newValues,th, mapping.values(), outCtx); if (ex) { logger.info("nothing to be done!"); continue; } newPersonID = getNewId(newPersonID, "Person", outCtx); logger.info("New ID choosen:" + String.valueOf(newPersonID)); OWLIndividual person = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "Person", TripleStoreHandler.ONTOLOGY_NS + "Person:" + String.valueOf(newPersonID), newValues); mh.printIndividual(person); newPersonID += 1; // Boolean ex = checkExistance(person, th, mapping.values(), // outCtx); // if (ex) { // logger.info("nothing to be done!"); // person.delete(); // continue; // } th.write(person, outCtx); // now we create the MPIWG identifier and connect it to the // person HashMap<String, Object> idValues = new HashMap<String, Object>(); idValues.put(TripleStoreHandler.ONTOLOGY_NS + "is_preferred_namedEntityIdentifier", person); idValues.put( "http://erlangen-crm.org/plus/xdt/110404/has_XSD_String", "Person:" + String.valueOf(newPersonID)); // TODO PID // GENERATOR namedEntityIdentifierID = getNewId(namedEntityIdentifierID, "NamedEntityIdentifier", outCtx); OWLIndividual mpiwgIdentifier = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier:" + String.valueOf(namedEntityIdentifierID), idValues); namedEntityIdentifierID+=1; th.write(mpiwgIdentifier, outCtx); // now create the creation object and connect it to the MPIWG // identifier and the provider MPIWG HashMap<String, Object> creationValues = new HashMap<String, Object>(); creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "carriedOutByNamedEntityProvider", providerMPIWG); creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "created_NamedEntityIdentifier", mpiwgIdentifier); namedEntityIdentifierCreationID = getNewId(namedEntityIdentifierCreationID, "NamedEntityIdentifierCreation", outCtx); OWLIndividual creation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation:" + String.valueOf(namedEntityIdentifierCreationID), creationValues); namedEntityIdentifierCreationID+=1; th.write(creation, outCtx); // now create the wikipedia identifier and connect it to the // person HashMap<String, Object> db_idValues = new HashMap<String, Object>(); db_idValues.put(TripleStoreHandler.ONTOLOGY_NS + "identifies_NamedEntity", person); // identifier is the url at dbpedia db_idValues .put("http://erlangen-crm.org/plus/xdt/110404/has_XSD_String", subjValue.stringValue()); namedEntityIdentifierID = getNewId(namedEntityIdentifierID, "NamedEntityIdentifier", outCtx); OWLIndividual dbIdentifier = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier:" + String.valueOf(namedEntityIdentifierID), db_idValues); namedEntityIdentifierID+=1; th.write(dbIdentifier, outCtx); // now create the creation object and connect it to the dbpedia // identifier and the provider pdbedia HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "carriedOutByNamedEntityProvider", providerDbPedia); db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "created_NamedEntityIdentifier", dbIdentifier); namedEntityIdentifierCreationID = getNewId(namedEntityIdentifierCreationID, "NamedEntityIdentifierCreation", outCtx); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation:" + String.valueOf(namedEntityIdentifierCreationID), db_creationValues); namedEntityIdentifierCreationID+=1; th.write(dbcreation, outCtx); // add the bbpedia identifier to the triple store th.write(subj.stringValue(), "http://erlangen-crm.org/110404/P1_is_identified_by", dbIdentifier.getURI(), outCtx); ; } } } private Boolean checkExistance(HashMap<String, Object> newValues, TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException { String queryString = "select ?x FROM <" + outCtx + "> " + "where {"; for (String propString : props) { //RDFProperty rdfProp = model.getRDFProperty(propString); RDFSLiteral val = (RDFSLiteral) newValues.get(propString); if (val==null) continue; String lang = val.getLanguage(); String str = val.getString(); str = str.replace("\"", "\\\""); queryString += "?x <" + propString + "> \"" + str + "\"@" + lang + "."; } queryString += " }"; TupleQueryResult result; try { result = th.querySPARQL(queryString); } catch (MalformedQueryException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (TripleStoreHandlerException e) { e.printStackTrace(); logger.error("Query String cannot be handled:" + queryString); return false; } try { if (result.hasNext()) return true; else return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } } private int getNewId(int startnumber, String identifier, String ctx) throws SQLException { java.sql.Statement smt = th.sqlCon.createStatement(); Boolean exists = true; while (exists) { String cmdString = String.format( "sparql select count(*) from <%s> where {<%s> ?x ?y}", ctx, TripleStoreHandler.ONTOLOGY_NS + "Person:" + String.valueOf(startnumber)); smt.execute(cmdString); ResultSet rs = smt.getResultSet(); rs.next(); int count = rs.getInt(1); if (count > 0) { startnumber += 1; } else { exists = false; } } return startnumber; } private Boolean checkExistance(OWLIndividual person, TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException { JenaOWLModel model = mh.getOwlModel(); //Map<String, String> vals = new HashMap<String, String>(); String queryString = "select ?x FROM <" + outCtx + "> " + "where {"; for (String propString : props) { RDFProperty rdfProp = model.getRDFProperty(propString); RDFSLiteral val = (RDFSLiteral) person.getPropertyValue(rdfProp); if (val == null) continue; String lang = val.getLanguage(); String str = val.getString(); str = str.replace("\"", "\\\""); queryString += "?x <" + propString + "> \"" + str + "\"@" + lang + "."; } queryString += " }"; TupleQueryResult result; try { result = th.querySPARQL(queryString); } catch (MalformedQueryException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (TripleStoreHandlerException e) { logger.error("Query String cannot be handled:" + queryString); return false; } try { if (result.hasNext()) return true; else return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } } private OWLIndividual createDbPediaProvider(String ctx) throws RepositoryException, TripleStoreHandlerException { HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider", TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia"); th.write(dbcreation, ctx); return dbcreation; } private OWLIndividual createMPIWFProvider(String ctx) throws RepositoryException, TripleStoreHandlerException { HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider", TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG"); th.write(dbcreation, ctx); return dbcreation; } public static void main(String args[]) throws Exception { if (args.length < 2) { System.out.println("usage: import offset limit"); System.exit(1); } Logger.getRootLogger().setLevel(Level.INFO); BasicConfigurator.configure(); MetaDataHandler mh = new MetaDataHandler(); TripleStoreHandler th = new TripleStoreHandler( "jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba", "wa55er"); System.out.println(mh); System.out.println(th); Import imp = new Import(mh, th, args[0], args[1]); HashMap<String, String> mapping = new HashMap<String, String>(); mapping.put("http://xmlns.com/foaf/0.1/surname", "http://xmlns.com/foaf/0.1/lastName"); mapping.put("http://xmlns.com/foaf/0.1/givenName", "http://xmlns.com/foaf/0.1/firstName"); mapping.put("http://xmlns.com/foaf/0.1/name", "http://xmlns.com/foaf/0.1/name"); imp.createMPIWGIdentifiers("http://dbpedia.org/ontology/Person", mapping, "file://personendataWikipedia", "file://mpiwg_persons.rdf"); // mh.getOwlModel().save(new java.net.URI("file:///tmp/prot.owl")); } }