Mercurial > hg > NamedIdentityManager
view src/de/mpiwg/itgroup/nimanager/importer/Import.java @ 2:e3ecb88314a5
minor bugs
ontologies added
author | dwinter |
---|---|
date | Fri, 02 Dec 2011 08:37:03 +0100 |
parents | b8333fab0d95 |
children | f986e74583eb |
line wrap: on
line source
package de.mpiwg.itgroup.nimanager.importer; import java.net.URISyntaxException; import java.net.URL; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.SQLException; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.openrdf.model.Resource; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; import org.openrdf.model.impl.LiteralImpl; import org.openrdf.query.BindingSet; import org.openrdf.query.MalformedQueryException; import org.openrdf.query.QueryEvaluationException; import org.openrdf.query.QueryLanguage; import org.openrdf.query.TupleQuery; import org.openrdf.query.TupleQueryResult; import org.openrdf.repository.RepositoryConnection; import org.openrdf.repository.RepositoryException; import org.openrdf.repository.RepositoryResult; import com.hp.hpl.jena.graph.impl.LiteralLabel; import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException; import de.mpiwg.itgroup.nimanager.owl.MetaDataHandler; import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler; import edu.stanford.smi.protegex.owl.jena.JenaOWLModel; import edu.stanford.smi.protegex.owl.model.OWLIndividual; import edu.stanford.smi.protegex.owl.model.RDFProperty; import edu.stanford.smi.protegex.owl.model.RDFSLiteral; public class Import { private MetaDataHandler mh; private TripleStoreHandler th; private Logger logger = Logger.getRootLogger(); // private Connection con; private String offset; private String limit; public Import(MetaDataHandler mh, TripleStoreHandler th, String offset, String limit) throws SQLException, ClassNotFoundException { this.mh = mh; this.th = th; this.offset = offset; this.limit = limit; } private void createMPIWGIdentifiers(String clsName, HashMap<String, String> mapping, String inCtx, String outCtx) throws RepositoryException, MalformedQueryException, QueryEvaluationException, URISyntaxException, TripleStoreHandlerException, SQLException { int newPersonID = 0; int namedEntityIdentifierID= 0; int namedEntityIdentifierCreationID= 0; List<String> classes = mh.getEquivalentClasses(clsName); // suche alle // aequivalenten // Klassen classes.add(clsName); // add the classname it self; OWLIndividual providerMPIWG = th.getProvider(mh, TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG"); if (providerMPIWG == null) { providerMPIWG = createMPIWFProvider(outCtx); } OWLIndividual providerDbPedia = th.getProvider(mh, TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia"); if (providerDbPedia == null) { providerDbPedia = createDbPediaProvider(outCtx); } for (String cl : classes) { // gehe durch die klassen RepositoryConnection con = th.getRepository().getConnection(); String queryString = "SELECT DISTINCT ?s FROM <" + inCtx + "> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + cl + "> .} OFFSET " + offset + " LIMIT " + limit; TupleQueryResult result = th.querySPARQL(queryString); while (result.hasNext()) { BindingSet bindingSet = result.next(); Value subjValue = bindingSet.getValue("s"); URI subj = th.getRepository().getValueFactory() .createURI(subjValue.stringValue()); HashMap<String, Object> newValues = new HashMap<String, Object>();// werte // die // neu // eingetragen // werden // muessen for (String key : mapping.keySet()) { RepositoryResult<Statement> namesStatements = con .getStatements(subj, th.getRepository() .getValueFactory().createURI(key), null, false); Statement firstStatement = TripleStoreHandler .getFirstStatement(namesStatements); if (firstStatement != null) { Object newValue; Value val = firstStatement.getObject(); if (LiteralImpl.class.isInstance(val)) { // wenn ein // string // literal, // dann // uebersetze // in jena // string // literal LiteralImpl li = (LiteralImpl) val; newValue = mh.getOwlModel() .createRDFSLiteralOrString(li.getLabel(), li.getLanguage()); } else { // anderfalls dern string wert = uri newValue = val.stringValue(); } newValues.put(mapping.get(key), newValue); } } // first create the new person Boolean ex = checkExistance(newValues,th, mapping.values(), outCtx); if (ex) { logger.info("nothing to be done!"); continue; } newPersonID = getNewId(newPersonID, "Person", outCtx); logger.info("New ID choosen:" + String.valueOf(newPersonID)); OWLIndividual person = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "Person", TripleStoreHandler.ONTOLOGY_NS + "Person:" + String.valueOf(newPersonID), newValues); mh.printIndividual(person); newPersonID += 1; // Boolean ex = checkExistance(person, th, mapping.values(), // outCtx); // if (ex) { // logger.info("nothing to be done!"); // person.delete(); // continue; // } th.write(person, outCtx); // now we create the MPIWG identifier and connect it to the // person HashMap<String, Object> idValues = new HashMap<String, Object>(); idValues.put(TripleStoreHandler.ONTOLOGY_NS + "is_preferred_namedEntityIdentifier", person); idValues.put( "http://erlangen-crm.org/plus/xdt/110404/has_XSD_String", "Person:" + String.valueOf(newPersonID)); // TODO PID // GENERATOR namedEntityIdentifierID = getNewId(namedEntityIdentifierID, "NamedEntityIdentifier", outCtx); OWLIndividual mpiwgIdentifier = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier:" + String.valueOf(namedEntityIdentifierID), idValues); namedEntityIdentifierID+=1; th.write(mpiwgIdentifier, outCtx); // now create the creation object and connect it to the MPIWG // identifier and the provider MPIWG HashMap<String, Object> creationValues = new HashMap<String, Object>(); creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "carriedOutByNamedEntityProvider", providerMPIWG); creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "created_NamedEntityIdentifier", mpiwgIdentifier); namedEntityIdentifierCreationID = getNewId(namedEntityIdentifierCreationID, "NamedEntityIdentifierCreation", outCtx); OWLIndividual creation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation:" + String.valueOf(namedEntityIdentifierCreationID), creationValues); namedEntityIdentifierCreationID+=1; th.write(creation, outCtx); // now create the wikipedia identifier and connect it to the // person HashMap<String, Object> db_idValues = new HashMap<String, Object>(); db_idValues.put(TripleStoreHandler.ONTOLOGY_NS + "identifies_NamedEntity", person); // identifier is the url at dbpedia db_idValues .put("http://erlangen-crm.org/plus/xdt/110404/has_XSD_String", subjValue.stringValue()); namedEntityIdentifierID = getNewId(namedEntityIdentifierID, "NamedEntityIdentifier", outCtx); OWLIndividual dbIdentifier = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier:" + String.valueOf(namedEntityIdentifierID), db_idValues); namedEntityIdentifierID+=1; th.write(dbIdentifier, outCtx); // now create the creation object and connect it to the dbpedia // identifier and the provider pdbedia HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "carriedOutByNamedEntityProvider", providerDbPedia); db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS + "created_NamedEntityIdentifier", dbIdentifier); namedEntityIdentifierCreationID = getNewId(namedEntityIdentifierCreationID, "NamedEntityIdentifierCreation", outCtx); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation", TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifierCreation:" + String.valueOf(namedEntityIdentifierCreationID), db_creationValues); namedEntityIdentifierCreationID+=1; th.write(dbcreation, outCtx); // add the bbpedia identifier to the triple store th.write(subj.stringValue(), "http://erlangen-crm.org/110404/P1_is_identified_by", dbIdentifier.getURI(), outCtx); ; } } } private Boolean checkExistance(HashMap<String, Object> newValues, TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException { String queryString = "select ?x FROM <" + outCtx + "> " + "where {"; for (String propString : props) { //RDFProperty rdfProp = model.getRDFProperty(propString); RDFSLiteral val = (RDFSLiteral) newValues.get(propString); if (val==null) continue; String lang = val.getLanguage(); String str = val.getString(); str = str.replace("\"", "\\\""); queryString += "?x <" + propString + "> \"" + str + "\"@" + lang + "."; } queryString += " }"; TupleQueryResult result; try { result = th.querySPARQL(queryString); } catch (MalformedQueryException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (TripleStoreHandlerException e) { e.printStackTrace(); logger.error("Query String cannot be handled:" + queryString); return false; } try { if (result.hasNext()) return true; else return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } } private int getNewId(int startnumber, String identifier, String ctx) throws SQLException { java.sql.Statement smt = th.sqlCon.createStatement(); Boolean exists = true; while (exists) { String cmdString = String.format( "sparql select count(*) from <%s> where {<%s> ?x ?y}", ctx, TripleStoreHandler.ONTOLOGY_NS + identifier +":" + String.valueOf(startnumber)); smt.execute(cmdString); ResultSet rs = smt.getResultSet(); rs.next(); int count = rs.getInt(1); if (count > 0) { startnumber += 1; } else { exists = false; } } return startnumber; } private Boolean checkExistance(OWLIndividual person, TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException { JenaOWLModel model = mh.getOwlModel(); //Map<String, String> vals = new HashMap<String, String>(); String queryString = "select ?x FROM <" + outCtx + "> " + "where {"; for (String propString : props) { RDFProperty rdfProp = model.getRDFProperty(propString); RDFSLiteral val = (RDFSLiteral) person.getPropertyValue(rdfProp); if (val == null) continue; String lang = val.getLanguage(); String str = val.getString(); str = str.replace("\"", "\\\""); queryString += "?x <" + propString + "> \"" + str + "\"@" + lang + "."; } queryString += " }"; TupleQueryResult result; try { result = th.querySPARQL(queryString); } catch (MalformedQueryException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } catch (TripleStoreHandlerException e) { logger.error("Query String cannot be handled:" + queryString); return false; } try { if (result.hasNext()) return true; else return false; } catch (QueryEvaluationException e) { logger.error("Query String cannot be handled:" + queryString); return false; } } private OWLIndividual createDbPediaProvider(String ctx) throws RepositoryException, TripleStoreHandlerException { HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider", TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia"); th.write(dbcreation, ctx); return dbcreation; } private OWLIndividual createMPIWFProvider(String ctx) throws RepositoryException, TripleStoreHandlerException { HashMap<String, Object> db_creationValues = new HashMap<String, Object>(); OWLIndividual dbcreation = mh.generateEntity( TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider", TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG"); th.write(dbcreation, ctx); return dbcreation; } public static void main(String args[]) throws Exception { if (args.length < 4) { System.out.println("usage: import user pw offset limit "); System.exit(1); } Logger.getRootLogger().setLevel(Level.INFO); BasicConfigurator.configure(); MetaDataHandler mh = new MetaDataHandler(); TripleStoreHandler th = new TripleStoreHandler( "jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111",args[0], args[1]); System.out.println(mh); System.out.println(th); Import imp = new Import(mh, th, args[2], args[3]); HashMap<String, String> mapping = new HashMap<String, String>(); mapping.put("http://xmlns.com/foaf/0.1/surname", "http://xmlns.com/foaf/0.1/lastName"); mapping.put("http://xmlns.com/foaf/0.1/givenName", "http://xmlns.com/foaf/0.1/firstName"); mapping.put("http://xmlns.com/foaf/0.1/name", "http://xmlns.com/foaf/0.1/name"); imp.createMPIWGIdentifiers("http://dbpedia.org/ontology/Person", mapping, "file://personendataWikipedia", "file://mpiwg_persons_2.rdf"); // mh.getOwlModel().save(new java.net.URI("file:///tmp/prot.owl")); } }