Mercurial > hg > NamedIdentityManager
view src/de/mpiwg/itgroup/nimanager/importer/ImportGND.java @ 2:e3ecb88314a5
minor bugs
ontologies added
| field | value |
|---|---|
| author | dwinter |
| date | Fri, 02 Dec 2011 08:37:03 +0100 |
| parents | |
| children | f986e74583eb |
line wrap: on
line source
package de.mpiwg.itgroup.nimanager.importer;

import java.awt.dnd.DnDConstants;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.query.Binding;
import org.openrdf.query.BindingSet;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.Query;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import com.hp.hpl.jena.graph.impl.LiteralLabel;

import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.nimanager.owl.MetaDataHandler;
import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
import edu.stanford.smi.protegex.owl.model.OWLIndividual;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFSLiteral;

/**
 * Imports person entries from the GND (German National Library authority
 * file) RDF context into the MPIWG named-identity store.
 *
 * For each GND subject carrying a given marker predicate this importer
 * either links the GND URI to an already known MPIWG person (found via the
 * subject's dbpedia <code>owl:sameAs</code> link) or creates a new MPIWG
 * person plus the accompanying NamedEntityIdentifier /
 * NamedEntityIdentifierCreation individuals in the output context.
 *
 * NOTE(review): not thread-safe — it keeps per-run state
 * (<code>dbpediaMissing</code>) in instance fields.
 */
public class ImportGND {

	private MetaDataHandler mh;
	private TripleStoreHandler th;
	private Logger logger = Logger.getRootLogger();
	// private Connection con;
	// OFFSET/LIMIT values spliced verbatim into the SPARQL selection query.
	private String offset;
	private String limit;
	// Log file for GND entries whose dbpedia link has no known MPIWG person.
	private FileWriter dbpediaMissing;
	// Additional context searched when allocating fresh numeric IDs.
	private String mpiwgPerson = "file://mpiwg_persons_2.rdf";

	/**
	 * @param mh     handler for the OWL metadata model
	 * @param th     handler for the triple store (SPARQL + SQL access)
	 * @param offset SPARQL OFFSET for the GND selection query
	 * @param limit  SPARQL LIMIT for the GND selection query
	 */
	public ImportGND(MetaDataHandler mh, TripleStoreHandler th, String offset,
			String limit) throws SQLException, ClassNotFoundException {
		this.mh = mh;
		this.th = th;
		this.offset = offset;
		this.limit = limit;
	}

	/** Convenience overload with an empty complex-mapping table. */
	private void createMPIWGFromGNDIdentifiers(String predicate,
			HashMap<String, String> mapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException, IOException {
		createMPIWGFromGNDIdentifiers(predicate, mapping,
				new HashMap<String, String>(), inCtx, outCtx);
	}

	/**
	 * Main import loop. The marker {@code predicate} identifies GND entries,
	 * which carry no rdfs:type attribute; every subject having that predicate
	 * is taken as a person entry.
	 *
	 * @param predicate      predicate whose presence marks a GND person entry
	 * @param mapping        GND predicate URI -&gt; target property URI for
	 *                       simple literal copies
	 * @param complexMapping SPARQL pattern (with a %s placeholder for the GND
	 *                       subject) -&gt; target property URI
	 * @param inCtx          named graph holding the GND data
	 * @param outCtx         named graph receiving the generated individuals
	 */
	private void createMPIWGFromGNDIdentifiers(String predicate,
			HashMap<String, String> mapping,
			HashMap<String, String> complexMapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException, IOException {
		dbpediaMissing = new FileWriter("/tmp/missingDBPedia.txt");
		RepositoryConnection con = null;
		try {
			int newPersonID = 0;
			int namedEntityIdentifierID = 0;
			int namedEntityIdentifierCreationID = 0;

			// Make sure the three provider individuals exist in the output
			// context; create them on first use.
			OWLIndividual providerMPIWG = th.getProvider(mh,
					TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
			if (providerMPIWG == null) {
				providerMPIWG = createMPIWFProvider(outCtx);
			}
			OWLIndividual providerDbPedia = th.getProvider(mh,
					TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
			if (providerDbPedia == null) {
				providerDbPedia = createDbPediaProvider(outCtx);
			}
			OWLIndividual providerDNB = th.getProvider(mh,
					TripleStoreHandler.ONTOLOGY_NS + "provider_DNB");
			if (providerDNB == null) {
				providerDNB = createDNBProvider(outCtx);
			}

			con = th.getRepository().getConnection();

			// Find all subjects carrying the marker predicate.
			String queryString = "SELECT DISTINCT ?s FROM <" + inCtx
					+ "> WHERE {?s <" + predicate + ">" + "?o .} OFFSET "
					+ offset + " LIMIT " + limit;
			TupleQueryResult result = th.querySPARQL(queryString);
			while (result.hasNext()) {
				BindingSet bindingSet = result.next();
				Value subjValue = bindingSet.getValue("s");
				URI gndPerson = th.getRepository().getValueFactory()
						.createURI(subjValue.stringValue());

				// If a dbpedia link exists and already points at a known
				// person, just attach the GND identifier to that person;
				// otherwise a new person has to be created.
				boolean createNew = false;
				URI dbpedia = getDBPediaLink(gndPerson, inCtx);
				if (dbpedia != null) {
					URI person = getPersonFromDBPedia(dbpedia);
					if (person != null) {
						OWLIndividual personInd = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS + "Person",
								person.toString());
						addGNDToMPIWGIdentifier(gndPerson, personInd,
								namedEntityIdentifierID, outCtx, providerDNB,
								namedEntityIdentifierCreationID);
					} else {
						dbpediaMissing.write(subjValue.stringValue());
						createNew = true;
					}
				} else {
					createNew = true;
				}

				// No person entry exists yet for this GND subject.
				if (createNew) {
					// Values to be written for the new person.
					HashMap<String, Object> newValues = new HashMap<String, Object>();

					// Simple mappings: copy the first literal found per
					// source predicate.
					for (String key : mapping.keySet()) {
						RepositoryResult<Statement> namesStatements = con
								.getStatements(gndPerson, th.getRepository()
										.getValueFactory().createURI(key),
										null, false);
						while (namesStatements.hasNext()) {
							Statement stmt = namesStatements.next();
							Value val = stmt.getObject(); // should be a literal
							if (LiteralImpl.class.isInstance(val)) {
								// String literal: translate into a Jena
								// literal preserving the language tag.
								LiteralImpl li = (LiteralImpl) val;
								Object newValue = mh.getOwlModel()
										.createRDFSLiteralOrString(
												li.getLabel(),
												li.getLanguage());
								newValues.put(mapping.get(key), newValue);
								break; // always take only the first literal
							}
						}
					}

					// Complex mappings: one SPARQL pattern per target
					// property, parameterized with the GND subject URI.
					for (String key : complexMapping.keySet()) {
						String cmd = "select ?o from <" + inCtx + "> where {"
								+ String.format(key, gndPerson.stringValue())
								+ "}";
						TupleQueryResult results = th.querySPARQL(cmd);
						if (results.hasNext()) { // take only the first hit
							BindingSet firstStatement = results.next();
							Object newValue;
							Value val = firstStatement.getBinding("o")
									.getValue();
							if (LiteralImpl.class.isInstance(val)) {
								// String literal: translate to Jena literal.
								LiteralImpl li = (LiteralImpl) val;
								newValue = mh.getOwlModel()
										.createRDFSLiteralOrString(
												li.getLabel(),
												li.getLanguage());
							} else {
								// Otherwise take the string value (= URI).
								newValue = val.stringValue();
							}
							newValues.put(complexMapping.get(key), newValue);
						}
					}

					// Skip subjects whose mapped values are already present.
					Boolean ex = checkExistance(newValues, th,
							mapping.values(), outCtx);
					if (ex) {
						logger.info("nothing to be done!");
						continue;
					}

					// First create the new person.
					newPersonID = getNewId(newPersonID, "Person",
							new String[] { outCtx, mpiwgPerson });
					logger.info("New ID choosen:" + String.valueOf(newPersonID));
					// BUGFIX: capture the local name BEFORE the counter is
					// incremented, so the identifier string below refers to
					// the person just created (previously it pointed one ID
					// ahead).
					String personLocalName = "Person:"
							+ String.valueOf(newPersonID);
					OWLIndividual person = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS + "Person",
							TripleStoreHandler.ONTOLOGY_NS + personLocalName,
							newValues);
					mh.printIndividual(person);
					newPersonID += 1;
					th.write(person, outCtx);

					// Now create the MPIWG identifier and connect it to the
					// person.
					HashMap<String, Object> idValues = new HashMap<String, Object>();
					idValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "is_preferred_namedEntityIdentifier", person);
					idValues.put(
							"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
							personLocalName); // TODO PID GENERATOR
					namedEntityIdentifierID = getNewId(namedEntityIdentifierID,
							"NamedEntityIdentifier", new String[] { outCtx,
									mpiwgPerson });
					OWLIndividual mpiwgIdentifier = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier",
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier:"
									+ String.valueOf(namedEntityIdentifierID),
							idValues);
					namedEntityIdentifierID += 1;
					th.write(mpiwgIdentifier, outCtx);

					// Now create the creation object and connect it to the
					// MPIWG identifier and the provider MPIWG.
					HashMap<String, Object> creationValues = new HashMap<String, Object>();
					creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "carriedOutByNamedEntityProvider", providerMPIWG);
					creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "created_NamedEntityIdentifier", mpiwgIdentifier);
					namedEntityIdentifierCreationID = getNewId(
							namedEntityIdentifierCreationID,
							"NamedEntityIdentifierCreation", new String[] {
									outCtx, mpiwgPerson });
					OWLIndividual creation = mh
							.generateEntity(
									TripleStoreHandler.ONTOLOGY_NS
											+ "NamedEntityIdentifierCreation",
									TripleStoreHandler.ONTOLOGY_NS
											+ "NamedEntityIdentifierCreation:"
											+ String.valueOf(namedEntityIdentifierCreationID),
									creationValues);
					namedEntityIdentifierCreationID += 1;
					th.write(creation, outCtx);

					// Finally attach the GND identifier to the new person.
					addGNDToMPIWGIdentifier(gndPerson, person,
							namedEntityIdentifierID, outCtx, providerDNB,
							namedEntityIdentifierCreationID);
				}
			}
		} finally {
			// BUGFIX: previously neither the repository connection nor the
			// report file were ever closed.
			if (con != null) {
				con.close();
			}
			dbpediaMissing.close();
		}
	}

	/**
	 * Creates a GND NamedEntityIdentifier (holding the GND URL) for
	 * {@code person}, the matching NamedEntityIdentifierCreation carried out
	 * by {@code gndProvider}, and links the GND URI to the identifier via
	 * P1_is_identified_by.
	 */
	private void addGNDToMPIWGIdentifier(URI gndPerson, OWLIndividual person,
			int namedEntityIdentifierID, String outCtx, Object gndProvider,
			int namedEntityIdentifierCreationID) throws SQLException,
			RepositoryException, TripleStoreHandlerException {
		// Now create the GND identifier and connect it to the person.
		HashMap<String, Object> db_idValues = new HashMap<String, Object>();
		db_idValues.put(TripleStoreHandler.ONTOLOGY_NS
				+ "identifies_NamedEntity", person);
		// The identifier value is the URL at the DNB.
		db_idValues.put(
				"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
				gndPerson.stringValue());
		namedEntityIdentifierID = getNewId(namedEntityIdentifierID,
				"NamedEntityIdentifier", new String[] { outCtx, mpiwgPerson });
		OWLIndividual dbIdentifier = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier",
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityIdentifier:"
						+ String.valueOf(namedEntityIdentifierID), db_idValues);
		namedEntityIdentifierID += 1;
		th.write(dbIdentifier, outCtx);

		// Now create the creation object and connect it to the GND
		// identifier and the GND provider.
		HashMap<String, Object> db_creationValues = new HashMap<String, Object>();
		db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
				+ "carriedOutByNamedEntityProvider", gndProvider);
		db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
				+ "created_NamedEntityIdentifier", dbIdentifier);
		namedEntityIdentifierCreationID = getNewId(
				namedEntityIdentifierCreationID,
				"NamedEntityIdentifierCreation", new String[] { outCtx,
						mpiwgPerson });
		OWLIndividual dbcreation = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS
						+ "NamedEntityIdentifierCreation",
				TripleStoreHandler.ONTOLOGY_NS
						+ "NamedEntityIdentifierCreation:"
						+ String.valueOf(namedEntityIdentifierCreationID),
				db_creationValues);
		namedEntityIdentifierCreationID += 1;
		th.write(dbcreation, outCtx);

		// Add the GND identifier triple to the store.
		th.write(gndPerson.stringValue(),
				"http://erlangen-crm.org/110404/P1_is_identified_by",
				dbIdentifier.getURI(), outCtx);
	}

	/**
	 * Looks up the MPIWG person identified (via P1_is_identified_by) by the
	 * given dbpedia URI in the mpiwg_persons graph.
	 *
	 * @return the person URI, or null if none exists or the query fails
	 */
	private URI getPersonFromDBPedia(URI dbpedia) {
		String query = "select distinct ?x ?y ?person "
				+ "from <file://mpiwg_persons.rdf> "
				+ "where { ?y <http://ontologies.mpiwg-berlin.mpg.de/authorities/namedIdentities#identifies_NamedEntity> ?person."
				+ "<"
				+ dbpedia.stringValue()
				+ "> <http://erlangen-crm.org/110404/P1_is_identified_by> ?y.}";
		try {
			TupleQueryResult results = th.querySPARQL(query);
			while (results.hasNext()) {
				BindingSet stm = results.next();
				Binding person = stm.getBinding("person");
				return (URI) person.getValue(); // first hit wins
			}
		} catch (MalformedQueryException e) {
			logger.error("getPersonFromDBPedia failed:" + query, e);
		} catch (QueryEvaluationException e) {
			logger.error("getPersonFromDBPedia failed:" + query, e);
		} catch (TripleStoreHandlerException e) {
			logger.error("getPersonFromDBPedia failed:" + query, e);
		}
		return null;
	}

	/**
	 * Returns the first owl:sameAs object of {@code subj} that lives in the
	 * dbpedia.org/resource namespace, or null if none is found or the
	 * repository access fails.
	 */
	private URI getDBPediaLink(URI subj, String inCtx) {
		try {
			RepositoryResult<Statement> statements = th.getStatements(subj,
					th.createUri("http://www.w3.org/2002/07/owl#sameAs"),
					null, inCtx);
			while (statements.hasNext()) {
				Statement smt = statements.next();
				URI obj = (URI) smt.getObject();
				if (obj.getNamespace().equals("http://dbpedia.org/resource/")) {
					return obj;
				}
			}
			return null;
		} catch (RepositoryException e) {
			logger.error("getDBPediaLink failed for:" + subj, e);
			return null;
		}
	}

	/**
	 * Checks whether an individual with exactly the mapped literal values
	 * already exists in {@code outCtx}.
	 *
	 * @param newValues property URI -&gt; value (RDFSLiteral or String)
	 * @param th2       unused; kept for signature compatibility
	 * @param props     property URIs to match against
	 * @return true if a matching individual exists; false otherwise or on
	 *         query failure
	 */
	private Boolean checkExistance(HashMap<String, Object> newValues,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {
		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
		for (String propString : props) {
			Object valObj = newValues.get(propString);
			// BUGFIX: skip missing values up front. The old code only had a
			// dead null-check after a successful isInstance cast and threw a
			// NullPointerException in the String branch when the map had no
			// entry for the property.
			if (valObj == null) {
				continue;
			}
			if (RDFSLiteral.class.isInstance(valObj)) {
				RDFSLiteral val = (RDFSLiteral) valObj;
				String lang = val.getLanguage();
				String str = val.getString().replace("\"", "\\\"");
				queryString += "?x <" + propString + "> \"" + str + "\"@"
						+ lang + ".";
			} else {
				String str = ((String) valObj).replace("\"", "\\\"");
				queryString += "?x <" + propString + "> \"" + str + "\"" + ".";
			}
		}
		queryString += " }";
		TupleQueryResult result;
		try {
			result = th.querySPARQL(queryString);
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		}
		try {
			return result.hasNext();
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		}
	}

	/**
	 * Allocates the next free numeric ID for {@code identifier} by probing
	 * the given contexts (via Virtuoso's SQL/SPARQL bridge) starting at
	 * {@code startnumber} until an unused URI is found.
	 */
	private int getNewId(int startnumber, String identifier, String[] ctx)
			throws SQLException {
		java.sql.Statement smt = th.sqlCon.createStatement();
		try {
			// The FROM clause is loop-invariant; build it once.
			String fromString = "";
			for (int i = 0; i < ctx.length; i++) {
				fromString += String.format(" from <%s> ", ctx[i]);
			}
			Boolean exists = true;
			while (exists) {
				String cmdString = String.format(
						"sparql select count(*) %s where {<%s> ?x ?y}",
						fromString, TripleStoreHandler.ONTOLOGY_NS + identifier
								+ ":" + String.valueOf(startnumber));
				smt.execute(cmdString);
				ResultSet rs = smt.getResultSet();
				rs.next();
				int count = rs.getInt(1);
				rs.close();
				if (count > 0) {
					startnumber += 1; // URI taken, try the next number
				} else {
					exists = false;
				}
			}
			return startnumber;
		} finally {
			// BUGFIX: the statement used to leak on every call.
			smt.close();
		}
	}

	/**
	 * Variant of {@link #checkExistance(HashMap, TripleStoreHandler,
	 * Collection, String)} that reads the literal values directly from an
	 * already generated individual.
	 */
	private Boolean checkExistance(OWLIndividual person,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {
		JenaOWLModel model = mh.getOwlModel();
		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
		for (String propString : props) {
			RDFProperty rdfProp = model.getRDFProperty(propString);
			RDFSLiteral val = (RDFSLiteral) person.getPropertyValue(rdfProp);
			if (val == null)
				continue;
			String lang = val.getLanguage();
			String str = val.getString().replace("\"", "\\\"");
			queryString += "?x <" + propString + "> \"" + str + "\"@" + lang
					+ ".";
		}
		queryString += " }";
		TupleQueryResult result;
		try {
			result = th.querySPARQL(queryString);
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		}
		try {
			return result.hasNext();
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString);
			return false;
		}
	}

	/** Creates and persists the Wikipedia/dbpedia provider individual. */
	private OWLIndividual createDbPediaProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual dbcreation = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
		th.write(dbcreation, ctx);
		return dbcreation;
	}

	/** Creates and persists the DNB provider individual. */
	private OWLIndividual createDNBProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual dbcreation = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_DNB");
		th.write(dbcreation, ctx);
		return dbcreation;
	}

	/** Creates and persists the MPIWG provider individual. */
	private OWLIndividual createMPIWFProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual dbcreation = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
		th.write(dbcreation, ctx);
		return dbcreation;
	}

	/**
	 * Entry point: {@code import user pw offset limit}. Wires up the handlers
	 * against the MPIWG Virtuoso instance and imports GND persons using the
	 * standard name/foreName/surname mappings.
	 */
	public static void main(String args[]) throws Exception {
		if (args.length < 4) {
			System.out.println("usage: import user pw offset limit ");
			System.exit(1);
		}
		Logger.getRootLogger().setLevel(Level.INFO);
		BasicConfigurator.configure();
		MetaDataHandler mh = new MetaDataHandler();
		TripleStoreHandler th = new TripleStoreHandler(
				"jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", args[0],
				args[1]);
		System.out.println(mh);
		System.out.println(th);
		ImportGND imp = new ImportGND(mh, th, args[2], args[3]);
		HashMap<String, String> mapping = new HashMap<String, String>();
		HashMap<String, String> complexMapping = new HashMap<String, String>();
		// Complex mappings resolve the preferred-name node first, e.g.:
		//   SELECT DISTINCT * FROM <file://mpiwg_persons.rdf>
		//   FROM <file:///GND.rdf>
		//   WHERE { ?p <http://d-nb.info/gnd/foreName> ?o.
		//           <gnd-uri> <http://d-nb.info/gnd/preferredNameForThePerson> ?p }
		complexMapping.put("?p <http://d-nb.info/gnd/surname> ?o."
				+ "<%s> <http://d-nb.info/gnd/preferredNameForThePerson> ?p ",
				"http://xmlns.com/foaf/0.1/lastName");
		complexMapping.put("?p <http://d-nb.info/gnd/foreName> ?o."
				+ "<%s> <http://d-nb.info/gnd/preferredNameForThePerson> ?p ",
				"http://xmlns.com/foaf/0.1/firstName");
		mapping.put("http://d-nb.info/gnd/preferredNameForThePerson",
				"http://xmlns.com/foaf/0.1/name");
		imp.createMPIWGFromGNDIdentifiers(
				"http://RDVocab.info/ElementsGr2/identifierForThePerson",
				mapping, complexMapping, "file:///GND.rdf",
				"file://mpiwg_persons_dnb.rdf");
		// mh.getOwlModel().save(new java.net.URI("file:///tmp/prot.owl"));
	}
}