Mercurial > hg > NamedIdentityManager
diff src/de/mpiwg/itgroup/nimanager/importer/Import.java @ 0:1384a0d382fa
first input
author | dwinter |
---|---|
date | Thu, 30 Jun 2011 11:44:24 +0200 |
parents | |
children | b8333fab0d95 |
line wrap: on
line diff
package de.mpiwg.itgroup.nimanager.importer;

import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.query.BindingSet;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import com.hp.hpl.jena.graph.impl.LiteralLabel;

import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.nimanager.owl.MetaDataHandler;
import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
import edu.stanford.smi.protegex.owl.model.OWLIndividual;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFSLiteral;

/**
 * Imports named entities from a source RDF context (e.g. a DBpedia dump) into
 * the named-identity triple store. For every instance of a given class it
 * creates a new MPIWG "Person" individual, an MPIWG identifier, a DBpedia
 * identifier (the source subject URI) and the "creation" events that link each
 * identifier to its provider.
 */
public class Import {

	private final MetaDataHandler mh;
	private final TripleStoreHandler th;
	private final Logger logger = Logger.getRootLogger();
	// Paging window (SPARQL OFFSET/LIMIT) for the selection query. Both are
	// concatenated verbatim into the query string, so callers must pass plain
	// integer strings.
	private final String offset;
	private final String limit;

	/**
	 * @param mh     ontology/metadata access (Protege OWL model)
	 * @param th     triple store access (Sesame repository plus Virtuoso SQL)
	 * @param offset SPARQL OFFSET of the import window
	 * @param limit  SPARQL LIMIT of the import window
	 */
	public Import(MetaDataHandler mh, TripleStoreHandler th, String offset,
			String limit) throws SQLException, ClassNotFoundException {
		this.mh = mh;
		this.th = th;
		this.offset = offset;
		this.limit = limit;
	}

	/**
	 * Creates MPIWG person individuals plus MPIWG and DBpedia identifiers for
	 * every instance of {@code clsName} (or of a class equivalent to it)
	 * found in {@code inCtx}, and writes the results into {@code outCtx}.
	 *
	 * @param clsName class URI whose instances are imported
	 * @param mapping source property URI -> target property URI; the first
	 *            value of each source property is copied onto the new person
	 * @param inCtx named graph to read from
	 * @param outCtx named graph to write to
	 */
	private void createMPIWGIdentifiers(String clsName,
			HashMap<String, String> mapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException {

		int newPersonID = 0;
		int namedEntityIdentifierID = 0;
		int namedEntityIdentifierCreationID = 0;

		// All classes equivalent to clsName, plus clsName itself.
		List<String> classes = mh.getEquivalentClasses(clsName);
		classes.add(clsName);

		// Make sure both provider individuals exist before importing.
		OWLIndividual providerMPIWG = th.getProvider(mh,
				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
		if (providerMPIWG == null) {
			providerMPIWG = createMPIWFProvider(outCtx);
		}

		OWLIndividual providerDbPedia = th.getProvider(mh,
				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
		if (providerDbPedia == null) {
			providerDbPedia = createDbPediaProvider(outCtx);
		}

		for (String cl : classes) {
			RepositoryConnection con = th.getRepository().getConnection();
			try {
				String queryString = "SELECT DISTINCT ?s FROM <"
						+ inCtx
						+ "> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <"
						+ cl + "> .} OFFSET " + offset + " LIMIT " + limit;

				TupleQueryResult result = th.querySPARQL(queryString);

				while (result.hasNext()) {
					BindingSet bindingSet = result.next();
					Value subjValue = bindingSet.getValue("s");
					URI subj = th.getRepository().getValueFactory()
							.createURI(subjValue.stringValue());

					// Values to be copied onto the new person individual.
					HashMap<String, Object> newValues = new HashMap<String, Object>();
					for (String key : mapping.keySet()) {
						RepositoryResult<Statement> namesStatements = con
								.getStatements(subj, th.getRepository()
										.getValueFactory().createURI(key),
										null, false);
						Statement firstStatement = TripleStoreHandler
								.getFirstStatement(namesStatements);

						if (firstStatement != null) {
							Object newValue;
							Value val = firstStatement.getObject();
							if (LiteralImpl.class.isInstance(val)) {
								// String literal: convert to a Jena literal so
								// the language tag survives.
								LiteralImpl li = (LiteralImpl) val;
								newValue = mh.getOwlModel()
										.createRDFSLiteralOrString(
												li.getLabel(),
												li.getLanguage());
							} else {
								// Otherwise the plain string value is a URI.
								newValue = val.stringValue();
							}
							newValues.put(mapping.get(key), newValue);
						}
					}

					// Skip subjects whose values already exist in outCtx.
					Boolean ex = checkExistance(newValues, th,
							mapping.values(), outCtx);
					if (ex) {
						logger.info("nothing to be done!");
						continue;
					}

					// 1) The person itself.
					newPersonID = getNewId(newPersonID, "Person", outCtx);
					logger.info("New ID choosen:" + String.valueOf(newPersonID));
					// Build the PID string BEFORE incrementing the counter;
					// the old code built it afterwards and therefore attached
					// the *next* id to the MPIWG identifier (off by one).
					String personPid = "Person:" + String.valueOf(newPersonID);
					OWLIndividual person = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS + "Person",
							TripleStoreHandler.ONTOLOGY_NS + personPid,
							newValues);
					mh.printIndividual(person);
					newPersonID += 1;
					th.write(person, outCtx);

					// 2) The MPIWG identifier, connected to the person.
					HashMap<String, Object> idValues = new HashMap<String, Object>();
					idValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "is_preferred_namedEntityIdentifier", person);
					idValues.put(
							"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
							personPid); // TODO PID GENERATOR

					namedEntityIdentifierID = getNewId(
							namedEntityIdentifierID, "NamedEntityIdentifier",
							outCtx);
					OWLIndividual mpiwgIdentifier = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier",
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier:"
									+ String.valueOf(namedEntityIdentifierID),
							idValues);
					namedEntityIdentifierID += 1;
					th.write(mpiwgIdentifier, outCtx);

					// 3) The creation event linking the MPIWG identifier to
					// the MPIWG provider.
					HashMap<String, Object> creationValues = new HashMap<String, Object>();
					creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "carriedOutByNamedEntityProvider", providerMPIWG);
					creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "created_NamedEntityIdentifier", mpiwgIdentifier);

					namedEntityIdentifierCreationID = getNewId(
							namedEntityIdentifierCreationID,
							"NamedEntityIdentifierCreation", outCtx);
					OWLIndividual creation = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifierCreation",
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifierCreation:"
									+ String.valueOf(namedEntityIdentifierCreationID),
							creationValues);
					namedEntityIdentifierCreationID += 1;
					th.write(creation, outCtx);

					// 4) The DBpedia identifier (the subject URI itself),
					// connected to the person.
					HashMap<String, Object> db_idValues = new HashMap<String, Object>();
					db_idValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "identifies_NamedEntity", person);
					db_idValues.put(
							"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
							subjValue.stringValue());

					namedEntityIdentifierID = getNewId(
							namedEntityIdentifierID, "NamedEntityIdentifier",
							outCtx);
					OWLIndividual dbIdentifier = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier",
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifier:"
									+ String.valueOf(namedEntityIdentifierID),
							db_idValues);
					namedEntityIdentifierID += 1;
					th.write(dbIdentifier, outCtx);

					// 5) The creation event linking the DBpedia identifier to
					// the wikipedia provider.
					HashMap<String, Object> db_creationValues = new HashMap<String, Object>();
					db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "carriedOutByNamedEntityProvider",
							providerDbPedia);
					db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
							+ "created_NamedEntityIdentifier", dbIdentifier);

					namedEntityIdentifierCreationID = getNewId(
							namedEntityIdentifierCreationID,
							"NamedEntityIdentifierCreation", outCtx);
					OWLIndividual dbcreation = mh.generateEntity(
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifierCreation",
							TripleStoreHandler.ONTOLOGY_NS
									+ "NamedEntityIdentifierCreation:"
									+ String.valueOf(namedEntityIdentifierCreationID),
							db_creationValues);
					namedEntityIdentifierCreationID += 1;
					th.write(dbcreation, outCtx);

					// 6) Link the original DBpedia subject to its identifier.
					th.write(subj.stringValue(),
							"http://erlangen-crm.org/110404/P1_is_identified_by",
							dbIdentifier.getURI(), outCtx);
				}
			} finally {
				// FIX: the connection was previously leaked (one per class).
				con.close();
			}
		}
	}

	/**
	 * Checks whether an entity carrying exactly the given literal property
	 * values already exists in {@code outCtx}.
	 *
	 * @param newValues target property URI -> value (RDFSLiteral, or String
	 *            for URI values, which are skipped)
	 * @param th2 unused; kept for signature compatibility
	 * @param props target property URIs to compare
	 * @param outCtx named graph to search in
	 * @return true if a matching entity exists; false otherwise (also on
	 *         query errors, which are logged)
	 */
	private Boolean checkExistance(HashMap<String, Object> newValues,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {

		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
		for (String propString : props) {
			Object raw = newValues.get(propString);
			// Only literals can be compared here. URI values are stored as
			// plain Strings; the old unconditional cast threw a
			// ClassCastException for them.
			if (!(raw instanceof RDFSLiteral))
				continue;
			queryString += literalPattern(propString, (RDFSLiteral) raw);
		}
		queryString += " }";

		return queryHasResult(queryString);
	}

	/**
	 * Variant of
	 * {@link #checkExistance(HashMap, TripleStoreHandler, Collection, String)}
	 * that reads the candidate values from an already created individual.
	 *
	 * @param th2 unused; kept for signature compatibility
	 */
	private Boolean checkExistance(OWLIndividual person,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {
		JenaOWLModel model = mh.getOwlModel();

		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
		for (String propString : props) {
			RDFProperty rdfProp = model.getRDFProperty(propString);
			RDFSLiteral val = (RDFSLiteral) person.getPropertyValue(rdfProp);
			if (val == null)
				continue;
			queryString += literalPattern(propString, val);
		}
		queryString += " }";

		return queryHasResult(queryString);
	}

	/**
	 * Builds the SPARQL triple pattern {@code ?x <prop> "value"@lang.} for one
	 * literal, escaping embedded quotes. The language tag is omitted when the
	 * literal has none (the old code emitted the malformed tag {@code @null}).
	 */
	private String literalPattern(String propString, RDFSLiteral val) {
		String str = val.getString().replace("\"", "\\\"");
		String lang = val.getLanguage();
		String pattern = "?x <" + propString + "> \"" + str + "\"";
		if (lang != null) {
			pattern += "@" + lang;
		}
		return pattern + ".";
	}

	/**
	 * Runs a SPARQL query and reports whether it returned at least one
	 * binding. Any query failure is logged (with cause) and treated as
	 * "no result".
	 */
	private Boolean queryHasResult(String queryString) {
		TupleQueryResult result;
		try {
			result = th.querySPARQL(queryString);
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		}
		try {
			return result.hasNext();
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		}
	}

	/**
	 * Returns the first number >= {@code startnumber} for which no individual
	 * {@code ONTOLOGY_NS + identifier + ":" + number} exists in {@code ctx},
	 * probing the store via Virtuoso SQL.
	 *
	 * @param startnumber first candidate number to try
	 * @param identifier local-name prefix of the individual (e.g. "Person")
	 * @param ctx named graph to probe
	 */
	private int getNewId(int startnumber, String identifier, String ctx)
			throws SQLException {

		java.sql.Statement smt = th.sqlCon.createStatement();
		try {
			while (true) {
				// FIX: the old code always probed "Person:<n>" and ignored
				// the identifier parameter, so NamedEntityIdentifier and
				// creation IDs could collide with existing individuals.
				String cmdString = String.format(
						"sparql select count(*) from <%s> where {<%s> ?x ?y}",
						ctx,
						TripleStoreHandler.ONTOLOGY_NS + identifier + ":"
								+ String.valueOf(startnumber));
				smt.execute(cmdString);
				ResultSet rs = smt.getResultSet();
				try {
					rs.next();
					if (rs.getInt(1) == 0) {
						return startnumber;
					}
				} finally {
					rs.close(); // FIX: result set was never closed
				}
				startnumber += 1;
			}
		} finally {
			smt.close(); // FIX: statement was never closed
		}
	}

	/**
	 * Creates and persists the "wikipedia" NamedEntityProvider individual in
	 * the given context.
	 */
	private OWLIndividual createDbPediaProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual provider = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
		th.write(provider, ctx);
		return provider;
	}

	/**
	 * Creates and persists the "MPIWG" NamedEntityProvider individual in the
	 * given context. (Method name keeps the historical "MPIWF" typo for
	 * compatibility.)
	 */
	private OWLIndividual createMPIWFProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual provider = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
		th.write(provider, ctx);
		return provider;
	}

	/**
	 * Command line entry point. Usage: {@code import <offset> <limit>} -
	 * imports one OFFSET/LIMIT window of DBpedia persons into the MPIWG
	 * persons graph.
	 */
	public static void main(String args[]) throws Exception {
		if (args.length < 2) {
			System.out.println("usage: import offset limit");
			System.exit(1);
		}

		Logger.getRootLogger().setLevel(Level.INFO);
		BasicConfigurator.configure();

		MetaDataHandler mh = new MetaDataHandler();

		// SECURITY: database credentials are hard-coded here; they should be
		// read from a configuration file or the environment instead.
		TripleStoreHandler th = new TripleStoreHandler(
				"jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", "dba",
				"wa55er");
		System.out.println(mh);
		System.out.println(th);

		Import imp = new Import(mh, th, args[0], args[1]);

		// Map DBpedia/FOAF source properties onto the properties used for
		// the new person individuals.
		HashMap<String, String> mapping = new HashMap<String, String>();
		mapping.put("http://xmlns.com/foaf/0.1/surname",
				"http://xmlns.com/foaf/0.1/lastName");
		mapping.put("http://xmlns.com/foaf/0.1/givenName",
				"http://xmlns.com/foaf/0.1/firstName");
		mapping.put("http://xmlns.com/foaf/0.1/name",
				"http://xmlns.com/foaf/0.1/name");

		imp.createMPIWGIdentifiers("http://dbpedia.org/ontology/Person",
				mapping, "file://personendataWikipedia",
				"file://mpiwg_persons.rdf");
	}

}