diff src/de/mpiwg/itgroup/nimanager/importer/Import.java @ 0:1384a0d382fa

first input
author dwinter
date Thu, 30 Jun 2011 11:44:24 +0200
parents
children b8333fab0d95
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/itgroup/nimanager/importer/Import.java	Thu Jun 30 11:44:24 2011 +0200
@@ -0,0 +1,469 @@
+package de.mpiwg.itgroup.nimanager.importer;
+
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.BasicConfigurator;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.openrdf.model.Resource;
+import org.openrdf.model.Statement;
+import org.openrdf.model.URI;
+import org.openrdf.model.Value;
+import org.openrdf.model.impl.LiteralImpl;
+import org.openrdf.query.BindingSet;
+import org.openrdf.query.MalformedQueryException;
+import org.openrdf.query.QueryEvaluationException;
+import org.openrdf.query.QueryLanguage;
+import org.openrdf.query.TupleQuery;
+import org.openrdf.query.TupleQueryResult;
+import org.openrdf.repository.RepositoryConnection;
+import org.openrdf.repository.RepositoryException;
+import org.openrdf.repository.RepositoryResult;
+
+import com.hp.hpl.jena.graph.impl.LiteralLabel;
+
+import de.mpiwg.itgroup.nimanager.exceptions.TripleStoreHandlerException;
+import de.mpiwg.itgroup.nimanager.owl.MetaDataHandler;
+import de.mpiwg.itgroup.nimanager.owl.TripleStoreHandler;
+import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
+import edu.stanford.smi.protegex.owl.model.OWLIndividual;
+import edu.stanford.smi.protegex.owl.model.RDFProperty;
+import edu.stanford.smi.protegex.owl.model.RDFSLiteral;
+
+public class Import {
+
+	// Used to look up equivalent classes and to generate ontology individuals.
+	private MetaDataHandler mh;
+	// Used to query the triple store and to write generated individuals to it.
+	private TripleStoreHandler th;
+	// All messages of this class go to the log4j root logger.
+	private Logger logger = Logger.getRootLogger();
+	// OFFSET and LIMIT values concatenated verbatim into the SPARQL query that
+	// selects the subjects to import.
+	private String offset;
+	private String limit;
+
+	/**
+	 * Creates an importer working on the given handlers.
+	 *
+	 * @param mh
+	 *            handler used to generate ontology individuals
+	 * @param th
+	 *            handler used to read from and write to the triple store
+	 * @param offset
+	 *            value inserted after OFFSET in the subject query
+	 * @param limit
+	 *            value inserted after LIMIT in the subject query
+	 * @throws SQLException
+	 *             declared for callers; the current body does not raise it
+	 * @throws ClassNotFoundException
+	 *             declared for callers; the current body does not raise it
+	 */
+	public Import(MetaDataHandler mh, TripleStoreHandler th, String offset,
+			String limit) throws SQLException, ClassNotFoundException {
+		// The paging window is stored as text because it is only ever
+		// concatenated into a query string.
+		this.limit = limit;
+		this.offset = offset;
+
+		this.th = th;
+		this.mh = mh;
+	}
+
+	/**
+	 * Imports every subject of the input context that is typed with
+	 * {@code clsName} or one of its equivalent classes.
+	 *
+	 * For each subject not already present in the output context this writes:
+	 * a Person individual carrying the mapped property values, an MPIWG
+	 * identifier with its creation record, a DBpedia identifier (the subject
+	 * URI) with its creation record, and a P1_is_identified_by triple linking
+	 * the original subject to the DBpedia identifier.
+	 *
+	 * The configured OFFSET/LIMIT window is applied to the query of each class
+	 * separately.
+	 *
+	 * @param clsName
+	 *            URI of the class whose instances are imported
+	 * @param mapping
+	 *            source property URI -> target property URI
+	 * @param inCtx
+	 *            graph the subjects are read from
+	 * @param outCtx
+	 *            graph the generated individuals are written to
+	 */
+	private void createMPIWGIdentifiers(String clsName,
+			HashMap<String, String> mapping, String inCtx, String outCtx)
+			throws RepositoryException, MalformedQueryException,
+			QueryEvaluationException, URISyntaxException,
+			TripleStoreHandlerException, SQLException {
+
+		// Running counters; getNewId advances them past IDs already in use.
+		int newPersonID = 0;
+		int namedEntityIdentifierID = 0;
+		int namedEntityIdentifierCreationID = 0;
+
+		// find all equivalent classes
+		List<String> classes = mh.getEquivalentClasses(clsName);
+		classes.add(clsName); // add the class name itself
+
+		OWLIndividual providerMPIWG = th.getProvider(mh,
+				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
+		if (providerMPIWG == null) {
+			providerMPIWG = createMPIWFProvider(outCtx);
+		}
+
+		OWLIndividual providerDbPedia = th.getProvider(mh,
+				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
+		if (providerDbPedia == null) {
+			providerDbPedia = createDbPediaProvider(outCtx);
+		}
+
+		for (String cl : classes) { // iterate over the classes
+			RepositoryConnection con = th.getRepository().getConnection();
+			try {
+				String queryString = "SELECT DISTINCT ?s FROM <"
+						+ inCtx
+						+ "> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <"
+						+ cl + ">  .} OFFSET " + offset + " LIMIT " + limit;
+
+				TupleQueryResult result = th.querySPARQL(queryString);
+				try {
+					while (result.hasNext()) {
+						BindingSet bindingSet = result.next();
+						Value subjValue = bindingSet.getValue("s");
+						URI subj = th.getRepository().getValueFactory()
+								.createURI(subjValue.stringValue());
+
+						// values that have to be entered for the new person
+						HashMap<String, Object> newValues = new HashMap<String, Object>();
+						for (String key : mapping.keySet()) {
+
+							RepositoryResult<Statement> namesStatements = con
+									.getStatements(subj, th.getRepository()
+											.getValueFactory().createURI(key),
+											null, false);
+							Statement firstStatement = TripleStoreHandler
+									.getFirstStatement(namesStatements);
+
+							if (firstStatement != null) {
+								Object newValue;
+								Value val = firstStatement.getObject();
+
+								if (val instanceof LiteralImpl) {
+									// a string literal is translated into a
+									// Jena string literal
+									LiteralImpl li = (LiteralImpl) val;
+
+									newValue = mh.getOwlModel()
+											.createRDFSLiteralOrString(
+													li.getLabel(),
+													li.getLanguage());
+								} else {
+									// otherwise use the string value (= URI)
+									newValue = val.stringValue();
+								}
+
+								newValues.put(mapping.get(key), newValue);
+							}
+						}
+
+						// skip subjects whose values already exist in the
+						// output context
+						Boolean ex = checkExistance(newValues, th,
+								mapping.values(), outCtx);
+						if (ex) {
+							logger.info("nothing to be done!");
+							continue;
+						}
+
+						// first create the new person
+						newPersonID = getNewId(newPersonID, "Person", outCtx);
+						logger.info("New ID choosen:"
+								+ String.valueOf(newPersonID));
+						// Remember the local name before the counter moves on,
+						// so the MPIWG identifier below names this very person
+						// and not the next free ID.
+						String personLocalName = "Person:"
+								+ String.valueOf(newPersonID);
+						OWLIndividual person = mh.generateEntity(
+								TripleStoreHandler.ONTOLOGY_NS + "Person",
+								TripleStoreHandler.ONTOLOGY_NS
+										+ personLocalName, newValues);
+						mh.printIndividual(person);
+
+						newPersonID += 1;
+						th.write(person, outCtx);
+
+						// now we create the MPIWG identifier and connect it to
+						// the person
+						HashMap<String, Object> idValues = new HashMap<String, Object>();
+						idValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "is_preferred_namedEntityIdentifier", person);
+						idValues.put(
+								"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
+								personLocalName); // TODO PID GENERATOR
+
+						namedEntityIdentifierID = getNewId(
+								namedEntityIdentifierID,
+								"NamedEntityIdentifier", outCtx);
+						OWLIndividual mpiwgIdentifier = mh.generateEntity(
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifier",
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifier:"
+										+ String.valueOf(namedEntityIdentifierID),
+								idValues);
+
+						namedEntityIdentifierID += 1;
+						th.write(mpiwgIdentifier, outCtx);
+
+						// now create the creation object and connect it to the
+						// MPIWG identifier and the provider MPIWG
+						HashMap<String, Object> creationValues = new HashMap<String, Object>();
+						creationValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "carriedOutByNamedEntityProvider",
+								providerMPIWG);
+						creationValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "created_NamedEntityIdentifier",
+								mpiwgIdentifier);
+
+						namedEntityIdentifierCreationID = getNewId(
+								namedEntityIdentifierCreationID,
+								"NamedEntityIdentifierCreation", outCtx);
+
+						OWLIndividual creation = mh.generateEntity(
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifierCreation",
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifierCreation:"
+										+ String.valueOf(namedEntityIdentifierCreationID),
+								creationValues);
+						namedEntityIdentifierCreationID += 1;
+
+						th.write(creation, outCtx);
+
+						// now create the wikipedia identifier and connect it
+						// to the person
+						HashMap<String, Object> db_idValues = new HashMap<String, Object>();
+						db_idValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "identifies_NamedEntity", person);
+
+						// identifier is the url at dbpedia
+						db_idValues
+								.put("http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
+										subjValue.stringValue());
+
+						namedEntityIdentifierID = getNewId(
+								namedEntityIdentifierID,
+								"NamedEntityIdentifier", outCtx);
+
+						OWLIndividual dbIdentifier = mh.generateEntity(
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifier",
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifier:"
+										+ String.valueOf(namedEntityIdentifierID),
+								db_idValues);
+
+						namedEntityIdentifierID += 1;
+
+						th.write(dbIdentifier, outCtx);
+
+						// now create the creation object and connect it to the
+						// dbpedia identifier and the provider dbpedia
+						HashMap<String, Object> db_creationValues = new HashMap<String, Object>();
+						db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "carriedOutByNamedEntityProvider",
+								providerDbPedia);
+						db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
+								+ "created_NamedEntityIdentifier",
+								dbIdentifier);
+
+						namedEntityIdentifierCreationID = getNewId(
+								namedEntityIdentifierCreationID,
+								"NamedEntityIdentifierCreation", outCtx);
+
+						OWLIndividual dbcreation = mh.generateEntity(
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifierCreation",
+								TripleStoreHandler.ONTOLOGY_NS
+										+ "NamedEntityIdentifierCreation:"
+										+ String.valueOf(namedEntityIdentifierCreationID),
+								db_creationValues);
+						namedEntityIdentifierCreationID += 1;
+
+						th.write(dbcreation, outCtx);
+
+						// link the original dbpedia subject to its identifier
+						// in the triple store
+						th.write(subj.stringValue(),
+								"http://erlangen-crm.org/110404/P1_is_identified_by",
+								dbIdentifier.getURI(), outCtx);
+					}
+				} finally {
+					// release the query result even when an import step fails
+					result.close();
+				}
+			} finally {
+				// one connection is opened per class; give it back each time
+				con.close();
+			}
+		}
+
+	}
+
+	/**
+	 * Checks whether the output context already contains a resource carrying
+	 * all of the given property values.
+	 *
+	 * Values may be {@link RDFSLiteral}s (matched with their language tag when
+	 * one is set) or plain strings (matched as untagged literals); entries of
+	 * any other type and missing entries are ignored.
+	 *
+	 * NOTE(review): when no property contributes a pattern the query has an
+	 * empty where-clause; what the store answers in that case is not visible
+	 * here - confirm the intended result.
+	 *
+	 * @param newValues
+	 *            target property URI -> value of the candidate entity
+	 * @param th2
+	 *            not used; the query runs on the handler held by this instance
+	 * @param props
+	 *            property URIs to compare
+	 * @param outCtx
+	 *            graph to search in
+	 * @return true if the query yields at least one result, false if it yields
+	 *         none or cannot be executed
+	 */
+	private Boolean checkExistance(HashMap<String, Object> newValues,
+			TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException {
+
+		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
+		for (String propString : props) {
+			Object raw = newValues.get(propString);
+			if (raw == null)
+				continue;
+
+			String str;
+			String lang = null;
+			if (raw instanceof RDFSLiteral) {
+				RDFSLiteral val = (RDFSLiteral) raw;
+				str = val.getString();
+				lang = val.getLanguage();
+			} else if (raw instanceof String) {
+				// the caller stores plain strings for values without a
+				// language-tagged literal
+				str = (String) raw;
+			} else {
+				continue;
+			}
+			if (str == null)
+				continue;
+
+			// escape the backslash first so the escapes added afterwards are
+			// not escaped again
+			str = str.replace("\\", "\\\\").replace("\"", "\\\"")
+					.replace("\n", "\\n").replace("\r", "\\r");
+
+			queryString += "?x <" + propString + "> \"" + str + "\"";
+			if (lang != null && lang.length() > 0) {
+				queryString += "@" + lang;
+			}
+			queryString += ".";
+		}
+
+		queryString += " }";
+		TupleQueryResult result;
+		try {
+			result = th.querySPARQL(queryString);
+		} catch (MalformedQueryException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} catch (QueryEvaluationException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} catch (TripleStoreHandlerException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		}
+		try {
+			return result.hasNext();
+		} catch (QueryEvaluationException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} finally {
+			try {
+				result.close();
+			} catch (QueryEvaluationException e) {
+				logger.warn("Could not close query result", e);
+			}
+		}
+
+	}
+	/**
+	 * Finds the first number, counting up from {@code startnumber}, for which
+	 * the resource {@code ONTOLOGY_NS + identifier + ":" + number} has no
+	 * triples in the given context.
+	 *
+	 * @param startnumber
+	 *            first candidate number
+	 * @param identifier
+	 *            local class name forming the URI prefix, e.g. "Person"
+	 * @param ctx
+	 *            graph in which the candidate URIs are looked up
+	 * @return the first unused number that is &gt;= startnumber
+	 * @throws SQLException
+	 *             if the lookup fails or returns no result set
+	 */
+	private int getNewId(int startnumber, String identifier, String ctx)
+			throws SQLException {
+
+		java.sql.Statement smt = th.sqlCon.createStatement();
+		try {
+			while (true) {
+				// probe the URI of the requested kind of entity, not always
+				// the Person URI
+				String cmdString = String.format(
+						"sparql select count(*) from <%s> where {<%s> ?x ?y}",
+						ctx,
+						TripleStoreHandler.ONTOLOGY_NS + identifier + ":"
+								+ String.valueOf(startnumber));
+				smt.execute(cmdString);
+				ResultSet rs = smt.getResultSet();
+				if (rs == null) {
+					throw new SQLException("No result set returned for: "
+							+ cmdString);
+				}
+				int count;
+				try {
+					count = rs.next() ? rs.getInt(1) : 0;
+				} finally {
+					rs.close();
+				}
+				if (count == 0) {
+					return startnumber;
+				}
+				startnumber += 1;
+			}
+		} finally {
+			smt.close();
+		}
+	}
+
+	/**
+	 * Checks whether the output context already contains a resource carrying
+	 * the same values as {@code person} for all of the given properties.
+	 *
+	 * Property values may be {@link RDFSLiteral}s (matched with their language
+	 * tag when one is set) or plain strings (matched as untagged literals);
+	 * unknown properties, missing values and values of any other type are
+	 * ignored.
+	 *
+	 * @param person
+	 *            individual whose property values are compared
+	 * @param th2
+	 *            not used; the query runs on the handler held by this instance
+	 * @param props
+	 *            property URIs to compare
+	 * @param outCtx
+	 *            graph to search in
+	 * @return true if the query yields at least one result, false if it yields
+	 *         none or cannot be executed
+	 */
+	private Boolean checkExistance(OWLIndividual person,
+			TripleStoreHandler th2, Collection<String> props, String outCtx)
+			throws RepositoryException {
+		JenaOWLModel model = mh.getOwlModel();
+
+		String queryString = "select ?x FROM <" + outCtx + "> " + "where {";
+		for (String propString : props) {
+			RDFProperty rdfProp = model.getRDFProperty(propString);
+			if (rdfProp == null)
+				continue;
+			Object raw = person.getPropertyValue(rdfProp);
+			if (raw == null)
+				continue;
+
+			String str;
+			String lang = null;
+			if (raw instanceof RDFSLiteral) {
+				RDFSLiteral val = (RDFSLiteral) raw;
+				str = val.getString();
+				lang = val.getLanguage();
+			} else if (raw instanceof String) {
+				str = (String) raw;
+			} else {
+				continue;
+			}
+			if (str == null)
+				continue;
+
+			// escape the backslash first so the escapes added afterwards are
+			// not escaped again
+			str = str.replace("\\", "\\\\").replace("\"", "\\\"")
+					.replace("\n", "\\n").replace("\r", "\\r");
+
+			queryString += "?x <" + propString + "> \"" + str + "\"";
+			if (lang != null && lang.length() > 0) {
+				queryString += "@" + lang;
+			}
+			queryString += ".";
+		}
+
+		queryString += " }";
+		TupleQueryResult result;
+		try {
+			result = th.querySPARQL(queryString);
+		} catch (MalformedQueryException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} catch (QueryEvaluationException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} catch (TripleStoreHandlerException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		}
+		try {
+			return result.hasNext();
+		} catch (QueryEvaluationException e) {
+			logger.error("Query String cannot be handled:" + queryString, e);
+			return false;
+		} finally {
+			try {
+				result.close();
+			} catch (QueryEvaluationException e) {
+				logger.warn("Could not close query result", e);
+			}
+		}
+
+	}
+
+	/**
+	 * Generates the NamedEntityProvider individual "provider_wikipedia" and
+	 * writes it to the given context.
+	 *
+	 * @param ctx
+	 *            graph the provider is written to
+	 * @return the generated provider individual
+	 */
+	private OWLIndividual createDbPediaProvider(String ctx)
+			throws RepositoryException, TripleStoreHandlerException {
+		OWLIndividual provider = mh.generateEntity(
+				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
+				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
+
+		th.write(provider, ctx);
+
+		return provider;
+
+	}
+
+	/**
+	 * Generates the NamedEntityProvider individual "provider_MPIWG" and writes
+	 * it to the given context.
+	 *
+	 * @param ctx
+	 *            graph the provider is written to
+	 * @return the generated provider individual
+	 */
+	private OWLIndividual createMPIWFProvider(String ctx)
+			throws RepositoryException, TripleStoreHandlerException {
+		OWLIndividual provider = mh.generateEntity(
+				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
+				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
+
+		th.write(provider, ctx);
+
+		return provider;
+
+	}
+
+	/**
+	 * Command line entry point: imports the dbpedia persons of the graph
+	 * file://personendataWikipedia into file://mpiwg_persons.rdf.
+	 *
+	 * Usage: {@code import offset limit}. Both arguments must consist of
+	 * decimal digits only, because they are inserted verbatim into the SPARQL
+	 * query.
+	 *
+	 * The triple store connection can be overridden with the system properties
+	 * {@code nimanager.jdbc.url}, {@code nimanager.jdbc.user} and
+	 * {@code nimanager.jdbc.password}; without them the previous built-in
+	 * values are used.
+	 *
+	 * NOTE(review): the built-in credentials should be removed from the source
+	 * and rotated once all callers pass the properties.
+	 */
+	public static void main(String args[]) throws Exception {
+		// Reject anything but plain digits before any side effect happens.
+		if (args.length < 2 || !args[0].matches("[0-9]+")
+				|| !args[1].matches("[0-9]+")) {
+			System.out.println("usage: import offset limit");
+			System.exit(1);
+		}
+
+		Logger.getRootLogger().setLevel(Level.INFO);
+		BasicConfigurator.configure();
+
+		MetaDataHandler mh = new MetaDataHandler();
+
+		String jdbcUrl = System.getProperty("nimanager.jdbc.url",
+				"jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111");
+		String jdbcUser = System.getProperty("nimanager.jdbc.user", "dba");
+		String jdbcPassword = System.getProperty("nimanager.jdbc.password",
+				"wa55er");
+
+		TripleStoreHandler th = new TripleStoreHandler(jdbcUrl, jdbcUser,
+				jdbcPassword);
+		System.out.println(mh);
+		System.out.println(th);
+
+		Import imp = new Import(mh, th, args[0], args[1]);
+
+		// source property (dbpedia data) -> target property (generated person)
+		HashMap<String, String> mapping = new HashMap<String, String>();
+		mapping.put("http://xmlns.com/foaf/0.1/surname",
+				"http://xmlns.com/foaf/0.1/lastName");
+		mapping.put("http://xmlns.com/foaf/0.1/givenName",
+				"http://xmlns.com/foaf/0.1/firstName");
+		mapping.put("http://xmlns.com/foaf/0.1/name",
+				"http://xmlns.com/foaf/0.1/name");
+
+		imp.createMPIWGIdentifiers("http://dbpedia.org/ontology/Person",
+				mapping, "file://personendataWikipedia",
+				"file://mpiwg_persons.rdf");
+
+	}
+
+}