view src/de/mpiwg/itgroup/nimanager/importer/ImportGND.java @ 4:f986e74583eb

removed triplestorehandler components
author dwinter
date Tue, 13 Dec 2011 17:46:51 +0100
parents e3ecb88314a5
children
line wrap: on
line source

package de.mpiwg.itgroup.nimanager.importer;

import java.awt.dnd.DnDConstants;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.query.Binding;
import org.openrdf.query.BindingSet;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.Query;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import com.hp.hpl.jena.graph.impl.LiteralLabel;

import de.mpiwg.itgroup.triplestoremanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.triplestoremanager.owl.MetaDataHandler;
import de.mpiwg.itgroup.triplestoremanager.owl.TripleStoreHandler;



import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
import edu.stanford.smi.protegex.owl.model.OWLIndividual;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFSLiteral;

public class ImportGND {

	/** Handler used to generate and print OWL individuals and to access the OWL model. */
	private MetaDataHandler mh;
	/** Handler used for SPARQL queries, repository access and writing to the triple store. */
	private TripleStoreHandler th;
	/** Root log4j logger used for all messages of this importer. */
	private Logger logger = Logger.getRootLogger();
	// private Connection con;
	/** Value inserted verbatim after OFFSET in the query that selects the GND subjects. */
	private String offset;
	/** Value inserted verbatim after LIMIT in the query that selects the GND subjects. */
	private String limit;
	/**
	 * Writer for /tmp/missingDBPedia.txt; receives the GND subjects that have
	 * a DBpedia link for which no person could be found.
	 */
	private FileWriter dbpediaMissing;

	/** Additional context that is searched (together with the output context) when picking free IDs. */
	private String mpiwgPerson = "file://mpiwg_persons_2.rdf";

	/**
	 * Creates an importer working on the given handlers.
	 *
	 * @param mh
	 *            handler used to generate OWL individuals
	 * @param th
	 *            handler used to query and write the triple store
	 * @param offset
	 *            value used for the OFFSET of the subject query
	 * @param limit
	 *            value used for the LIMIT of the subject query
	 * @throws SQLException
	 *             declared for callers; not thrown by this constructor itself
	 * @throws ClassNotFoundException
	 *             declared for callers; not thrown by this constructor itself
	 */
	public ImportGND(MetaDataHandler mh, TripleStoreHandler th, String offset,
			String limit) throws SQLException, ClassNotFoundException {
		// paging window of the subject query
		this.offset = offset;
		this.limit = limit;
		// collaborators
		this.th = th;
		this.mh = mh;
	}

	/**
	 * Convenience overload that runs the import with simple property mappings
	 * only, i.e. with an empty complex mapping.
	 *
	 * @param predicate
	 *            predicate that marks a subject as GND entry
	 * @param mapping
	 *            GND property URI to target property URI
	 * @param inCtx
	 *            context the GND data is read from
	 * @param outCtx
	 *            context the new individuals are written to
	 */
	private void createMPIWGFromGNDIdentifiers(String predicate,
			HashMap<String, String> mapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException, IOException {
		HashMap<String, String> noComplexMapping = new HashMap<String, String>();
		createMPIWGFromGNDIdentifiers(predicate, mapping, noComplexMapping,
				inCtx, outCtx);
	}

	/**
	 * Creates person records (plus identifier and identifier-creation
	 * individuals) for the GND entries found in the input context.
	 * <p>
	 * The predicate is used to identify GND entries: they carry no rdf:type,
	 * so every subject that has the given predicate in {@code inCtx} is
	 * processed. For each subject:
	 * <ul>
	 * <li>if it has a DBpedia link and a person is already attached to that
	 * link, only a GND identifier is added to that person;</li>
	 * <li>otherwise the mapped values are collected, and unless an individual
	 * with the same values already exists in {@code outCtx}, a new person with
	 * an MPIWG identifier and a GND identifier is written.</li>
	 * </ul>
	 * Subjects with a DBpedia link but without a known person are logged, one
	 * per line, to /tmp/missingDBPedia.txt. All repository resources and the
	 * log file are closed when the method returns or fails.
	 *
	 * @param predicate
	 *            predicate that marks a subject as GND entry
	 * @param mapping
	 *            GND property URI to target property URI; the first literal
	 *            value found is used
	 * @param complexMapping
	 *            SPARQL graph pattern (with one {@code %s} for the subject URI
	 *            and {@code ?o} as result variable) to target property URI
	 * @param inCtx
	 *            context the GND data is read from
	 * @param outCtx
	 *            context the new individuals are written to
	 */
	private void createMPIWGFromGNDIdentifiers(String predicate,
			HashMap<String, String> mapping,
			HashMap<String, String> complexMapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException, IOException {

		dbpediaMissing = new FileWriter("/tmp/missingDBPedia.txt");
		RepositoryConnection con = null;
		TupleQueryResult result = null;
		try {
			int newPersonID = 0;
			int namedEntityIdentifierID = 0;
			int namedEntityIdentifierCreationID = 0;
			String[] idContexts = new String[] { outCtx, mpiwgPerson };

			OWLIndividual providerMPIWG = th.getProvider(mh,
					TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
			if (providerMPIWG == null) {
				providerMPIWG = createMPIWFProvider(outCtx);
			}

			// The wikipedia provider is not referenced below; the lookup is
			// kept so that the provider individual is created when missing.
			if (th.getProvider(mh, TripleStoreHandler.ONTOLOGY_NS
					+ "provider_wikipedia") == null) {
				createDbPediaProvider(outCtx);
			}

			OWLIndividual providerDNB = th.getProvider(mh,
					TripleStoreHandler.ONTOLOGY_NS + "provider_DNB");
			if (providerDNB == null) {
				providerDNB = createDNBProvider(outCtx);
			}

			con = th.getRepository().getConnection();
			// find all subjects carrying the predicate
			String queryString = "SELECT DISTINCT ?s FROM <" + inCtx
					+ "> WHERE {?s <" + predicate + ">" + "?o  .} OFFSET "
					+ offset + " LIMIT " + limit;
			result = th.querySPARQL(queryString);

			while (result.hasNext()) {
				BindingSet bindingSet = result.next();
				Value subjValue = bindingSet.getValue("s");
				URI gndPerson = th.getRepository().getValueFactory()
						.createURI(subjValue.stringValue());

				// If a DBpedia link exists and a person is already attached
				// to it, only the GND identifier has to be added.
				URI dbpedia = getDBPediaLink(gndPerson, inCtx);
				if (dbpedia != null) {
					URI existingPerson = getPersonFromDBPedia(dbpedia);
					if (existingPerson != null) {
						OWLIndividual personInd = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS + "Person",
								existingPerson.toString());
						addGNDToMPIWGIdentifier(gndPerson, personInd,
								namedEntityIdentifierID, outCtx, providerDNB,
								namedEntityIdentifierCreationID);
						continue;
					}
					// one subject per line, otherwise the entries run together
					dbpediaMissing.write(subjValue.stringValue() + "\n");
				}

				// No person is related to this GND entry yet: collect the
				// values that have to be written for the new person.
				HashMap<String, Object> newValues = new HashMap<String, Object>();
				readMappedLiterals(con, gndPerson, mapping, newValues);
				readComplexValues(gndPerson, complexMapping, inCtx, newValues);

				if (checkExistance(newValues, th, mapping.values(), outCtx)) {
					logger.info("nothing to be done!");
					continue;
				}

				// first create the new person
				newPersonID = getNewId(newPersonID, "Person", idContexts);
				logger.info("New ID choosen:" + String.valueOf(newPersonID));
				// Remember the local name before the counter is advanced so
				// that the identifier string matches the person URI.
				String personLocalName = "Person:"
						+ String.valueOf(newPersonID);
				OWLIndividual person = mh.generateEntity(
						TripleStoreHandler.ONTOLOGY_NS + "Person",
						TripleStoreHandler.ONTOLOGY_NS + personLocalName,
						newValues);
				mh.printIndividual(person);
				newPersonID += 1;
				th.write(person, outCtx);

				// now create the MPIWG identifier and connect it to the
				// person
				HashMap<String, Object> idValues = new HashMap<String, Object>();
				idValues.put(TripleStoreHandler.ONTOLOGY_NS
						+ "is_preferred_namedEntityIdentifier", person);
				idValues.put(
						"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
						personLocalName); // TODO PID GENERATOR

				namedEntityIdentifierID = getNewId(namedEntityIdentifierID,
						"NamedEntityIdentifier", idContexts);
				OWLIndividual mpiwgIdentifier = mh.generateEntity(
						TripleStoreHandler.ONTOLOGY_NS
								+ "NamedEntityIdentifier",
						TripleStoreHandler.ONTOLOGY_NS
								+ "NamedEntityIdentifier:"
								+ String.valueOf(namedEntityIdentifierID),
						idValues);
				namedEntityIdentifierID += 1;
				th.write(mpiwgIdentifier, outCtx);

				// now create the creation object and connect it to the MPIWG
				// identifier and the provider MPIWG
				HashMap<String, Object> creationValues = new HashMap<String, Object>();
				creationValues.put(TripleStoreHandler.ONTOLOGY_NS
						+ "carriedOutByNamedEntityProvider", providerMPIWG);
				creationValues.put(TripleStoreHandler.ONTOLOGY_NS
						+ "created_NamedEntityIdentifier", mpiwgIdentifier);

				namedEntityIdentifierCreationID = getNewId(
						namedEntityIdentifierCreationID,
						"NamedEntityIdentifierCreation", idContexts);
				OWLIndividual creation = mh.generateEntity(
						TripleStoreHandler.ONTOLOGY_NS
								+ "NamedEntityIdentifierCreation",
						TripleStoreHandler.ONTOLOGY_NS
								+ "NamedEntityIdentifierCreation:"
								+ String.valueOf(namedEntityIdentifierCreationID),
						creationValues);
				namedEntityIdentifierCreationID += 1;
				th.write(creation, outCtx);

				addGNDToMPIWGIdentifier(gndPerson, person,
						namedEntityIdentifierID, outCtx, providerDNB,
						namedEntityIdentifierCreationID);
			}
		} finally {
			// Close everything even if one of the close calls fails.
			try {
				if (result != null) {
					result.close();
				}
			} finally {
				try {
					if (con != null) {
						con.close();
					}
				} finally {
					dbpediaMissing.close();
				}
			}
		}
	}

	/**
	 * For every entry of the simple mapping, stores the first literal value
	 * of the GND property under the mapped target property. Non-literal
	 * values are ignored.
	 */
	private void readMappedLiterals(RepositoryConnection con, URI gndPerson,
			HashMap<String, String> mapping, HashMap<String, Object> newValues)
			throws RepositoryException {
		for (Map.Entry<String, String> entry : mapping.entrySet()) {
			URI property = th.getRepository().getValueFactory()
					.createURI(entry.getKey());
			RepositoryResult<Statement> statements = con.getStatements(
					gndPerson, property, null, false);
			try {
				while (statements.hasNext()) {
					Value val = statements.next().getObject();
					if (val instanceof LiteralImpl) {
						// translate the literal into a literal (or plain
						// string) of the OWL model; only the first one is used
						LiteralImpl li = (LiteralImpl) val;
						newValues.put(entry.getValue(), mh.getOwlModel()
								.createRDFSLiteralOrString(li.getLabel(),
										li.getLanguage()));
						break;
					}
				}
			} finally {
				statements.close();
			}
		}
	}

	/**
	 * For every entry of the complex mapping, runs the pattern for the given
	 * subject and stores the first result under the mapped target property:
	 * literals are translated into the OWL model, everything else is stored
	 * as its string value (the URI).
	 */
	private void readComplexValues(URI gndPerson,
			HashMap<String, String> complexMapping, String inCtx,
			HashMap<String, Object> newValues) throws MalformedQueryException,
			QueryEvaluationException, TripleStoreHandlerException {
		for (Map.Entry<String, String> entry : complexMapping.entrySet()) {
			String cmd = "select ?o from <" + inCtx + "> where {"
					+ String.format(entry.getKey(), gndPerson.stringValue())
					+ "}";
			TupleQueryResult results = th.querySPARQL(cmd);
			try {
				if (!results.hasNext()) {
					continue;
				}
				// only the first result is used
				Value val = results.next().getValue("o");
				if (val == null) {
					continue;
				}
				Object newValue;
				if (val instanceof LiteralImpl) {
					LiteralImpl li = (LiteralImpl) val;
					newValue = mh.getOwlModel().createRDFSLiteralOrString(
							li.getLabel(), li.getLanguage());
				} else {
					newValue = val.stringValue();
				}
				newValues.put(entry.getValue(), newValue);
			} finally {
				results.close();
			}
		}
	}

	/**
	 * Attaches a GND identifier to the given person.
	 * <p>
	 * Writes, in this order, to {@code outCtx}: a NamedEntityIdentifier whose
	 * string value is the GND URI, a NamedEntityIdentifierCreation linking
	 * that identifier to the given provider, and a P1_is_identified_by triple
	 * from the GND URI to the new identifier. The two ID arguments are only
	 * start values for the search for a free ID; they are passed by value, so
	 * the caller's counters are not changed.
	 *
	 * @param gndPerson
	 *            URI of the GND entry
	 * @param person
	 *            person the identifier belongs to
	 * @param namedEntityIdentifierID
	 *            first candidate number for the identifier
	 * @param outCtx
	 *            context to write to
	 * @param gndProvider
	 *            provider that is recorded as creator of the identifier
	 * @param namedEntityIdentifierCreationID
	 *            first candidate number for the creation object
	 */
	private void addGNDToMPIWGIdentifier(URI gndPerson, OWLIndividual person,
			int namedEntityIdentifierID, String outCtx, Object gndProvider,
			int namedEntityIdentifierCreationID) throws SQLException,
			RepositoryException, TripleStoreHandlerException {
		final String ns = TripleStoreHandler.ONTOLOGY_NS;
		final String gndUri = gndPerson.stringValue();

		// the identifier itself: its value is the URL at the GND
		HashMap<String, Object> identifierProps = new HashMap<String, Object>();
		identifierProps.put(ns + "identifies_NamedEntity", person);
		identifierProps.put(
				"http://erlangen-crm.org/plus/xdt/110404/has_XSD_String",
				gndUri);

		int identifierNo = getNewId(namedEntityIdentifierID,
				"NamedEntityIdentifier", new String[] { outCtx, mpiwgPerson });
		OWLIndividual identifier = mh.generateEntity(ns
				+ "NamedEntityIdentifier", ns + "NamedEntityIdentifier:"
				+ String.valueOf(identifierNo), identifierProps);
		th.write(identifier, outCtx);

		// the creation object connects the identifier with its provider
		HashMap<String, Object> creationProps = new HashMap<String, Object>();
		creationProps.put(ns + "carriedOutByNamedEntityProvider", gndProvider);
		creationProps.put(ns + "created_NamedEntityIdentifier", identifier);

		int creationNo = getNewId(namedEntityIdentifierCreationID,
				"NamedEntityIdentifierCreation", new String[] { outCtx,
						mpiwgPerson });
		OWLIndividual creation = mh.generateEntity(ns
				+ "NamedEntityIdentifierCreation", ns
				+ "NamedEntityIdentifierCreation:"
				+ String.valueOf(creationNo), creationProps);
		th.write(creation, outCtx);

		// finally link the GND entry to its identifier in the triple store
		th.write(gndUri, "http://erlangen-crm.org/110404/P1_is_identified_by",
				identifier.getURI(), outCtx);
	}

	/**
	 * Looks up the person that is identified by the given DBpedia resource.
	 * <p>
	 * The lookup runs against the context file://mpiwg_persons.rdf and
	 * follows P1_is_identified_by from the DBpedia resource to an identifier
	 * and identifies_NamedEntity from there to the person. Only the first
	 * result is used.
	 *
	 * @param dbpedia
	 *            DBpedia resource URI
	 * @return the person URI, or {@code null} if there is no result, the
	 *         result is not a URI, or the query failed (failures are logged)
	 */
	private URI getPersonFromDBPedia(URI dbpedia) {
		String query = "select distinct ?x  ?y ?person "
				+ "from <file://mpiwg_persons.rdf> "
				+ "where { ?y <http://ontologies.mpiwg-berlin.mpg.de/authorities/namedIdentities#identifies_NamedEntity> ?person."
				+ "<" + dbpedia.stringValue()
				+ "> <http://erlangen-crm.org/110404/P1_is_identified_by> ?y.}";

		TupleQueryResult results = null;
		try {
			results = th.querySPARQL(query);
			if (results.hasNext()) {
				Value person = results.next().getValue("person");
				// guard against unbound or non-URI bindings instead of
				// failing with a NullPointer-/ClassCastException
				if (person instanceof URI) {
					return (URI) person;
				}
			}
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + query, e);
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + query, e);
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + query, e);
		} finally {
			if (results != null) {
				try {
					results.close();
				} catch (QueryEvaluationException e) {
					logger.warn("Could not close query result", e);
				}
			}
		}
		return null;
	}

	/**
	 * Returns the DBpedia resource the given subject is linked to via
	 * owl:sameAs in the given context.
	 *
	 * @param subj
	 *            subject to inspect
	 * @param inCtx
	 *            context to read the statements from
	 * @return the first owl:sameAs object whose namespace is
	 *         http://dbpedia.org/resource/, or {@code null} if there is none
	 *         or the repository could not be read (the failure is logged)
	 */
	private URI getDBPediaLink(URI subj, String inCtx) {
		RepositoryResult<Statement> statements = null;
		try {
			statements = th.getStatements(subj,
					th.createUri("http://www.w3.org/2002/07/owl#sameAs"), null,
					inCtx);
			while (statements.hasNext()) {
				Value obj = statements.next().getObject();
				// owl:sameAs may point to something that is not a URI;
				// skip such objects instead of failing on the cast
				if (obj instanceof URI
						&& "http://dbpedia.org/resource/".equals(((URI) obj)
								.getNamespace())) {
					return (URI) obj;
				}
			}
		} catch (RepositoryException e) {
			logger.error("Could not read owl:sameAs statements of " + subj, e);
		} finally {
			if (statements != null) {
				try {
					statements.close();
				} catch (RepositoryException e) {
					logger.warn("Could not close statement iterator", e);
				}
			}
		}
		return null;
	}

	/**
	 * Checks whether the output context already contains a subject that
	 * carries all given property values.
	 * <p>
	 * For every property in {@code props} that has a value in
	 * {@code newValues} one triple pattern is added to the query; properties
	 * without a value are skipped. Literals with a language are matched with
	 * their language tag, everything else as plain literal.
	 *
	 * @param newValues
	 *            property URI to value (RDFSLiteral or String)
	 * @param th2
	 *            not used; the query is run on the handler of this importer
	 * @param props
	 *            property URIs to compare
	 * @param outCtx
	 *            context to search in
	 * @return {@code true} if a matching subject exists; {@code false} if
	 *         not, if no property has a value, or if the query failed (the
	 *         failure is logged)
	 */
	private Boolean checkExistance(HashMap<String, Object> newValues,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {

		StringBuilder pattern = new StringBuilder();
		for (String propString : props) {
			Object valObj = newValues.get(propString);
			if (valObj == null) {
				// no value collected for this property
				continue;
			}
			String str;
			String lang = null;
			if (valObj instanceof RDFSLiteral) {
				RDFSLiteral val = (RDFSLiteral) valObj;
				lang = val.getLanguage();
				str = val.getString();
			} else {
				str = valObj.toString();
			}
			// escape for use inside a double-quoted SPARQL literal;
			// backslashes first so that the added ones are not doubled
			str = str.replace("\\", "\\\\").replace("\"", "\\\"")
					.replace("\n", "\\n").replace("\r", "\\r");
			pattern.append("?x <").append(propString).append("> \"")
					.append(str).append("\"");
			if (lang != null && lang.length() > 0) {
				pattern.append("@").append(lang);
			}
			pattern.append(".");
		}

		if (pattern.length() == 0) {
			// an empty pattern would not constrain ?x at all
			return false;
		}

		String queryString = "select ?x FROM <" + outCtx + "> " + "where {"
				+ pattern + " }";

		TupleQueryResult result = null;
		try {
			result = th.querySPARQL(queryString);
			return result.hasNext();
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} finally {
			if (result != null) {
				try {
					result.close();
				} catch (QueryEvaluationException e) {
					logger.warn("Could not close query result", e);
				}
			}
		}

	}

	/**
	 * Finds the first number, starting at {@code startnumber}, for which the
	 * resource ONTOLOGY_NS + identifier + ":" + number is not yet used as a
	 * subject in any of the given contexts.
	 * <p>
	 * One count query per candidate is sent over the SQL connection of the
	 * triple store handler.
	 *
	 * @param startnumber
	 *            first candidate
	 * @param identifier
	 *            local class name, e.g. "Person"
	 * @param ctx
	 *            contexts to search
	 * @return the first unused number
	 * @throws SQLException
	 *             if a query fails or returns no row
	 */
	private int getNewId(int startnumber, String identifier, String[] ctx)
			throws SQLException {

		// the FROM clauses are the same for every candidate
		StringBuilder fromClauses = new StringBuilder();
		for (String context : ctx) {
			fromClauses.append(String.format(" from <%s> ", context));
		}
		String fromString = fromClauses.toString();

		java.sql.Statement smt = th.sqlCon.createStatement();
		try {
			while (true) {
				String cmdString = String.format(
						"sparql select count(*) %s where {<%s> ?x ?y}",
						fromString,
						TripleStoreHandler.ONTOLOGY_NS + identifier + ":"
								+ String.valueOf(startnumber));
				smt.execute(cmdString);
				ResultSet rs = smt.getResultSet();
				try {
					if (rs == null || !rs.next()) {
						throw new SQLException("No count returned for: "
								+ cmdString);
					}
					if (rs.getInt(1) == 0) {
						return startnumber;
					}
				} finally {
					if (rs != null) {
						rs.close();
					}
				}
				// candidate is taken, try the next one
				startnumber += 1;
			}
		} finally {
			smt.close();
		}
	}

	/**
	 * Checks whether the output context already contains a subject that
	 * carries the same values as the given individual for all given
	 * properties.
	 * <p>
	 * Properties that are unknown to the OWL model or have no value on the
	 * individual are skipped. Literals with a language are matched with their
	 * language tag, everything else as plain literal.
	 *
	 * @param person
	 *            individual whose values are compared
	 * @param th2
	 *            not used; the query is run on the handler of this importer
	 * @param props
	 *            property URIs to compare
	 * @param outCtx
	 *            context to search in
	 * @return {@code true} if a matching subject exists; {@code false} if
	 *         not, if no property has a value, or if the query failed (the
	 *         failure is logged)
	 */
	private Boolean checkExistance(OWLIndividual person,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {
		JenaOWLModel model = mh.getOwlModel();

		StringBuilder pattern = new StringBuilder();
		for (String propString : props) {
			RDFProperty rdfProp = model.getRDFProperty(propString);
			if (rdfProp == null) {
				continue;
			}
			Object valObj = person.getPropertyValue(rdfProp);
			if (valObj == null) {
				continue;
			}
			String str;
			String lang = null;
			// the value may be a plain String as well as an RDFSLiteral
			if (valObj instanceof RDFSLiteral) {
				RDFSLiteral val = (RDFSLiteral) valObj;
				lang = val.getLanguage();
				str = val.getString();
			} else {
				str = valObj.toString();
			}
			// escape for use inside a double-quoted SPARQL literal;
			// backslashes first so that the added ones are not doubled
			str = str.replace("\\", "\\\\").replace("\"", "\\\"")
					.replace("\n", "\\n").replace("\r", "\\r");
			pattern.append("?x <").append(propString).append("> \"")
					.append(str).append("\"");
			if (lang != null && lang.length() > 0) {
				pattern.append("@").append(lang);
			}
			pattern.append(".");
		}

		if (pattern.length() == 0) {
			// an empty pattern would not constrain ?x at all
			return false;
		}

		String queryString = "select ?x FROM <" + outCtx + "> " + "where {"
				+ pattern + " }";

		TupleQueryResult result = null;
		try {
			result = th.querySPARQL(queryString);
			return result.hasNext();
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} finally {
			if (result != null) {
				try {
					result.close();
				} catch (QueryEvaluationException e) {
					logger.warn("Could not close query result", e);
				}
			}
		}

	}

	/**
	 * Generates the NamedEntityProvider individual "provider_wikipedia" and
	 * writes it to the given context.
	 *
	 * @param ctx
	 *            context to write to
	 * @return the generated provider individual
	 */
	private OWLIndividual createDbPediaProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		final String ns = TripleStoreHandler.ONTOLOGY_NS;
		OWLIndividual provider = mh.generateEntity(ns + "NamedEntityProvider",
				ns + "provider_wikipedia");
		th.write(provider, ctx);
		return provider;
	}

	/**
	 * Generates the NamedEntityProvider individual "provider_DNB" and writes
	 * it to the given context.
	 *
	 * @param ctx
	 *            context to write to
	 * @return the generated provider individual
	 */
	private OWLIndividual createDNBProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		final String ns = TripleStoreHandler.ONTOLOGY_NS;
		OWLIndividual provider = mh.generateEntity(ns + "NamedEntityProvider",
				ns + "provider_DNB");
		th.write(provider, ctx);
		return provider;
	}

	/**
	 * Generates the NamedEntityProvider individual "provider_MPIWG" and
	 * writes it to the given context.
	 *
	 * @param ctx
	 *            context to write to
	 * @return the generated provider individual
	 */
	private OWLIndividual createMPIWFProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		final String ns = TripleStoreHandler.ONTOLOGY_NS;
		OWLIndividual provider = mh.generateEntity(ns + "NamedEntityProvider",
				ns + "provider_MPIWG");
		th.write(provider, ctx);
		return provider;
	}

	/**
	 * Command line entry point: imports GND persons from the context
	 * file:///GND.rdf into the context file://mpiwg_persons_dnb.rdf.
	 * <p>
	 * Expected arguments: user, password, offset, limit. User and password
	 * are passed to the triple store handler; offset and limit select the
	 * window of GND subjects to process.
	 *
	 * @param args
	 *            command line arguments, at least four
	 */
	public static void main(String args[]) throws Exception {
		if (args.length < 4) {
			System.out.println("usage: import user pw offset limit ");
			System.exit(1);
		}

		Logger.getRootLogger().setLevel(Level.INFO);
		BasicConfigurator.configure();

		MetaDataHandler metaData = new MetaDataHandler();
		TripleStoreHandler tripleStore = new TripleStoreHandler(
				"jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111", args[0],
				args[1]);
		System.out.println(metaData);
		System.out.println(tripleStore);

		ImportGND importer = new ImportGND(metaData, tripleStore, args[2],
				args[3]);

		HashMap<String, String> simpleMapping = new HashMap<String, String>();
		HashMap<String, String> nameQueries = new HashMap<String, String>();

		// Surname and forename hang on the node that is the preferred name of
		// the person, so they are fetched with a two-step pattern; %s is
		// replaced by the URI of the GND entry and ?o is the value, e.g.
		// ?p <http://d-nb.info/gnd/foreName> ?o.
		// <http://d-nb.info/gnd/100004776>
		// <http://d-nb.info/gnd/preferredNameForThePerson> ?p
		final String viaPreferredName = "<%s> <http://d-nb.info/gnd/preferredNameForThePerson> ?p ";

		nameQueries.put("?p <http://d-nb.info/gnd/surname> ?o."
				+ viaPreferredName, "http://xmlns.com/foaf/0.1/lastName");
		nameQueries.put("?p <http://d-nb.info/gnd/foreName> ?o."
				+ viaPreferredName, "http://xmlns.com/foaf/0.1/firstName");
		simpleMapping.put("http://d-nb.info/gnd/preferredNameForThePerson",
				"http://xmlns.com/foaf/0.1/name");

		importer.createMPIWGFromGNDIdentifiers(
				"http://RDVocab.info/ElementsGr2/identifierForThePerson",
				simpleMapping, nameQueries, "file:///GND.rdf",
				"file://mpiwg_persons_dnb.rdf");
	}

}