view src/de/mpiwg/itgroup/nimanager/importer/Import.java @ 4:f986e74583eb

removed triplestorehandler components
author dwinter
date Tue, 13 Dec 2011 17:46:51 +0100
parents e3ecb88314a5
children
line wrap: on
line source

package de.mpiwg.itgroup.nimanager.importer;

import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.query.BindingSet;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import com.hp.hpl.jena.graph.impl.LiteralLabel;

import de.mpiwg.itgroup.triplestoremanager.exceptions.TripleStoreHandlerException;
import de.mpiwg.itgroup.triplestoremanager.owl.MetaDataHandler;
import de.mpiwg.itgroup.triplestoremanager.owl.TripleStoreHandler;


import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
import edu.stanford.smi.protegex.owl.model.OWLIndividual;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFSLiteral;

public class Import {

	// Metadata/ontology handler: used in this class to look up equivalent
	// classes, to obtain the OWL model and to generate and print individuals.
	private MetaDataHandler mh;
	// Triple store handler: used for SPARQL queries, for writing individuals
	// and triples, and for the SQL connection consulted by getNewId.
	private TripleStoreHandler th;
	// Root log4j logger; all progress and error messages of this class go here.
	private Logger logger = Logger.getRootLogger();
	// private Connection con;
	// Value placed verbatim after OFFSET in the subject selection query.
	private String offset;
	// Value placed verbatim after LIMIT in the subject selection query.
	private String limit;

	/**
	 * Creates an importer that works on the given handlers.
	 *
	 * @param mh     metadata handler used to resolve classes and build
	 *               individuals
	 * @param th     triple store handler used for querying and writing
	 * @param offset value used for the OFFSET clause of the selection query
	 * @param limit  value used for the LIMIT clause of the selection query
	 * @throws SQLException           declared for callers; nothing in this
	 *                                constructor body raises it
	 * @throws ClassNotFoundException declared for callers; nothing in this
	 *                                constructor body raises it
	 */
	public Import(MetaDataHandler mh, TripleStoreHandler th, String offset,
			String limit) throws SQLException, ClassNotFoundException {
		// Paging window first, then the two collaborators.
		this.limit = limit;
		this.offset = offset;
		this.th = th;
		this.mh = mh;
	}

	/**
	 * Imports the individuals of a class from one graph as Persons into
	 * another graph and attaches identifiers to them.
	 * <p>
	 * For {@code clsName} and each of its equivalent classes, the distinct
	 * subjects typed with that class are selected from {@code inCtx}; the
	 * configured offset and limit are applied to each of these queries. For
	 * every subject:
	 * <ol>
	 * <li>the first statement of each source property in {@code mapping} is
	 * read and stored under the mapped target property;</li>
	 * <li>if {@code checkExistance} reports a match in {@code outCtx} the
	 * subject is skipped;</li>
	 * <li>otherwise a Person, a NamedEntityIdentifier carrying
	 * {@code "Person:<id>"} together with its NamedEntityIdentifierCreation
	 * (linked to provider_MPIWG), and a second NamedEntityIdentifier carrying
	 * the source subject URI together with its NamedEntityIdentifierCreation
	 * (linked to provider_wikipedia) are written to {@code outCtx};</li>
	 * <li>finally a P1_is_identified_by triple from the source subject to the
	 * second identifier is written.</li>
	 * </ol>
	 * The two provider individuals are created first if the store does not
	 * return them.
	 *
	 * @param clsName URI of the class whose individuals are imported
	 * @param mapping source property URI -> target property URI
	 * @param inCtx   graph the subjects are selected from
	 * @param outCtx  graph all new individuals and triples are written to
	 */
	private void createMPIWGIdentifiers(String clsName,
			HashMap<String, String> mapping, String inCtx, String outCtx)
			throws RepositoryException, MalformedQueryException,
			QueryEvaluationException, URISyntaxException,
			TripleStoreHandlerException, SQLException {

		final String xsdStringProperty = "http://erlangen-crm.org/plus/xdt/110404/has_XSD_String";

		// Running counters: each is the next candidate number handed to
		// getNewId, which skips numbers already used in outCtx.
		int newPersonID = 0;
		int namedEntityIdentifierID = 0;
		int namedEntityIdentifierCreationID = 0;

		// Look up all equivalent classes and add the class name itself.
		List<String> classes = mh.getEquivalentClasses(clsName);
		classes.add(clsName);

		OWLIndividual providerMPIWG = th.getProvider(mh,
				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");
		if (providerMPIWG == null) {
			providerMPIWG = createMPIWFProvider(outCtx);
		}

		OWLIndividual providerDbPedia = th.getProvider(mh,
				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");
		if (providerDbPedia == null) {
			providerDbPedia = createDbPediaProvider(outCtx);
		}

		for (String cl : classes) { // walk through the classes
			RepositoryConnection con = th.getRepository().getConnection();
			try {
				String queryString = "SELECT DISTINCT ?s FROM <"
						+ inCtx
						+ "> WHERE {?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <"
						+ cl + ">  .} OFFSET " + offset + " LIMIT " + limit;

				TupleQueryResult result = th.querySPARQL(queryString);
				try {
					while (result.hasNext()) {
						BindingSet bindingSet = result.next();
						Value subjValue = bindingSet.getValue("s");
						URI subj = th.getRepository().getValueFactory()
								.createURI(subjValue.stringValue());

						// Values that have to be entered for the new person,
						// keyed by target property.
						HashMap<String, Object> newValues = new HashMap<String, Object>();
						for (Map.Entry<String, String> entry : mapping.entrySet()) {
							RepositoryResult<Statement> namesStatements = con
									.getStatements(subj, th.getRepository()
											.getValueFactory()
											.createURI(entry.getKey()), null,
											false);
							Statement firstStatement;
							try {
								firstStatement = TripleStoreHandler
										.getFirstStatement(namesStatements);
							} finally {
								// Release the statement iterator of this lookup.
								namesStatements.close();
							}
							if (firstStatement == null) {
								continue;
							}

							Value val = firstStatement.getObject();
							Object newValue;
							if (val instanceof LiteralImpl) {
								// A literal is translated into the
								// corresponding Jena/Protege literal.
								LiteralImpl li = (LiteralImpl) val;
								newValue = mh.getOwlModel()
										.createRDFSLiteralOrString(
												li.getLabel(), li.getLanguage());
							} else {
								// Otherwise keep the string value (a URI).
								newValue = val.stringValue();
							}
							newValues.put(entry.getValue(), newValue);
						}

						// Skip subjects whose values are already present.
						boolean exists = checkExistance(newValues, th,
								mapping.values(), outCtx);
						if (exists) {
							logger.info("nothing to be done!");
							continue;
						}

						// First create the new person.
						newPersonID = getNewId(newPersonID, "Person", outCtx);
						logger.info("New ID choosen:" + String.valueOf(newPersonID));
						// Keep the local name so that the person URI and the
						// identifier string written below carry the same
						// number (the counter is advanced in between).
						String personLocalName = "Person:"
								+ String.valueOf(newPersonID);
						OWLIndividual person = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS + "Person",
								TripleStoreHandler.ONTOLOGY_NS + personLocalName,
								newValues);
						mh.printIndividual(person);
						newPersonID += 1;
						th.write(person, outCtx);

						// Now create the MPIWG identifier and connect it to
						// the person.
						HashMap<String, Object> idValues = new HashMap<String, Object>();
						idValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "is_preferred_namedEntityIdentifier", person);
						idValues.put(xsdStringProperty, personLocalName); // TODO PID GENERATOR

						namedEntityIdentifierID = getNewId(
								namedEntityIdentifierID,
								"NamedEntityIdentifier", outCtx);
						OWLIndividual mpiwgIdentifier = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifier",
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifier:"
										+ String.valueOf(namedEntityIdentifierID),
								idValues);
						namedEntityIdentifierID += 1;
						th.write(mpiwgIdentifier, outCtx);

						// Creation object connecting the MPIWG identifier with
						// the provider MPIWG.
						HashMap<String, Object> creationValues = new HashMap<String, Object>();
						creationValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "carriedOutByNamedEntityProvider",
								providerMPIWG);
						creationValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "created_NamedEntityIdentifier",
								mpiwgIdentifier);

						namedEntityIdentifierCreationID = getNewId(
								namedEntityIdentifierCreationID,
								"NamedEntityIdentifierCreation", outCtx);
						OWLIndividual creation = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifierCreation",
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifierCreation:"
										+ String.valueOf(namedEntityIdentifierCreationID),
								creationValues);
						namedEntityIdentifierCreationID += 1;
						th.write(creation, outCtx);

						// Now create the wikipedia identifier and connect it
						// to the person; its string value is the URI of the
						// source subject.
						HashMap<String, Object> db_idValues = new HashMap<String, Object>();
						db_idValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "identifies_NamedEntity", person);
						db_idValues.put(xsdStringProperty,
								subjValue.stringValue());

						namedEntityIdentifierID = getNewId(
								namedEntityIdentifierID,
								"NamedEntityIdentifier", outCtx);
						OWLIndividual dbIdentifier = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifier",
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifier:"
										+ String.valueOf(namedEntityIdentifierID),
								db_idValues);
						namedEntityIdentifierID += 1;
						th.write(dbIdentifier, outCtx);

						// Creation object connecting the dbpedia identifier
						// with the provider wikipedia.
						HashMap<String, Object> db_creationValues = new HashMap<String, Object>();
						db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "carriedOutByNamedEntityProvider",
								providerDbPedia);
						db_creationValues.put(TripleStoreHandler.ONTOLOGY_NS
								+ "created_NamedEntityIdentifier", dbIdentifier);

						namedEntityIdentifierCreationID = getNewId(
								namedEntityIdentifierCreationID,
								"NamedEntityIdentifierCreation", outCtx);
						OWLIndividual dbcreation = mh.generateEntity(
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifierCreation",
								TripleStoreHandler.ONTOLOGY_NS
										+ "NamedEntityIdentifierCreation:"
										+ String.valueOf(namedEntityIdentifierCreationID),
								db_creationValues);
						namedEntityIdentifierCreationID += 1;
						th.write(dbcreation, outCtx);

						// Link the source subject to the dbpedia identifier
						// in the triple store.
						th.write(subj.stringValue(),
								"http://erlangen-crm.org/110404/P1_is_identified_by",
								dbIdentifier.getURI(), outCtx);
					}
				} finally {
					// Release the query result even when an import step fails.
					result.close();
				}
			} finally {
				// One connection is opened per class; always give it back.
				con.close();
			}
		}

	}

	/**
	 * Checks whether {@code outCtx} already contains a resource that carries
	 * all of the given property values.
	 * <p>
	 * A SPARQL query with one triple pattern per property in {@code props}
	 * that has a non-null entry in {@code newValues} is built and run. Values
	 * of type {@link RDFSLiteral} are matched including their language tag
	 * (when they have one); any other value is matched as a plain literal of
	 * its string form.
	 * NOTE(review): URI-valued entries are therefore compared as literals -
	 * confirm that this is the intended comparison.
	 *
	 * @param newValues target property URI -> value for the new individual
	 * @param th2       not used; the query runs on the handler of this
	 *                  instance
	 * @param props     property URIs to compare
	 * @param outCtx    graph that is searched
	 * @return {@code true} if the query yields at least one result,
	 *         {@code false} if it yields none or cannot be executed
	 */
	private Boolean checkExistance(HashMap<String, Object> newValues,
			TripleStoreHandler th2, Collection<String> props, String outCtx) throws RepositoryException {

		StringBuilder query = new StringBuilder("select ?x FROM <")
				.append(outCtx).append("> where {");
		for (String propString : props) {
			Object raw = newValues.get(propString);
			if (raw == null) {
				continue;
			}

			String text;
			String lang = null;
			if (raw instanceof RDFSLiteral) {
				RDFSLiteral literal = (RDFSLiteral) raw;
				text = literal.getString();
				lang = literal.getLanguage();
			} else {
				// Plain strings are stored by the caller for values without
				// a language and for non-literal objects.
				text = raw.toString();
			}

			// Escape the backslash first so that the escapes added
			// afterwards are not escaped again.
			text = text.replace("\\", "\\\\").replace("\"", "\\\"")
					.replace("\n", "\\n").replace("\r", "\\r");

			query.append("?x <").append(propString).append("> \"")
					.append(text).append('"');
			if (lang != null && lang.length() > 0) {
				query.append('@').append(lang);
			}
			query.append('.');
		}
		query.append(" }");
		String queryString = query.toString();

		TupleQueryResult result;
		try {
			result = th.querySPARQL(queryString);
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		}
		if (result == null) {
			return false;
		}

		try {
			return result.hasNext();
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} finally {
			try {
				result.close();
			} catch (QueryEvaluationException e) {
				logger.warn("Could not close query result", e);
			}
		}

	}
	/**
	 * Finds the first number, starting at {@code startnumber}, for which the
	 * resource {@code ONTOLOGY_NS + identifier + ":" + number} is not yet the
	 * subject of any triple in {@code ctx}.
	 * <p>
	 * For each candidate a counting SPARQL statement is sent over the SQL
	 * connection of the triple store handler.
	 *
	 * @param startnumber first number to try
	 * @param identifier  local name prefix, e.g. {@code "Person"}
	 * @param ctx         graph that is checked
	 * @return the first unused number that is not smaller than
	 *         {@code startnumber}
	 * @throws SQLException if a statement fails or returns no count row
	 */
	private int getNewId(int startnumber, String identifier, String ctx)
			throws SQLException {

		java.sql.Statement smt = th.sqlCon.createStatement();
		try {
			int candidate = startnumber;
			while (true) {
				String cmdString = String.format(
						"sparql select count(*) from <%s> where {<%s> ?x ?y}",
						ctx,
						TripleStoreHandler.ONTOLOGY_NS + identifier + ":"
								+ String.valueOf(candidate));
				smt.execute(cmdString);
				ResultSet rs = smt.getResultSet();
				try {
					if (rs == null || !rs.next()) {
						throw new SQLException("No count returned for: "
								+ cmdString);
					}
					if (rs.getInt(1) == 0) {
						// No triple uses this subject yet: the number is free.
						return candidate;
					}
				} finally {
					if (rs != null) {
						rs.close();
					}
				}
				candidate += 1;
			}
		} finally {
			// Release the JDBC statement on every exit path.
			smt.close();
		}
	}

	/**
	 * Checks whether {@code outCtx} already contains a resource that carries
	 * the same values as {@code person} for the given properties.
	 * <p>
	 * A SPARQL query with one triple pattern per property in {@code props}
	 * for which {@code person} has a value is built and run. Values of type
	 * {@link RDFSLiteral} are matched including their language tag (when they
	 * have one); any other value is matched as a plain literal of its string
	 * form. Properties unknown to the OWL model are skipped.
	 *
	 * @param person individual whose property values are compared
	 * @param th2    not used; the query runs on the handler of this instance
	 * @param props  property URIs to compare
	 * @param outCtx graph that is searched
	 * @return {@code true} if the query yields at least one result,
	 *         {@code false} if it yields none or cannot be executed
	 */
	private Boolean checkExistance(OWLIndividual person,
			TripleStoreHandler th2, Collection<String> props, String outCtx)
			throws RepositoryException {
		JenaOWLModel model = mh.getOwlModel();

		StringBuilder query = new StringBuilder("select ?x FROM <")
				.append(outCtx).append("> where {");
		for (String propString : props) {
			RDFProperty rdfProp = model.getRDFProperty(propString);
			if (rdfProp == null) {
				continue;
			}
			Object raw = person.getPropertyValue(rdfProp);
			if (raw == null) {
				continue;
			}

			String text;
			String lang = null;
			if (raw instanceof RDFSLiteral) {
				RDFSLiteral literal = (RDFSLiteral) raw;
				text = literal.getString();
				lang = literal.getLanguage();
			} else {
				text = raw.toString();
			}

			// Escape the backslash first so that the escapes added
			// afterwards are not escaped again.
			text = text.replace("\\", "\\\\").replace("\"", "\\\"")
					.replace("\n", "\\n").replace("\r", "\\r");

			query.append("?x <").append(propString).append("> \"")
					.append(text).append('"');
			if (lang != null && lang.length() > 0) {
				query.append('@').append(lang);
			}
			query.append('.');
		}
		query.append(" }");
		String queryString = query.toString();

		TupleQueryResult result;
		try {
			result = th.querySPARQL(queryString);
		} catch (MalformedQueryException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} catch (TripleStoreHandlerException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		}
		if (result == null) {
			return false;
		}

		try {
			return result.hasNext();
		} catch (QueryEvaluationException e) {
			logger.error("Query String cannot be handled:" + queryString, e);
			return false;
		} finally {
			try {
				result.close();
			} catch (QueryEvaluationException e) {
				logger.warn("Could not close query result", e);
			}
		}

	}

	/**
	 * Generates the NamedEntityProvider individual {@code provider_wikipedia}
	 * and writes it to the given graph.
	 *
	 * @param ctx graph the provider is written to
	 * @return the newly generated provider individual
	 */
	private OWLIndividual createDbPediaProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual provider = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_wikipedia");

		th.write(provider, ctx);

		return provider;

	}

	/**
	 * Generates the NamedEntityProvider individual {@code provider_MPIWG} and
	 * writes it to the given graph.
	 *
	 * @param ctx graph the provider is written to
	 * @return the newly generated provider individual
	 */
	private OWLIndividual createMPIWFProvider(String ctx)
			throws RepositoryException, TripleStoreHandlerException {
		OWLIndividual provider = mh.generateEntity(
				TripleStoreHandler.ONTOLOGY_NS + "NamedEntityProvider",
				TripleStoreHandler.ONTOLOGY_NS + "provider_MPIWG");

		th.write(provider, ctx);

		return provider;

	}

	/**
	 * Command line entry point: imports dbpedia persons from the graph
	 * {@code file://personendataWikipedia} into
	 * {@code file://mpiwg_persons_2.rdf}.
	 * <p>
	 * Expected arguments: {@code user pw offset limit}. User and password are
	 * passed to the triple store handler; offset and limit select the window
	 * of subjects to import and must be non-negative integers because they
	 * are inserted into a SPARQL query.
	 *
	 * @param args command line arguments as described above
	 * @throws Exception any failure during setup or import
	 */
	public static void main(String args[]) throws Exception {
		if (args.length < 4) {
			System.out.println("usage: import user pw offset limit ");
			System.exit(1);
		}

		// Reject anything that is not a plain number before it reaches the
		// query text.
		String offset = args[2].trim();
		String limit = args[3].trim();
		if (!offset.matches("\\d+") || !limit.matches("\\d+")) {
			System.out.println("offset and limit must be non-negative integers");
			System.out.println("usage: import user pw offset limit ");
			System.exit(1);
		}

		Logger.getRootLogger().setLevel(Level.INFO);
		BasicConfigurator.configure();

		MetaDataHandler mh = new MetaDataHandler();

		TripleStoreHandler th = new TripleStoreHandler(
				"jdbc:virtuoso://virtuoso.mpiwg-berlin.mpg.de:1111",args[0], args[1]);
		System.out.println(mh);
		System.out.println(th);

		Import imp = new Import(mh, th, offset, limit);

		// Source property (as found in the input graph) -> target property
		// (as written for the new person).
		HashMap<String, String> mapping = new HashMap<String, String>();
		mapping.put("http://xmlns.com/foaf/0.1/surname",
				"http://xmlns.com/foaf/0.1/lastName");
		mapping.put("http://xmlns.com/foaf/0.1/givenName",
				"http://xmlns.com/foaf/0.1/firstName");
		mapping.put("http://xmlns.com/foaf/0.1/name",
				"http://xmlns.com/foaf/0.1/name");

		imp.createMPIWGIdentifiers("http://dbpedia.org/ontology/Person",
				mapping, "file://personendataWikipedia",
				"file://mpiwg_persons_2.rdf");

		// mh.getOwlModel().save(new java.net.URI("file:///tmp/prot.owl"));
	}

}