view src/de/mpiwg/dwinter/duomo/lexdump/LexOWLTransformer.java @ 2:273164f81926

bug in transformer handleroles
author dwinter
date Thu, 03 Feb 2011 16:08:06 +0100
parents 0fa29ab5e5e0
children d9fd32ecae24
line wrap: on
line source

package de.mpiwg.dwinter.duomo.lexdump;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.print.URIException;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.xpath.XPath;

import com.sun.xml.internal.ws.developer.MemberSubmissionEndpointReference.Elements;

import edu.stanford.smi.protege.exception.OntologyLoadException;
import edu.stanford.smi.protege.model.Instance;
import edu.stanford.smi.protegex.owl.model.OWLAllValuesFrom;
import edu.stanford.smi.protegex.owl.model.OWLClass;
import edu.stanford.smi.protegex.owl.model.OWLDataRange;
import edu.stanford.smi.protegex.owl.model.OWLIndividual;
import edu.stanford.smi.protegex.owl.model.OWLNamedClass;
import edu.stanford.smi.protegex.owl.model.OWLProperty;
import edu.stanford.smi.protegex.owl.model.OWLUnionClass;
import edu.stanford.smi.protegex.owl.model.RDFList;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFResource;
import edu.stanford.smi.protegex.owl.model.impl.AbstractOWLQuantifierRestriction;
import edu.stanford.smi.protegex.owl.model.impl.DefaultOWLUnionClass;

public class LexOWLTransformer {
	// Root log4j logger used for progress and diagnostic output.
	private Logger logger = Logger.getRootLogger();
	// Report file (opened on /tmp/missing.out by the constructor) that lists
	// pointer targets which could not be resolved.
	private FileWriter missing;
	// Access to the OWL model that is being populated.
	private OWLImporter owlDoc;
	// Access to the lex dump that is being read.
	private LexDumpImporter lexDoc;
	// Stores the individual created for each nomiq id so that later pointers
	// to that id can be resolved.
	private Map<String, OWLIndividual> individualIds = new HashMap<String, OWLIndividual>();

	/**
	 * Creates a transformer that reads from the given lex dump and writes into
	 * the given OWL document. As a side effect the report file
	 * /tmp/missing.out is opened for writing.
	 *
	 * @param owlDoc target OWL document
	 * @param lexDoc source lex dump
	 * @throws IOException if the report file cannot be opened
	 */
	public LexOWLTransformer(OWLImporter owlDoc, LexDumpImporter lexDoc) throws IOException {
		File missingReport = new File("/tmp/missing.out");
		this.missing = new FileWriter(missingReport);
		this.lexDoc = lexDoc;
		this.owlDoc = owlDoc;
	}

	/**
	 * Command-line entry point: loads the ontology, parses the lex dump, runs
	 * the transformation and saves the result to file:///tmp/out.owl.
	 *
	 * @param args args[0] is the base directory that contains
	 *            duomoAnalysis.owl, args[1] is the path of the lex dump file
	 * @throws Exception if the ontology URI is malformed, the report file
	 *             cannot be opened or closed, or a later step fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: LexOWLTransformer <owl-base-dir> <lexdump-xml-file>");
			System.exit(-1);
		}
		Logger.getRootLogger().setLevel(Level.DEBUG);

		// Import OWL; blanks in the directory name have to be escaped for the URI.
		String base = args[0];
		String baseUri = base.replace(" ", "%20");
		URI ontologieUri = new URI("file://" + baseUri + "/duomoAnalysis.owl");
		OWLImporter owlDoc = null;
		try {
			owlDoc = new OWLImporter(base, ontologieUri);
		} catch (OntologyLoadException e) {
			e.printStackTrace();
			System.exit(-1);
		}

		// read and parse lexfile
		String lexFile = args[1];
		LexDumpImporter lexDoc = null;
		try {
			lexDoc = new LexDumpImporter(lexFile);
		} catch (JDOMException e) {
			e.printStackTrace();
			System.exit(-1);
		} catch (IOException e) {
			// Print first: nothing after System.exit is ever executed.
			e.printStackTrace();
			System.exit(-1);
		}

		LexOWLTransformer tf = new LexOWLTransformer(owlDoc, lexDoc);
		try {
			tf.transform();
			owlDoc.save("file:///tmp/out.owl");
		} finally {
			// FileWriter buffers its output; without closing it the report of
			// unresolved pointer targets may never reach the disk.
			tf.missing.close();
		}
	}

	/**
	 * Walks through all signatures of the lex dump and creates, for every
	 * signature, its cards, and for every card its records together with the
	 * event each record documents. Prints the model when done.
	 *
	 * @throws JDOMException if an XPath query on the dump fails
	 */
	private void transform() throws JDOMException {
		List<Element> signatures = lexDoc.getSignatures();

		int signatureCount = 1;
		int maxsign = signatures.size();
		for (Element signature : signatures) {
			logger.debug(String.format("Signature: %s (%s)", signatureCount,
					maxsign));
			OWLIndividual signatureInd = createSignature(signature);

			@SuppressWarnings("unchecked")
			List<Element> cards = XPath.selectNodes(signature, ".//carta");
			int cardsCount = 1;
			int maxcards = cards.size();

			for (Element card : cards) {
				logger.debug(String.format("Cards: %s (%s)", cardsCount++,
						maxcards));
				logger.debug(String.format("Signature: %s (%s)",
						signatureCount, maxsign));
				OWLIndividual cardInd = createCard(card, signatureInd);
				if (cardInd == null) {
					// createCard reports a failure by returning null; do not
					// attach records to a card that does not exist.
					logger.error(String.format(
							"Could not create card %s of signature %s, skipping its records",
							cardsCount - 1, signatureCount));
					continue;
				}

				@SuppressWarnings("unchecked")
				List<Element> records = XPath.selectNodes(card, ".//record");
				for (Element record : records) {
					OWLIndividual recordInd = createRecord(record, cardInd);
					// Event and record share the same record tag in the
					// original data.
					createEvent(record, recordInd);
				}
			}
			signatureCount++;
		}
		owlDoc.printModel();
	}

	/**
	 * Creates the RecordedEvent individual that is documented by a record and
	 * fills it from the record element: time-span, typology, regests (Italian
	 * and English), names/roles, institution and types. A failure while
	 * reading one of these parts is logged and does not stop the others.
	 *
	 * @param record the record element of the lex dump
	 * @param recordInd the individual already created for that record
	 * @return the new event individual
	 */
	private OWLIndividual createEvent(Element record, OWLIndividual recordInd) {

		logger.debug("Create Event");
		OWLIndividual eventInstance = owlDoc.createInstance("RecordedEvent");

		owlDoc.setProperty(recordInd, "crm:P70_documents", eventInstance);

		// time-span
		try {
			String dateDfStart = lexDoc.getValue(record, ".//datdf/startdate");
			// NOTE(review): the end date is read from the same path as the
			// start date - confirm against the dump schema whether a separate
			// end date element should be used here.
			String dateDfEnd = lexDoc.getValue(record, ".//datdf/startdate");

			if (!dateDfStart.equals("")) {
				OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDfStart,
						dateDfEnd);
				owlDoc.setProperty(eventInstance, "crm:P4_has_time-span",
						timeSpan);
			}
		} catch (JDOMException e) {
			logger.error("Could not read the time-span of an event", e);
		}

		// tipol
		try {
			String typology = lexDoc.getValue(record, ".//tipol");
			OWLIndividual typolInd = createOrGetTipol(typology);
			owlDoc.setProperty(eventInstance, "has_tipol", typolInd);
		} catch (JDOMException e) {
			logger.error("Could not read the typology of an event", e);
		}

		// reges
		try {
			createNewDependingDataTypePropertyFromXpath(record, eventInstance,
					".//reges/italian", "has_reges", "it");
			createNewDependingDataTypePropertyFromXpath(record, eventInstance,
					".//reges/english", "has_reges", "en");
		} catch (JDOMException e) {
			logger.error("Could not read the regests of an event", e);
		}

		// nomiq
		try {
			@SuppressWarnings("unchecked")
			List<Element> nomiqs = XPath.selectNodes(record, ".//nomiq");
			for (Element nomiq : nomiqs) {
				String name = lexDoc.getValue(nomiq, "./name");
				String role = lexDoc.getValue(nomiq, "./role");

				// Has to start out empty for every nomiq: otherwise an entry
				// with neither name nor role would reuse the individual of the
				// previous entry and register it under the wrong id.
				OWLIndividual recordNamesRoles = null;
				if (!name.equals("") && !role.equals("")) {
					recordNamesRoles = handleNameWithRole(recordInd, name, role);
				} else if (!role.equals("")) {
					recordNamesRoles = createOrGetRole(role);
				} else if (!name.equals("")) {
					recordNamesRoles = createOrGetName(name);
				}

				if (recordNamesRoles != null) {
					owlDoc.setProperty(eventInstance,
							"recordsDuomoObjectNameRoles", recordNamesRoles);
					// Remember the individual so that pointers from type
					// entries can be resolved later.
					String id = lexDoc.getValue(nomiq, "./@id");
					individualIds.put(id, recordNamesRoles);
				}
			}
		} catch (JDOMException e) {
			logger.error("Could not read the names and roles of an event", e);
		}

		// istit
		try {
			String istit = lexDoc.getValue(record, "./istit");
			OWLIndividual istitInd = owlDoc.getIndividualByReadableId(
					"IndicesInstitutions", istit);
			if (istitInd == null) {
				istitInd = createOrGetInstitution(istit);
			}

			owlDoc.setProperty(eventInstance, "recordsDuomoObjectInstitution",
					istitInd);
		} catch (JDOMException e) {
			logger.error("Could not read the institution of an event", e);
		}

		// types
		try {
			@SuppressWarnings("unchecked")
			List<Element> types = XPath.selectNodes(record, ".//type");
			for (Element type : types) {
				createType(eventInstance, type);
			}
		} catch (JDOMException e) {
			logger.error("Could not read the types of an event", e);
		}

		return eventInstance;
	}

	/**
	 * Creates an individual for a type element and, for each of its freetext
	 * children, attaches the subject and predicate found there and links the
	 * type individual to the event. A JDOMException is printed and ends the
	 * processing of this type.
	 *
	 * @param eventInstance the event the type belongs to
	 * @param type the type element of the lex dump
	 */
	private void createType(OWLIndividual eventInstance, Element type) {
		try {
			String targetId = lexDoc.getValue(type, "./ptr/@target");
			String className = owlDoc.getClassNameFromTypeId(targetId);
			OWLNamedClass typeClass = owlDoc.getClassFromTypeId(targetId);
			OWLIndividual conceptInd = owlDoc.createInstance(className);

			// Classes the subject and the predicate have to be created from.
			OWLNamedClass subjectRange = getPreferredTargetClass(typeClass,
					"has_subject");
			OWLNamedClass predicateRange = getPreferredTargetClass(typeClass,
					"has_predicate");

			// NOTE(review): the concept is only linked to the event inside
			// this loop, i.e. a type without freetext stays unlinked - confirm
			// that this is intended.
			@SuppressWarnings("unchecked")
			List<Element> entries = XPath.selectNodes(type, "./freetext");
			for (Element entry : entries) {
				OWLIndividual subject = createSubjectOrPredicate(subjectRange,
						lexDoc.getValue(entry, "./sub/ptrtoperson/@target"),
						lexDoc.getValue(entry, "./sub"));
				OWLIndividual predicate = createSubjectOrPredicate(
						predicateRange,
						lexDoc.getValue(entry, "./pred/ptrtoperson/@target"),
						lexDoc.getValue(entry, "./pred"));

				if (subject != null) {
					owlDoc.setProperty(conceptInd, "has_subject", subject);
				}
				if (predicate != null) {
					owlDoc.setProperty(conceptInd, "has_predicate", predicate);
				}
				owlDoc.setProperty(eventInstance, "recordsDuomoObjectConcept",
						conceptInd);
			}
		} catch (JDOMException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates the individual that represents the subject or predicate of a
	 * freetext entry.
	 *
	 * @param toClass class the individual is created from; may be null when no
	 *            target class could be determined
	 * @param subjPointer id of a previously registered name/role individual,
	 *            or "" if there is none
	 * @param subjText free text of the entry, or "" if there is none
	 * @return the new individual, or null when both pointer and text are empty
	 *         or when no target class is available
	 */
	private OWLIndividual createSubjectOrPredicate(OWLNamedClass toClass,
			String subjPointer, String subjText) {
		boolean hasPointer = !subjPointer.equals("");
		boolean hasText = !subjText.equals("");
		if (!hasPointer && !hasText) {
			return null;
		}
		if (toClass == null) {
			// Without a class nothing can be instantiated; report it rather
			// than aborting the whole run with a NullPointerException.
			logger.warn("No target class for pointer '" + subjPointer
					+ "' / text '" + subjText + "', entry skipped");
			return null;
		}

		OWLIndividual subjInd = toClass.createOWLIndividual(null);

		if (hasPointer) {
			OWLIndividual ind = individualIds.get(subjPointer);
			if (ind == null) {
				logger.debug("target ID does not exist:" + subjPointer);
				try {
					missing.write("target ID does not exist:" + subjPointer + "\n");
					// Flush right away so the report survives even if the
					// writer is never closed.
					missing.flush();
				} catch (IOException e) {
					logger.error("Could not write to the report of missing targets", e);
				}
			} else {
				owlDoc.setProperty(subjInd, "has_NameOrRoleFromIndex", ind);
			}
		}

		if (hasText) {
			OWLNamedClass idcls = owlDoc.owlModel
					.getOWLNamedClass("Identifier");
			if (toClass.getNamedSuperclasses(true).contains(idcls)) {
				// The class itself is already an identifier: store the text
				// directly on the individual.
				owlDoc.setProperty(subjInd, "has_readable_id", subjText);
			} else {
				// Otherwise the text goes into a separate identifier
				// individual that is linked as preferred identifier.
				OWLIndividual ident = owlDoc
						.createInstance("IdentifierPredicateOrSubject");
				owlDoc.setProperty(ident, "has_readable_id", subjText);
				owlDoc.setProperty(subjInd, "crm:P48_has_preferred_identifier",
						ident);
			}
		}
		return subjInd;
	}

	/**
	 * Determines the class from which values of a property should be created
	 * for instances of the given class. The declared range of the property is
	 * used unless the class carries an allValuesFrom restriction on it; the
	 * first such restriction wins. For a union class the first member that has
	 * the class "Identifier" among its superclasses is chosen.
	 *
	 * @param cls class whose restrictions are inspected
	 * @param propertyName name of the property
	 * @return the target class, or null if none could be determined
	 */
	private OWLNamedClass getPreferredTargetClass(OWLNamedClass cls,
			String propertyName) {
		RDFProperty prop = owlDoc.owlModel.getRDFProperty(propertyName);
		if (cls == null || prop == null) {
			logger.warn("Cannot determine the target class of " + propertyName
					+ ": class or property is unknown");
			return null;
		}

		// Find out which class is allowed as value.
		Collection<?> restrictions = cls.getRestrictions(prop, true);

		// Start with the whole range ...
		RDFResource restrictionValues = prop.getRange();

		// ... and narrow it if the class restricts the property.
		for (Object restriction : restrictions) {
			if (restriction instanceof OWLAllValuesFrom) {
				restrictionValues = ((OWLAllValuesFrom) restriction)
						.getAllValuesFrom();
				break;
			}
		}

		OWLNamedClass toClass = null;
		if (restrictionValues instanceof OWLNamedClass) {
			toClass = (OWLNamedClass) restrictionValues;
		} else if (restrictionValues instanceof OWLDataRange) {
			RDFList dr = ((OWLDataRange) restrictionValues).getOneOf();
			if (dr != null) {
				for (Object d : dr.getValues()) {
					logger.debug("Data range value: " + d);
					// Only named classes are usable; casting any other value
					// would fail with a ClassCastException.
					if (d instanceof OWLNamedClass) {
						toClass = (OWLNamedClass) d;
					}
				}
			}
		} else if (restrictionValues instanceof DefaultOWLUnionClass) {
			// More than one class is possible.
			DefaultOWLUnionClass ou = (DefaultOWLUnionClass) restrictionValues;
			Set set = new HashSet();
			ou.getNestedNamedClasses(set);

			// Take the member class that is an identifier.
			OWLNamedClass idcls = owlDoc.owlModel
					.getOWLNamedClass("Identifier");
			for (Iterator<?> it = set.iterator(); it.hasNext();) {
				OWLNamedClass cl = (OWLNamedClass) it.next();
				if (cl.getNamedSuperclasses(true).contains(idcls)) {
					toClass = cl;
					break;
				}
			}
		}
		return toClass;
	}

	

	/**
	 * Returns the IndicesNameWithRole individual for a name/role pair,
	 * creating the name, the role and the combination where they do not exist
	 * yet.
	 *
	 * @param recordInd individual of the record (not used by this method)
	 * @param name readable id of the name
	 * @param role readable id of the role
	 * @return the existing or newly created combination
	 */
	private OWLIndividual handleNameWithRole(OWLIndividual recordInd,
			String name, String role) {
		OWLIndividual nameInd = owlDoc.getIndividualByReadableId(
				"IndicesNames", name);
		boolean nameIsNew = (nameInd == null);
		if (nameIsNew) {
			nameInd = createOrGetName(name);
		}

		OWLIndividual roleInd = owlDoc.getIndividualByReadableId(
				"IndicesRoles", role);
		boolean roleIsNew = (roleInd == null);
		if (roleIsNew) {
			roleInd = createOrGetRole(role);
		}

		// If either part did not exist before, a combination of both cannot
		// exist either, so the lookup is skipped.
		OWLIndividual existing = (nameIsNew || roleIsNew) ? null
				: getNameWithRole(nameInd, roleInd);

		return (existing != null) ? existing : createNameWithRole(nameInd,
				roleInd);
	}

	/**
	 * Looks for an IndicesNameWithRole individual that refers to the given
	 * name and whose refers_to_role value equals the given role.
	 *
	 * @param nameInd name individual
	 * @param roleInd role individual
	 * @return the matching individual, or null if there is none
	 */
	private OWLIndividual getNameWithRole(OWLIndividual nameInd,
			OWLIndividual roleInd) {
		// All candidates that refer to the name.
		List<OWLIndividual> candidates = owlDoc.getIndividuals(
				"IndicesNameWithRole", "refers_to_name", nameInd);

		if (candidates != null) {
			for (OWLIndividual candidate : candidates) {
				Object referredRole = owlDoc.getRelatedIndividual(candidate,
						"refers_to_role");
				if (roleInd.equals(referredRole)) {
					return candidate;
				}
			}
		}
		return null;
	}

	/**
	 * Delegates to owlDoc.createOrGetInstanceWithIdentifier with "Typology"
	 * and "Identifier" (presumably instance class and identifier class).
	 *
	 * @param typology value of the typology
	 * @return the individual returned by the OWL document
	 */
	private OWLIndividual createOrGetTipol(String typology) {
		return owlDoc.createOrGetInstanceWithIdentifier("Typology",
				"Identifier", typology);
	}

	/**
	 * Creates a new IndicesNameWithRole individual that refers to the given
	 * name and role.
	 *
	 * @param nameInd name individual
	 * @param roleInd role individual
	 * @return the new combination individual
	 */
	private OWLIndividual createNameWithRole(OWLIndividual nameInd,
			OWLIndividual roleInd) {
		OWLIndividual combined = owlDoc.createInstance("IndicesNameWithRole");
		owlDoc.setProperty(combined, "refers_to_name", nameInd);
		owlDoc.setProperty(combined, "refers_to_role", roleInd);
		return combined;
	}

	/**
	 * Delegates to owlDoc.createOrGetInstanceWithIdentifier with
	 * "IndicesInstitutions" and "IdentifierInstitutions" (presumably instance
	 * class and identifier class).
	 *
	 * @param name name of the institution
	 * @return the individual returned by the OWL document
	 */
	private OWLIndividual createOrGetInstitution(String name) {
		return owlDoc.createOrGetInstanceWithIdentifier("IndicesInstitutions",
				"IdentifierInstitutions", name);
	}

	/**
	 * Delegates to owlDoc.createOrGetInstanceWithIdentifier with
	 * "IndicesNames" and "IdentifierNames" (presumably instance class and
	 * identifier class).
	 *
	 * @param name the name
	 * @return the individual returned by the OWL document
	 */
	private OWLIndividual createOrGetName(String name) {
		return owlDoc.createOrGetInstanceWithIdentifier("IndicesNames",
				"IdentifierNames", name);
	}

	/**
	 * Delegates to owlDoc.createOrGetInstanceWithIdentifier with
	 * "IndicesRoles" and "IdentifierRoles" (presumably instance class and
	 * identifier class).
	 *
	 * @param name the role
	 * @return the individual returned by the OWL document
	 */
	private OWLIndividual createOrGetRole(String name) {
		return owlDoc.createOrGetInstanceWithIdentifier("IndicesRoles",
				"IdentifierRoles", name);
	}

	/**
	 * Creates the Record individual for a record element: links it to its
	 * card, adds its identifier, its text block id and its time-span. If the
	 * record carries an end_on_carta attribute, it is additionally linked to
	 * the card with that id.
	 *
	 * @param record the record element of the lex dump
	 * @param cardInd individual of the card the record is on
	 * @return the new record individual
	 * @throws JDOMException if an XPath query on the record fails
	 */
	private OWLIndividual createRecord(Element record, OWLIndividual cardInd)
			throws JDOMException {
		OWLIndividual recordInstance = owlDoc.createInstance("Record");
		owlDoc.setProperty(recordInstance, "is_on_card", cardInd);
		createNewDependingInstanceFromXpath(record, recordInstance, "./@id",
				new String[] { "has_readable_id", "rdfs:label" },
				"IdentifierCurrent", "crm:P48_has_preferred_identifier");

		String textBlockId = lexDoc.getValue(record, ".//textblockid");
		if (!textBlockId.equals("")) {
			owlDoc.setProperty(recordInstance, "has_textblockid", textBlockId);
		}

		String endOnCarta = lexDoc.getValue(record, "./@end_on_carta");
		if (!endOnCarta.equals("")) {
			OWLIndividual signature = (OWLIndividual) owlDoc
					.getRelatedIndividual(cardInd, "has_signature");
			// The card is identified by the end_on_carta value, not by the
			// text block id.
			addRecordToCarta(recordInstance, endOnCarta, signature);
		}

		String dateDcStart = lexDoc.getValue(record, ".//datdc/startdate");
		// NOTE(review): the end date is read from the same path as the start
		// date - confirm against the dump schema whether a separate end date
		// element should be used here.
		String dateDcEnd = lexDoc.getValue(record, ".//datdc/startdate");

		OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDcStart, dateDcEnd);

		owlDoc.setProperty(recordInstance, "crm:P4_has_time-span", timeSpan);

		return recordInstance;
	}

	/**
	 * Links a record to the card with the given readable id. If no such card
	 * exists yet, one is created for the given signature.
	 *
	 * @param recordInstance the record individual
	 * @param cardID readable id of the card
	 * @param signature signature used when the card has to be created
	 */
	private void addRecordToCarta(OWLIndividual recordInstance, String cardID,
			OWLIndividual signature) {
		OWLIndividual target = owlDoc.getIndividualByReadableId("Card", cardID);
		owlDoc.setProperty(recordInstance, "is_on_card",
				(target != null) ? target : createCard(cardID, signature));
	}

	/**
	 * Creates the Signatur individual for a signature element and labels it
	 * with the trimmed text of the first segheader element found below it.
	 * If there is no segheader, the individual is created without a label.
	 *
	 * @param signature the signature element of the lex dump
	 * @return the new signature individual
	 * @throws JDOMException if the XPath query fails
	 */
	private OWLIndividual createSignature(Element signature)
			throws JDOMException {
		Element segHeaderElement = (Element) XPath.selectSingleNode(signature,
				".//segheader");

		OWLIndividual signatureInstance = owlDoc.createInstance("Signatur");
		if (segHeaderElement == null) {
			// selectSingleNode yields null when nothing matches.
			logger.warn("Signature without segheader, created without rdfs:label");
		} else {
			owlDoc.setProperty(signatureInstance, "rdfs:label",
					segHeaderElement.getTextTrim());
		}
		return signatureInstance;
	}

	/**
	 * Creates a Card individual for the given signature together with an
	 * IdentifierCurrent individual that carries the card id as readable id
	 * and label and is linked as the card's preferred identifier.
	 *
	 * @param cardId readable id of the card
	 * @param signature signature the card belongs to
	 * @return the new card individual
	 */
	private OWLIndividual createCard(String cardId, OWLIndividual signature) {
		OWLIndividual newCard = owlDoc.createInstance("Card");
		owlDoc.setProperty(newCard, "has_signature", signature);

		OWLIndividual identifier = owlDoc.createInstance("IdentifierCurrent");
		for (String idProperty : new String[] { "has_readable_id", "rdfs:label" }) {
			owlDoc.setProperty(identifier, idProperty, cardId);
		}
		owlDoc.setProperty(newCard, "crm:P48_has_preferred_identifier",
				identifier);

		return newCard;
	}

	/**
	 * Creates a Card individual from a carta element: every cartanr becomes a
	 * preferred identifier, every cartaant an additional identifier, and the
	 * card is linked to its signature.
	 *
	 * @param card the carta element of the lex dump
	 * @param signatureInd signature the card belongs to
	 * @return the new card individual, or null if an XPath query failed
	 */
	private OWLIndividual createCard(Element card, OWLIndividual signatureInd) {
		OWLIndividual newCard = owlDoc.createInstance("Card");

		try {
			// current card number
			createNewDependingInstanceFromXpath(card, newCard, ".//cartanr",
					new String[] { "has_readable_id", "rdfs:label" },
					"IdentifierCurrent", "crm:P48_has_preferred_identifier");

			// cartaant value, attached as a further identifier
			createNewDependingInstanceFromXpath(card, newCard, ".//cartaant",
					new String[] { "has_readable_id", "rdfs:label" },
					"IdentifierCurrent", "crm:P1_is_identified_by");

			owlDoc.setProperty(newCard, "has_signature", signatureInd);
			return newCard;
		} catch (JDOMException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * For every node selected by the XPath expression, creates a new
	 * individual, sets all given properties of it to the node's text and
	 * links it to the given individual. Elements contribute their trimmed
	 * text, attributes their value, any other node the empty string.
	 *
	 * @param card context element the XPath expression is evaluated on
	 * @param cardInstance individual the new individuals are linked to
	 * @param xpath XPath expression selecting elements or attributes
	 * @param propertyNames properties of the new individual that receive the
	 *            text
	 * @param newInstanceClassName class name of the new individuals
	 * @param relationNameToNewInstance property that links cardInstance to a
	 *            new individual
	 * @throws JDOMException if the XPath query fails
	 */
	private void createNewDependingInstanceFromXpath(Element card,
			OWLIndividual cardInstance, String xpath, String[] propertyNames,
			String newInstanceClassName, String relationNameToNewInstance)
			throws JDOMException {

		List<?> nodes = XPath.selectNodes(card, xpath);
		for (Object node : nodes) {
			String text;
			if (node instanceof Element) {
				text = ((Element) node).getTextTrim();
			} else if (node instanceof Attribute) {
				text = ((Attribute) node).getValue();
			} else {
				text = "";
			}

			OWLIndividual dependent = owlDoc.createInstance(newInstanceClassName);
			for (String propertyName : propertyNames) {
				owlDoc.setProperty(dependent, propertyName, text);
			}
			owlDoc.setProperty(cardInstance, relationNameToNewInstance,
					dependent);
		}
	}

	/**
	 * For every node selected by the XPath expression, sets a datatype
	 * property with a language tag on the given individual. Elements
	 * contribute their trimmed text, attributes their value, any other node
	 * the empty string.
	 * <p>
	 * Example: {@code createNewDependingDataTypePropertyFromXpath(record,
	 * eventInstance, ".//reges/italian", "has_reges", "it")}.
	 *
	 * @param record context element the XPath expression is evaluated on
	 * @param eventInstance individual that receives the property values
	 * @param xpath XPath expression selecting elements or attributes
	 * @param propertyName name of the datatype property
	 * @param lang language passed on with each value
	 * @throws JDOMException if the XPath query fails
	 */
	private void createNewDependingDataTypePropertyFromXpath(Element record,
			OWLIndividual eventInstance, String xpath, String propertyName,
			String lang) throws JDOMException {

		List<?> nodes = XPath.selectNodes(record, xpath);
		for (Object node : nodes) {
			String text;
			if (node instanceof Element) {
				text = ((Element) node).getTextTrim();
			} else if (node instanceof Attribute) {
				text = ((Attribute) node).getValue();
			} else {
				text = "";
			}

			owlDoc.setDataTypePropery(eventInstance, propertyName, text, lang);
		}
	}
}