Mercurial > hg > duomoOWLProject
view src/de/mpiwg/dwinter/duomo/lexdump/LexOWLTransformer.java @ 8:919e9f3b5efd
neue klassen zur textanalyse (stanford parser eingebaut)
alle has_readable_labe Datatype properties durch rdfs:label ersetzt.
author | dwinter |
---|---|
date | Thu, 21 Jun 2012 17:08:22 +0200 |
parents | 19e40abb3e8a |
children | 4392a6adf85a |
line wrap: on
line source
package de.mpiwg.dwinter.duomo.lexdump; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.net.URLEncoder; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import javax.print.URIException; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.jdom.Attribute; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.xpath.XPath; import com.sun.xml.internal.ws.developer.MemberSubmissionEndpointReference.Elements; import edu.stanford.smi.protege.exception.OntologyLoadException; import edu.stanford.smi.protege.model.Facet; import edu.stanford.smi.protege.model.Instance; import edu.stanford.smi.protege.model.Slot; import edu.stanford.smi.protegex.owl.jena.JenaOWLModel; import edu.stanford.smi.protegex.owl.model.OWLAllValuesFrom; import edu.stanford.smi.protegex.owl.model.OWLClass; import edu.stanford.smi.protegex.owl.model.OWLDataRange; import edu.stanford.smi.protegex.owl.model.OWLIndividual; import edu.stanford.smi.protegex.owl.model.OWLNamedClass; import edu.stanford.smi.protegex.owl.model.OWLProperty; import edu.stanford.smi.protegex.owl.model.OWLUnionClass; import edu.stanford.smi.protegex.owl.model.RDFList; import edu.stanford.smi.protegex.owl.model.RDFProperty; import edu.stanford.smi.protegex.owl.model.RDFResource; import edu.stanford.smi.protegex.owl.model.RDFSClass; import edu.stanford.smi.protegex.owl.model.impl.AbstractOWLQuantifierRestriction; import edu.stanford.smi.protegex.owl.model.impl.AbstractOWLRestriction; import edu.stanford.smi.protegex.owl.model.impl.DefaultOWLAllValuesFrom; import edu.stanford.smi.protegex.owl.model.impl.DefaultOWLUnionClass; /** * @author dwinter * *Transformiert die Lex Files in OWL */ public class LexOWLTransformer { private Logger logger = Logger.getRootLogger(); private FileWriter missing; private OWLImporter owlDoc; private LexDumpImporter lexDoc; private Map<String, OWLIndividual> individualIds = new HashMap<String, OWLIndividual>(); // speichere // ids // fuer // weitere // Verwendung public LexOWLTransformer(OWLImporter owlDoc, LexDumpImporter lexDoc) throws IOException { this.owlDoc = owlDoc; this.lexDoc = lexDoc; missing = new FileWriter(new File("/tmp/missing.out")); } /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { System.out.println("Stack size has to be large, use at least -Xmx3048M -Xss10M"); if (args.length<2){ System.out.println("Usage: \"/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/owlInput\""+ "\"/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/duomoData/LEXXDUMP.xml\""); System.exit(-1); } Logger.getRootLogger().setLevel(Level.DEBUG); // Import OWL //String base = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/owlInput"; String base=args[0]; String baseUri= base.replace(" ", "%20"); URI ontologieUri = new URI("file://"+baseUri+"/duomoAnalysis.owl"); //URI ontologieUri = new URI( // "file:///Users/dwinter/Documents/Projekte/Diss%20-%20data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/owlInput/duomoAnalysis.owl"); OWLImporter owlDoc = null; try { owlDoc = new OWLImporter(base, ontologieUri); // owlDoc.printModel(); } catch (OntologyLoadException e) { e.printStackTrace(); System.exit(-1); } // read and parse lexfile //String lexFile = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/duomoData/LEXXDUMP.xml"; String lexFile=args[1]; LexDumpImporter lexDoc = null; try { lexDoc = new LexDumpImporter(lexFile); } catch (JDOMException e) { e.printStackTrace(); System.exit(-1); } catch (IOException e) { System.exit(-1); e.printStackTrace(); } // List<Element> cartas = lexDoc.getCartas(); // System.out.println(cartas.size()); LexOWLTransformer tf = new LexOWLTransformer(owlDoc, lexDoc); tf.transform(); //owlDoc.save("file:///tmp/out.owl"); } /** * Hautmethoden zur Transformation * @throws URISyntaxException * @throws Exception */ private void transform() throws URISyntaxException, Exception { List<Element> signatures = lexDoc.getSignatures(); // Element signature = signatures.get(0); int signatureCount = 0; int maxsign = signatures.size(); for (Element signature : signatures) { logger.debug(String.format("Signature: %s (%s)", signatureCount, maxsign)); signatureCount++; //if (signatureCount<5 || signatureCount>10) // continue; OWLIndividual signatureInd = createSignature(signature); @SuppressWarnings("unchecked") List<Element> cards = XPath.selectNodes(signature, ".//carta"); int cardsCount = 1; int maxcards = cards.size(); // Element card=cards.get(0); for (Element card : cards) { logger.debug(String.format("Cards: %s (%s)", cardsCount++, maxcards)); logger.debug(String.format("Signature: %s (%s)", signatureCount, maxsign)); OWLIndividual cardInd = createCard(card, signatureInd); @SuppressWarnings("unchecked") List<Element> records = XPath.selectNodes(card, ".//record"); for (Element record : records) { // Element record = records.get(0); OWLIndividual recordInd = createRecord(record, cardInd); OWLIndividual recordEvent = createEvent(record, recordInd); // Event // und // Records // sind // im // Original // im // record-tag } } //owlDoc.save("file:///tmp/out"+String.valueOf(signatureCount)+".owl"); //logger.debug("Saved:"+"/tmp/out"+String.valueOf(signatureCount)+".owl"); //owlDoc.reloadOWL(); } owlDoc.printModel(); } private OWLIndividual createEvent(Element record, OWLIndividual recordInd) { logger.debug("Create Event"); OWLIndividual eventInstance = owlDoc.createInstance("RecordedEvent"); owlDoc.setProperty(recordInd, "crm:P70_documents", eventInstance); // timespan try { String dateDcStart = lexDoc.getValue(record, ".//datrf/startdate"); String dateDcEnd = lexDoc.getValue(record, ".//datrf/enddate"); if (!dateDcStart.equals("")) { OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDcStart, dateDcEnd); owlDoc.setProperty(eventInstance, "crm:P4_has_time-span", timeSpan); } } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } // tipol String typology; try { typology = lexDoc.getValue(record, ".//tipol"); OWLIndividual typolInd = createOrGetTipol(typology); owlDoc.setProperty(eventInstance, "has_tipol", typolInd); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } // reges try { createNewDependingDataTypePropertyFromXpath(record, eventInstance, ".//reges/italian", "has_reges", "it"); createNewDependingDataTypePropertyFromXpath(record, eventInstance, ".//reges/english", "has_reges", "en"); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } // nomiq try { @SuppressWarnings("unchecked") List<Element> nomiqs = XPath.selectNodes(record, ".//nomiq"); OWLIndividual recordNamesRoles = null; for (Element nomiq : nomiqs) { String name = lexDoc.getValue(nomiq, "./name"); String role = lexDoc.getValue(nomiq, "./role"); String provenance = lexDoc.getValue(nomiq, "./name/provenance"); if (!name.equals("") && !role.equals("")) { recordNamesRoles = handleNameWithRole(recordInd, name, role); } else if (!role.equals("")) { recordNamesRoles = createOrGetRole(role); } else if (!name.equals("")) { recordNamesRoles = createOrGetName(name); if (provenance!=""){ owlDoc.setDataTypePropery(recordNamesRoles, "has_provenance_as_string", provenance, "it"); } } if (recordNamesRoles != null) { owlDoc.setProperty(eventInstance, "recordsDuomoObjectNameRoles", recordNamesRoles); String id = lexDoc.getValue(nomiq, "./@id"); individualIds.put(id, recordNamesRoles); } } } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } // istit try { String istit = lexDoc.getValue(record, ".//istit"); if (!istit.equals("")){ OWLIndividual istitInd = owlDoc.getIndividualByReadableId( "IndicesInstitutions", istit); if (istitInd == null) { istitInd = createOrGetInstitution(istit); } owlDoc.setProperty(eventInstance, "recordsDuomoObjectInstitution", istitInd); } } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } // types try { List<Element> types = XPath.selectNodes(record, ".//type"); for (Element type : types) { createType(eventInstance, type); } } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } return eventInstance; } private void createType(OWLIndividual eventInstance, Element type) { String typeId; try { typeId = lexDoc.getValue(type, "./ptr/@target"); String clsName = owlDoc.getClassNameFromTypeId(typeId); OWLIndividual typeInd = owlDoc.createInstance(clsName); owlDoc.setProperty(eventInstance, "has_topic", typeInd); List<Element> freeTexts = XPath.selectNodes(type, "./freetext"); for (Element freeText : freeTexts) { OWLNamedClass cls = owlDoc.getClassFromTypeId(typeId); OWLNamedClass subjectClass = getPreferredTargetClass(cls, "has_subject"); OWLNamedClass predicateClass = getPreferredTargetClass(cls, "has_predicate"); String subjPointer = lexDoc.getValue(freeText, "./sub/ptrtoperson/@target"); String subjText = lexDoc.getValue(freeText, "./sub"); OWLIndividual subjInd = createSubjectOrPredicate(subjectClass, subjPointer, subjText); //suche ob eine subpropery von materiaInvolved fuer die die zem Type (type) gehoerige Klasse (clsName) existiert // und wenn ja welche, TODO: zur Zeit wird dann aus dem String "subjText" das entsprechende Material erzeugt. //Collection<RDFProperty> props = cls.getPossibleRDFProperties(); RDFProperty superproperty= owlDoc.owlModel.getRDFProperty("http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/materialInvolved"); Collection<RDFSClass> sc = cls.getSuperclasses(true); OWLNamedClass mat = owlDoc.owlModel.getOWLNamedClass("http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/DuomoActivities"); if (sc.contains(mat)){ // cls is subclass of DuomoActivities Collection<RDFProperty> props = superproperty.getSubproperties(false); //jetzt suche zu welcher subproperty die classe cls gehšrt //dazu gehe durch alle subproperties von materialInvolved- for (RDFProperty prop:props){ @SuppressWarnings("unchecked") Collection<RDFSClass> domains = prop.getDomains(true); for(RDFSClass domain: domains){ //if (domain.getName().equals(cls.getName())) //suche jetzt die domaene zu diesen property in schaue ob die cls eine subklasse davon ist if (cls.isSubclassOf(domain)) // cls ist in der domaene der property { //die propery muss genau einen wert aus einer festen klasse haben, diese wird jetzt gesucht und dann eine individual erzeugt. Collection<AbstractOWLRestriction> restrictions = cls.getRestrictions(prop, false); // suche die restriction und erzeuge dann ein object dieses type for (AbstractOWLRestriction restriction: restrictions){ if (DefaultOWLAllValuesFrom.class.isInstance(restriction)){ DefaultOWLAllValuesFrom rest = (DefaultOWLAllValuesFrom)restriction; RDFResource restClass = rest.getAllValuesFrom(); OWLIndividual inst = owlDoc.createOrGetInstanceWithIdentifier(restClass.getLocalName(), "Identifier", subjText, false); owlDoc.setProperty(typeInd, prop.getLocalName(), inst); //materialInd = owlDoc.createInstance(res.getName()); } } } } } } String predPointer = lexDoc.getValue(freeText, "./pred/ptrtoperson/@target"); String predText = lexDoc.getValue(freeText, "./pred"); OWLIndividual predInd = createSubjectOrPredicate( predicateClass, predPointer, predText); if (subjInd != null) { owlDoc.setProperty(typeInd, "has_subject", subjInd); } if (predInd != null) { owlDoc.setProperty(typeInd, "has_predicate", predInd); } owlDoc.setProperty(eventInstance, "recordsDuomoObjectConcept", typeInd); } } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private OWLIndividual createSubjectOrPredicate(OWLNamedClass toClass, String subjPointer, String subjText) { OWLIndividual subjInd = null; if (!subjPointer.equals("")) { subjInd = owlDoc.createInstance(toClass.getName()); //subjInd = toClass.createOWLIndividual(null); OWLIndividual ind = individualIds.get(subjPointer); if (ind == null) { logger.debug("target ID does not exist:" + subjPointer); try { missing.write("target ID does not exist:" + subjPointer+"\n"); missing.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { owlDoc.setProperty(subjInd, "has_NameOrRoleFromIndex", ind); } } if (!subjText.equals("") & !subjText.equals(" ")) { if (subjInd == null) subjInd = owlDoc.createInstance(toClass.getName()); //subjInd = toClass.createOWLIndividual(null); OWLNamedClass idcls = owlDoc.owlModel .getOWLNamedClass("Identifier"); // is die klasse selbst // schon ein identifiert if (toClass.getNamedSuperclasses(true).contains(idcls)) { // to owlDoc.setProperty(subjInd, "rdfs:label", subjText); } else { OWLIndividual ident = owlDoc .createInstance("IdentifierPredicateOrSubject"); owlDoc.setProperty(ident, "rdfs:label", subjText); owlDoc.setProperty(subjInd, "crm:P48_has_preferred_identifier", ident); } } return subjInd; } private OWLNamedClass getPreferredTargetClass(OWLNamedClass cls, String propertyName) { RDFProperty prop = owlDoc.owlModel.getRDFProperty(propertyName); // finde welche klasse als subject erlaubt ist Collection<?> restrictions = cls.getRestrictions(prop, true); RDFResource restrictionValues = prop.getRange(); // nimm erstmal den // gesammten Range // schaue jetzt nach ob eb es einschraenkungen gibt. for (Iterator<?> it = restrictions.iterator(); it.hasNext();) { Object restriction = it.next(); if (OWLAllValuesFrom.class.isInstance(restriction)) { OWLAllValuesFrom ar = (OWLAllValuesFrom) restriction; restrictionValues = ar.getAllValuesFrom(); break; } } OWLNamedClass toClass = null; if (OWLNamedClass.class.isInstance(restrictionValues)) { toClass = (OWLNamedClass) restrictionValues; } else if (OWLDataRange.class.isInstance(restrictionValues)) { RDFList dr = ((OWLDataRange) restrictionValues).getOneOf(); for (Object d : dr.getValues()) { System.out.println(d); toClass = (OWLNamedClass) d; // FIXME: geht das?? } } else if (DefaultOWLUnionClass.class.isInstance(restrictionValues)) { // mehr // als // eine // moeglich DefaultOWLUnionClass ou = (DefaultOWLUnionClass) restrictionValues; Set set = new HashSet(); ou.getNestedNamedClasses(set); for (Iterator<?> it = set.iterator(); it.hasNext();) { OWLNamedClass cl = (OWLNamedClass) it.next(); OWLNamedClass idcls = owlDoc.owlModel .getOWLNamedClass("Identifier"); // nimm die Klasse die // ein Identifier // ist. if (cl.getNamedSuperclasses(true).contains(idcls)) { toClass = cl; break; } } } return toClass; } private OWLIndividual handleNameWithRole(OWLIndividual recordInd, String name, String role) { // teste ob schon ein solcer Eintrag existiert OWLIndividual nameInd = owlDoc.getIndividualByReadableId( "IndicesNames", name); Boolean createNewNameWithRole = false; if (nameInd == null) { nameInd = createOrGetName(name); createNewNameWithRole = true; // name existierte nicht dann kann // auch NameWithRole nicht // existierten. } OWLIndividual roleInd = owlDoc.getIndividualByReadableId( "IndicesRoles", role); if (roleInd == null) { roleInd = createOrGetRole(role); createNewNameWithRole = true; // role existierte nicht dann kann // auch NameWithRole nicht // existierten. } OWLIndividual nameWithRoleInd = null; if (!createNewNameWithRole) { // schon klar, dass er nicht existiert nameWithRoleInd = getNameWithRole(nameInd, roleInd); } if (nameWithRoleInd == null) { // existiert nicht nameWithRoleInd = createNameWithRole(nameInd, roleInd); } return nameWithRoleInd; } private OWLIndividual getNameWithRole(OWLIndividual nameInd, OWLIndividual roleInd) { List<OWLIndividual> indicesWithNames = owlDoc.getIndividuals( "IndicesNameWithRole", "refers_to_name", nameInd); // suche alle // infrage // kommenden // nameen if (indicesWithNames == null) { // kein Treffer return null; } for (OWLIndividual name : indicesWithNames) { Object role = owlDoc.getRelatedIndividual(name, "refers_to_role"); if (roleInd.equals(role)) { return name; } } return null; } private OWLIndividual createOrGetTipol(String typology) { OWLIndividual tipol = owlDoc.createOrGetInstanceWithIdentifier("Typology","Identifier",typology,false); return tipol; } private OWLIndividual createNameWithRole(OWLIndividual nameInd, OWLIndividual roleInd) { OWLIndividual nameWithRoleInd = owlDoc .createInstance("IndicesNameWithRole"); owlDoc.setProperty(nameWithRoleInd, "refers_to_name", nameInd); owlDoc.setProperty(nameWithRoleInd, "refers_to_role", roleInd); return nameWithRoleInd; } private OWLIndividual createOrGetInstitution(String name) { OWLIndividual nameInd = owlDoc.createOrGetInstanceWithIdentifier("IndicesInstitutions","IdentifierInstitutions",name,false); return nameInd; } private OWLIndividual createOrGetName(String name) { OWLIndividual nameInd = owlDoc.createOrGetInstanceWithIdentifier("IndicesNames","IdentifierNames",name,false); return nameInd; } private OWLIndividual createOrGetRole(String name) { OWLIndividual roleInd = owlDoc.createOrGetInstanceWithIdentifier("IndicesRoles","IdentifierRoles",name,false); return roleInd; } private OWLIndividual createRecord(Element record, OWLIndividual cardInd) throws JDOMException { OWLIndividual recordInstance = owlDoc.createInstance("Record"); owlDoc.setProperty(recordInstance, "is_on_card", cardInd); createNewDependingInstanceFromXpath(record, recordInstance, "./@id", new String[] { "rdfs:label" }, "IdentifierCurrent", "crm:P48_has_preferred_identifier"); String value = lexDoc.getValue(record, ".//textblockid"); if (!value.equals("")) owlDoc.setProperty(recordInstance, "has_textblockid", value); String endOnCarta = lexDoc.getValue(record, "./@end_on_carta"); //FIXME: addRecordToCarta ist buggy. siehe dort! ausserdem wir nicht berŸcksichtig, dass zwischen // card und end_on_carta mehr als eine liegen kann, zur Zeit wird nur die carta die in end_on_carta beschrieben wird zu // record mittels is_on_card hinzugefŸgt. if (!endOnCarta.equals("")) { OWLIndividual signature = (OWLIndividual) owlDoc .getRelatedIndividual(cardInd, "has_signature"); addRecordToCarta(recordInstance, endOnCarta, signature); } String dateDcStart = lexDoc.getValue(record, ".//datdc/startdate"); String dateDcEnd = lexDoc.getValue(record, ".//datdc/enddate"); OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDcStart, dateDcEnd); owlDoc.setProperty(recordInstance, "crm:P4_has_time-span", timeSpan); return recordInstance; } private void addRecordToCarta(OWLIndividual recordInstance, String cardID, OWLIndividual signature) { //FIXME: cartID ist nur innerhalb einer Signatur eindeutig, d.h. h, es muss die cardID gefunden werden die in der // selben signatur lebt wir "signature" OWLIndividual card = owlDoc.getIndividualByReadableId("Card", cardID); if (card == null) { card = createCard(cardID, signature); } owlDoc.setProperty(recordInstance, "is_on_card", card); } private OWLIndividual createSignature(Element signature) throws JDOMException { Element segHeaderElement = (Element) XPath.selectSingleNode(signature, ".//segheader"); String segheader = segHeaderElement.getTextTrim(); OWLIndividual signatureInstance = owlDoc.createInstance("Signatur"); owlDoc.setProperty(signatureInstance, "rdfs:label", segheader); return signatureInstance; } private OWLIndividual createCard(String cardId, OWLIndividual signature) { OWLIndividual cardInstance = owlDoc.createInstance("Card"); owlDoc.setProperty(cardInstance, "has_signature", signature); OWLIndividual preferredId = owlDoc.createInstance("IdentifierCurrent"); owlDoc.setProperty(preferredId, "rdfs:label", cardId); owlDoc.setProperty(cardInstance, "crm:P48_has_preferred_identifier", preferredId); return cardInstance; } private OWLIndividual createCard(Element card, OWLIndividual signatureInd) { OWLIndividual cardInstance = owlDoc.createInstance("Card"); try { createNewDependingInstanceFromXpath(card, cardInstance, ".//cartanr", new String[] { "rdfs:label" }, "IdentifierCurrent", "crm:P48_has_preferred_identifier"); createNewDependingInstanceFromXpath(card, cardInstance, ".//cartaant", new String[] { "rdfs:label" }, "IdentifierCurrent", "crm:P1_is_identified_by"); owlDoc.setProperty(cardInstance, "has_signature", signatureInd); } catch (JDOMException e) { e.printStackTrace(); return null; } return cardInstance; } private void createNewDependingInstanceFromXpath(Element card, OWLIndividual cardInstance, String xpath, String[] propertyNames, String newInstanceClassName, String relationNameToNewInstance) throws JDOMException { List<?> identifierIdEls = (List<?>) XPath.selectNodes(card, xpath); for (Object identifierIdEl : identifierIdEls) { String identifierId = ""; if (Element.class.isInstance(identifierIdEl)) { identifierId = ((Element) identifierIdEl).getTextTrim(); } else if (Attribute.class.isInstance(identifierIdEl)) { identifierId = ((Attribute) identifierIdEl).getValue(); } OWLIndividual identifier = owlDoc .createInstance(newInstanceClassName); for (int i = 0; i < propertyNames.length; i++) { owlDoc.setProperty(identifier, propertyNames[i], identifierId); } owlDoc.setProperty(cardInstance, relationNameToNewInstance, identifier); } } // createNewDependingDataTypePropertyFromXpath(record, eventInstance, // ".//resges/italian", "has_reges", // "italian"); private void createNewDependingDataTypePropertyFromXpath(Element record, OWLIndividual eventInstance, String xpath, String propertyName, String lang) throws JDOMException { List<?> identifierIdEls = (List<?>) XPath.selectNodes(record, xpath); for (Object identifierIdEl : identifierIdEls) { String identifierId = ""; if (Element.class.isInstance(identifierIdEl)) { identifierId = ((Element) identifierIdEl).getTextTrim(); } else if (Attribute.class.isInstance(identifierIdEl)) { identifierId = ((Attribute) identifierIdEl).getValue(); } owlDoc.setDataTypePropery(eventInstance, propertyName, identifierId, lang); } } }