# HG changeset patch
# User dwinter
# Date 1340291302 -7200
# Node ID 919e9f3b5efd3e59b2883260d42f3211f0c3558c
# Parent 19e40abb3e8ad89e50be9925d1af662a3814d583
New classes for text analysis (Stanford parser integrated).
All has_readable_* datatype properties replaced by rdfs:label.
diff -r 19e40abb3e8a -r 919e9f3b5efd .classpath
--- a/.classpath Wed Feb 09 16:36:36 2011 +0100
+++ b/.classpath Thu Jun 21 17:08:22 2012 +0200
@@ -7,5 +7,12 @@
+
+
+
+
+
+
+
diff -r 19e40abb3e8a -r 919e9f3b5efd libs/commons-lang-2.0.jar
Binary file libs/commons-lang-2.0.jar has changed
diff -r 19e40abb3e8a -r 919e9f3b5efd protege.properties
--- a/protege.properties Wed Feb 09 16:36:36 2011 +0100
+++ b/protege.properties Thu Jun 21 17:08:22 2012 +0200
@@ -1,5 +1,5 @@
#Protege Properties
-#Wed Feb 09 16:23:54 CET 2011
+#Thu Jun 21 17:04:13 CEST 2012
SwitchableClassDefinitionType=edu.stanford.smi.protegex.owl.ui.cls.LogicClassDefinitionWidgetType
history.projects.reopen=file\:/Users/dwinter/Documents/Projekte/Diss%20-%20data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/examples/newspaper/newspaper.pprj,file\:/Users/dwinter/Documents/Projekte/Diss%20-%20data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/examples/sqwrl/SQWRLExamples.pprj,file\:/Users/dwinter/Documents/Projekte/Diss%20-%20data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/examples/pizza/pizza.owl.pprj
OntURIBase=http\://www.owl-ontologies.com
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/lexdump/LexDumpImporter.java
--- a/src/de/mpiwg/dwinter/duomo/lexdump/LexDumpImporter.java Wed Feb 09 16:36:36 2011 +0100
+++ b/src/de/mpiwg/dwinter/duomo/lexdump/LexDumpImporter.java Thu Jun 21 17:08:22 2012 +0200
@@ -2,19 +2,25 @@
import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
+import org.jdom.Text;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;
+
public class LexDumpImporter {
private Document doc;
+ private Logger logger;
public LexDumpImporter(String path) throws JDOMException, IOException{
@@ -22,6 +28,7 @@
doc = builder.build(new File(path));
+ logger = Logger.getRootLogger();
}
@SuppressWarnings("unchecked")
@@ -40,7 +47,18 @@
if (node==null){
return "";
} else if (Element.class.isInstance(node)){
- return ((Element)node).getTextTrim();
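+ // collect the trimmed text of all child elements and text nodes and join them with single spaces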
+ List<String> retArray=new ArrayList<String>();
+ for (Object o: ((Element)node).getContent())
+ {
+ if(Element.class.isInstance(o)){
+ retArray.add(((Element)o).getTextTrim());
+ } else if(Text.class.isInstance(o)) {
+ retArray.add(((Text)o).getTextTrim());
+ }
+ }
+ Object[] X = retArray.toArray();
+ return StringUtils.join(X,' ');
+ //return ((Element)node).getTextTrim();
} else if (Attribute.class.isInstance(node)){
return ((Attribute)node).getValue();
}
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/lexdump/LexOWLTransformer.java
--- a/src/de/mpiwg/dwinter/duomo/lexdump/LexOWLTransformer.java Wed Feb 09 16:36:36 2011 +0100
+++ b/src/de/mpiwg/dwinter/duomo/lexdump/LexOWLTransformer.java Thu Jun 21 17:08:22 2012 +0200
@@ -26,7 +26,9 @@
import com.sun.xml.internal.ws.developer.MemberSubmissionEndpointReference.Elements;
import edu.stanford.smi.protege.exception.OntologyLoadException;
+import edu.stanford.smi.protege.model.Facet;
import edu.stanford.smi.protege.model.Instance;
+import edu.stanford.smi.protege.model.Slot;
import edu.stanford.smi.protegex.owl.jena.JenaOWLModel;
import edu.stanford.smi.protegex.owl.model.OWLAllValuesFrom;
import edu.stanford.smi.protegex.owl.model.OWLClass;
@@ -38,9 +40,17 @@
import edu.stanford.smi.protegex.owl.model.RDFList;
import edu.stanford.smi.protegex.owl.model.RDFProperty;
import edu.stanford.smi.protegex.owl.model.RDFResource;
+import edu.stanford.smi.protegex.owl.model.RDFSClass;
import edu.stanford.smi.protegex.owl.model.impl.AbstractOWLQuantifierRestriction;
+import edu.stanford.smi.protegex.owl.model.impl.AbstractOWLRestriction;
+import edu.stanford.smi.protegex.owl.model.impl.DefaultOWLAllValuesFrom;
import edu.stanford.smi.protegex.owl.model.impl.DefaultOWLUnionClass;
+/**
+ * @author dwinter
+ *
+ * Transforms the lex files into OWL.
+ */
public class LexOWLTransformer {
private Logger logger = Logger.getRootLogger();
private FileWriter missing;
@@ -106,9 +116,14 @@
LexOWLTransformer tf = new LexOWLTransformer(owlDoc, lexDoc);
tf.transform();
- owlDoc.save("file:///tmp/out.owl");
+ //owlDoc.save("file:///tmp/out.owl");
}
+ /**
+ * Main method for the transformation.
+ * @throws URISyntaxException
+ * @throws Exception
+ */
private void transform() throws URISyntaxException, Exception {
List signatures = lexDoc.getSignatures();
@@ -169,9 +184,9 @@
// timespan
try {
- String dateDcStart = lexDoc.getValue(record, ".//datdf/startdate");
+ String dateDcStart = lexDoc.getValue(record, ".//datrf/startdate");
- String dateDcEnd = lexDoc.getValue(record, ".//datdf/startdate");
+ String dateDcEnd = lexDoc.getValue(record, ".//datrf/enddate");
if (!dateDcStart.equals("")) {
OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDcStart,
@@ -217,12 +232,19 @@
String name = lexDoc.getValue(nomiq, "./name");
String role = lexDoc.getValue(nomiq, "./role");
+ String provenance = lexDoc.getValue(nomiq, "./name/provenance");
+
+
+
if (!name.equals("") && !role.equals("")) {
recordNamesRoles = handleNameWithRole(recordInd, name, role);
} else if (!role.equals("")) {
recordNamesRoles = createOrGetRole(role);
} else if (!name.equals("")) {
recordNamesRoles = createOrGetName(name);
+ if (!provenance.equals("")){
+ owlDoc.setDataTypePropery(recordNamesRoles, "has_provenance_as_string", provenance, "it");
+ }
}
if (recordNamesRoles != null) {
@@ -274,21 +296,23 @@
}
private void createType(OWLIndividual eventInstance, Element type) {
-
+
+
String typeId;
try {
typeId = lexDoc.getValue(type, "./ptr/@target");
String clsName = owlDoc.getClassNameFromTypeId(typeId);
- OWLNamedClass cls = owlDoc.getClassFromTypeId(typeId);
OWLIndividual typeInd = owlDoc.createInstance(clsName);
- OWLNamedClass subjectClass = getPreferredTargetClass(cls,
- "has_subject");
- OWLNamedClass predicateClass = getPreferredTargetClass(cls,
- "has_predicate");
-
+ owlDoc.setProperty(eventInstance, "has_topic", typeInd);
List<Element> freeTexts = XPath.selectNodes(type, "./freetext");
for (Element freeText : freeTexts) {
+ OWLNamedClass cls = owlDoc.getClassFromTypeId(typeId);
+
+ OWLNamedClass subjectClass = getPreferredTargetClass(cls,
+ "has_subject");
+ OWLNamedClass predicateClass = getPreferredTargetClass(cls,
+ "has_predicate");
String subjPointer = lexDoc.getValue(freeText,
"./sub/ptrtoperson/@target");
@@ -296,6 +320,55 @@
OWLIndividual subjInd = createSubjectOrPredicate(subjectClass,
subjPointer, subjText);
+
+
+ // check whether a subproperty of materialInvolved exists for the class (clsName) that belongs to the type (type),
+ // and if so which one. TODO: at the moment the corresponding material is then created from the string "subjText".
+
+
+ //Collection props = cls.getPossibleRDFProperties();
+
+
+
+ RDFProperty superproperty= owlDoc.owlModel.getRDFProperty("http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/materialInvolved");
+
+ Collection sc = cls.getSuperclasses(true);
+
+ OWLNamedClass mat = owlDoc.owlModel.getOWLNamedClass("http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/DuomoActivities");
+
+ if (sc.contains(mat)){ // cls is subclass of DuomoActivities
+ Collection<RDFProperty> props = superproperty.getSubproperties(false);
+
+ // now find out to which of these subproperties the class cls belongs
+
+ // to do so, iterate over all subproperties of materialInvolved
+ for (RDFProperty prop:props){
+
+ @SuppressWarnings("unchecked")
+ Collection<RDFSClass> domains = prop.getDomains(true);
+
+ for(RDFSClass domain: domains){
+ //if (domain.getName().equals(cls.getName()))
+
+ // now look at the domain of this property and check whether cls is a subclass of it
+ if (cls.isSubclassOf(domain)) // cls is in the domain of the property
+ {
+
+ // the property must take exactly one value from a fixed class; find that class and then create an individual of it.
+ Collection<AbstractOWLRestriction> restrictions = cls.getRestrictions(prop, false); // find the restriction and then create an object of its type
+ for (AbstractOWLRestriction restriction: restrictions){
+ if (DefaultOWLAllValuesFrom.class.isInstance(restriction)){
+ DefaultOWLAllValuesFrom rest = (DefaultOWLAllValuesFrom)restriction;
+ RDFResource restClass = rest.getAllValuesFrom();
+ OWLIndividual inst = owlDoc.createOrGetInstanceWithIdentifier(restClass.getLocalName(), "Identifier", subjText, false);
+ owlDoc.setProperty(typeInd, prop.getLocalName(), inst);
+ //materialInd = owlDoc.createInstance(res.getName());
+ }
+ }
+ }
+ }
+ }
+ }
String predPointer = lexDoc.getValue(freeText,
"./pred/ptrtoperson/@target");
@@ -328,7 +401,8 @@
OWLIndividual subjInd = null;
if (!subjPointer.equals("")) {
- subjInd = toClass.createOWLIndividual(null);
+ subjInd = owlDoc.createInstance(toClass.getName());
+ //subjInd = toClass.createOWLIndividual(null);
OWLIndividual ind = individualIds.get(subjPointer);
if (ind == null) {
logger.debug("target ID does not exist:" + subjPointer);
@@ -344,20 +418,21 @@
}
}
- if (!subjText.equals("")) {
+ if (!subjText.equals("") & !subjText.equals(" ")) {
if (subjInd == null)
- subjInd = toClass.createOWLIndividual(null);
+ subjInd = owlDoc.createInstance(toClass.getName());
+ //subjInd = toClass.createOWLIndividual(null);
OWLNamedClass idcls = owlDoc.owlModel
.getOWLNamedClass("Identifier"); // is the class itself
// already an identifier
if (toClass.getNamedSuperclasses(true).contains(idcls)) { // to
- owlDoc.setProperty(subjInd, "has_readable_id", subjText);
+ owlDoc.setProperty(subjInd, "rdfs:label", subjText);
} else {
OWLIndividual ident = owlDoc
.createInstance("IdentifierPredicateOrSubject");
- owlDoc.setProperty(ident, "has_readable_id", subjText);
+ owlDoc.setProperty(ident, "rdfs:label", subjText);
owlDoc.setProperty(subjInd, "crm:P48_has_preferred_identifier",
ident);
}
@@ -513,7 +588,7 @@
OWLIndividual recordInstance = owlDoc.createInstance("Record");
owlDoc.setProperty(recordInstance, "is_on_card", cardInd);
createNewDependingInstanceFromXpath(record, recordInstance, "./@id",
- new String[] { "has_readable_id", "rdfs:label" },
+ new String[] { "rdfs:label" },
"IdentifierCurrent", "crm:P48_has_preferred_identifier");
String value = lexDoc.getValue(record, ".//textblockid");
@@ -521,14 +596,18 @@
owlDoc.setProperty(recordInstance, "has_textblockid", value);
String endOnCarta = lexDoc.getValue(record, "./@end_on_carta");
+
+ //FIXME: addRecordToCarta is buggy, see there! In addition it is not taken into account that between
+ // card and end_on_carta there can be more than one carta; at the moment only the carta described in end_on_carta is attached to the
+ // record via is_on_card.
if (!endOnCarta.equals("")) {
OWLIndividual signature = (OWLIndividual) owlDoc
.getRelatedIndividual(cardInd, "has_signature");
- addRecordToCarta(recordInstance, value, signature);
+ addRecordToCarta(recordInstance, endOnCarta, signature);
}
String dateDcStart = lexDoc.getValue(record, ".//datdc/startdate");
- String dateDcEnd = lexDoc.getValue(record, ".//datdc/startdate");
+ String dateDcEnd = lexDoc.getValue(record, ".//datdc/enddate");
OWLIndividual timeSpan = owlDoc.createTimeSpan(dateDcStart, dateDcEnd);
@@ -540,6 +619,8 @@
private void addRecordToCarta(OWLIndividual recordInstance, String cardID,
OWLIndividual signature) {
+ //FIXME: cardID is only unique within a signature, i.e. the cardID that lives in the same
+ // signature as "signature" has to be found
OWLIndividual card = owlDoc.getIndividualByReadableId("Card", cardID);
if (card == null) {
@@ -567,7 +648,6 @@
owlDoc.setProperty(cardInstance, "has_signature", signature);
OWLIndividual preferredId = owlDoc.createInstance("IdentifierCurrent");
- owlDoc.setProperty(preferredId, "has_readable_id", cardId);
owlDoc.setProperty(preferredId, "rdfs:label", cardId);
owlDoc.setProperty(cardInstance, "crm:P48_has_preferred_identifier",
@@ -583,13 +663,11 @@
try {
createNewDependingInstanceFromXpath(card, cardInstance,
- ".//cartanr", new String[] { "has_readable_id",
- "rdfs:label" }, "IdentifierCurrent",
+ ".//cartanr", new String[] { "rdfs:label" }, "IdentifierCurrent",
"crm:P48_has_preferred_identifier");
createNewDependingInstanceFromXpath(card, cardInstance,
- ".//cartaant", new String[] { "has_readable_id",
- "rdfs:label" }, "IdentifierCurrent",
+ ".//cartaant", new String[] { "rdfs:label" }, "IdentifierCurrent",
"crm:P1_is_identified_by");
owlDoc.setProperty(cardInstance, "has_signature", signatureInd);
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/lexdump/OWLImporter.java
--- a/src/de/mpiwg/dwinter/duomo/lexdump/OWLImporter.java Wed Feb 09 16:36:36 2011 +0100
+++ b/src/de/mpiwg/dwinter/duomo/lexdump/OWLImporter.java Thu Jun 21 17:08:22 2012 +0200
@@ -1,8 +1,10 @@
package de.mpiwg.dwinter.duomo.lexdump;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
+import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
@@ -11,6 +13,8 @@
import java.util.Iterator;
import java.util.List;
+import javax.swing.text.html.HTMLDocument.HTMLReader.IsindexAction;
+
import org.apache.log4j.Logger;
import edu.stanford.smi.protege.exception.OntologyLoadException;
@@ -24,6 +28,19 @@
import edu.stanford.smi.protegex.owl.repository.RepositoryManager;
import edu.stanford.smi.protegex.owl.repository.impl.LocalFolderRepository;
+/**
+ * Creates and manages entities according to an OWL model.
+ * All created entities are additionally written directly as N-Triples/Turtle to a file.
+ *
+ * TODO: this file is currently always /tmp/out.rdf
+ * TODO: for the output there is @see{org.openrdf.rio.trig.TriGWriter} or @see{de.mpiwg.itgroup.triplestoremanager.tools.Exporter},
+ * which should be used instead of the hand-rolled code that writes directly to outRDF.
+ *
+ * TODO: some methods here are still DUOMO- and CRM-specific, in particular
+ * @author dwinter
+ *
+ *
+ */
public class OWLImporter {
JenaOWLModel owlModel; // contains the model
@@ -33,7 +50,16 @@
private HashMap typeId2class=null;
private String ontFolder;
private URI uri;
+ //private FileWriter outRDF;
+ private OutputStreamWriter outRDF;
+ /**
+ * Initializes the class and
+ * loads the ontologies.
+ * @param folder folder containing the ontology to be processed
+ * @param uri URI of the ontology itself
+ * @throws OntologyLoadException
+ */
public OWLImporter(String folder, URI uri) throws OntologyLoadException {
// owlModel = ProtegeOWL.createJenaOWLModelFromURI(uri);
@@ -41,6 +67,7 @@
this.uri=uri;
try {
this.fh= new FileWriter(new File("/tmp/identifier"));
+ this.outRDF= new OutputStreamWriter(new FileOutputStream("/tmp/out.rdf"),"UTF-8");
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
@@ -48,10 +75,21 @@
loadOWL(folder, uri);
}
+ /**
+ * Reloads the ontology.
+ * @throws OntologyLoadException
+ */
public void reloadOWL() throws OntologyLoadException{
loadOWL(ontFolder, uri);
}
+ /**
+ *
+ * Loads the ontologies.
+ * @param folder folder containing the ontology to be processed
+ * @param uri URI of the ontology itself
+ * @throws OntologyLoadException
+ */
public void loadOWL(String folder, URI uri) throws OntologyLoadException {
owlModel = ProtegeOWL.createJenaOWLModel();
// Load repository
@@ -84,9 +122,9 @@
}
public static void main(String args[]) throws URISyntaxException {
- String base = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/owlInput";
+ String base = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/duomoData/owl-version";
URI ontologieUri = new URI(
- "file:///Users/dwinter/Documents/Projekte/Diss%20-%20data-mining/eclipseWorkspace/de.mpiwg.dwinter.duomo/owlInput/duomoAnalysis.owl");
+ "file:///Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/duomoData/owl-version/protege_version_duomo/duomoAnalysis.owl");
try {
OWLImporter im = new OWLImporter(base, ontologieUri);
@@ -97,27 +135,81 @@
}
}
- public OWLIndividual createInstance(String string) {
+ /** Creates an instance and writes the corresponding relation to the export file.
+ * @param fullClassName name of the class
+ * @return null if the object cannot be created.
+ */
+ public OWLIndividual createInstance(String fullClassName) {
OWLNamedClass owlclass = (OWLNamedClass) owlModel
- .getOWLNamedClass(string);
+ .getOWLNamedClass(fullClassName);
if (owlclass == null) {
- logger.debug("Cannot find OWLClass:" + string);
+ logger.debug("Cannot find OWLClass:" + fullClassName);
return null;
}
//logger.debug("Create new individual of type:"+string);
- return owlclass.createOWLIndividual(null);
+
+
+ OWLIndividual ind = owlclass.createOWLIndividual(null);
+
+ //TODO: replace the following by @see{org.openrdf.rio.trig.TriGWriter}
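+ // additionally dump the new individual and its class as a line in /tmp/out.rdf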
+ String triple = String.format("<%s> <%s>.\n", ind.getName(),owlclass.getName());
+ try {
+ outRDF.write(triple);
+ outRDF.flush();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ return ind;
}
- public void setProperty(OWLIndividual identifier, String propertyName,
+
+ /**
+ * Sets a property on an individual and writes the corresponding information to the out file.
+ * @param individual
+ * @param propertyName
+ * @param value
+ */
+ public void setProperty(OWLIndividual individual, String propertyName,
Object value) {
RDFProperty prop = owlModel.getRDFProperty(propertyName);
- identifier.setPropertyValue(prop, value);
-
+ individual.setPropertyValue(prop, value);
+
+
+
+ //TODO: replace the following by @see{org.openrdf.rio.trig.TriGWriter}
+ String valName="";
+
+
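+ // write the object either as a resource (<name>) or as an escaped string literal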
+ if (OWLIndividual.class.isInstance(value))
+ valName="<"+((OWLIndividual)value).getName()+">";
+ else
+ valName="\""+escapeRDFLit((String)value)+"\"";
+
+
+ String triple = String.format("<%s> <%s> %s.\n", individual.getName(),prop.getName(),valName);
+ try {
+ outRDF.write(triple);
+ outRDF.flush();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
- public void setDataTypePropery(OWLIndividual eventInstance,
+ private String escapeRDFLit(String string){
+ return string.replace("\"", "");
+ }
+
+ /** Sets a datatype property and writes the corresponding information to the out file.
+ * @param individual
+ * @param propertyName
+ * @param value
+ * @param lang language tag
+ */
+ public void setDataTypePropery(OWLIndividual individual,
String propertyName, String value, String lang) {
RDFProperty prop = owlModel.getRDFProperty(propertyName);
// if(OWLDatatypeProperty.class.isInstance(prop)){
@@ -129,20 +221,35 @@
// }
+
RDFSLiteral langLiteral = owlModel.createRDFSLiteral(value, lang);
- eventInstance.setPropertyValue(prop, langLiteral);
-
+ individual.setPropertyValue(prop, langLiteral);
+ //TODO: replace the following by @see{org.openrdf.rio.trig.TriGWriter}
+ String triple = String.format("<%s> <%s> \"%s\"@%s.\n", individual.getName(),prop.getName(),escapeRDFLit(langLiteral.getString()),lang);
+ try {
+ outRDF.write(triple);
+ outRDF.flush();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
- public Object getRelatedIndividual(OWLIndividual cardInd,
+ /**
+ * Returns the individual that is connected to the source individual via the given property.
+ * @param sourceIndividual
+ * @param propertyName
+ * @return
+ */
+ public Object getRelatedIndividual(OWLIndividual sourceIndividual,
String propertyName) {
RDFProperty prop = owlModel.getRDFProperty(propertyName);
if (prop == null) {
logger.debug("Property does not exist:" + propertyName);
}
- Object value = cardInd.getPropertyValue(prop);
+ Object value = sourceIndividual.getPropertyValue(prop);
return value;
}
@@ -263,32 +370,75 @@
}
public OWLIndividual createTimeSpan(String dateDcStart, String dateDcEnd) {
- OWLIndividual timeSpan = createInstance("DatesDocument");
+ // A date is described by a timespan, which in turn is described by an appellation
-
+ //TODO: the human-readable dates belong in the time span together with qualifiers such as "approximately";
+ // i.e. the brackets around a date that signal "approximately" belong in the description of the
+ // qualifier in the time span,
+ // while the exact parts go into DuomoDate_Appellation.
+ // For now, "has_readable" simply records exactly what is written in the text;
+ // this still has to be analysed.
+ // Thus "1432 gennaio 9" should be turned into a machine-readable model,
+ // and "1432 gennaio (9)" has to be entered as "1432 gennaio 9" in DuomoDate_Appellation
+ // and then described more precisely with P79 and P80 in DuomoDate_TimeSpan, i.e. P79 then holds
+ // "approximately".
+ // In addition, "POST" and "ANTE" from the text have to be carried over into the model.
- setProperty(timeSpan, "has_readable_date", dateDcStart);
+
+ OWLIndividual timeSpan = createInstance("DuomoDate_TimeSpan");
+
+ OWLIndividual date= createInstance("DuomoDate_Appellation");
+
+ setProperty(date, "rdfs:label", dateDcStart);
+ //setProperty(timeSpan, "has_readable_date", dateDcStart);
+
+ setProperty(timeSpan,"is_identified_by_Date",date);
if(!(dateDcEnd==null || dateDcEnd.equals(""))){
- setProperty(timeSpan, "has_readable_to_date", dateDcEnd);
+ OWLIndividual toDate= createInstance("DuomoDate_Appellation");
+ setProperty(toDate, "rdfs:label", dateDcEnd);
+ //setProperty(timeSpan, "has_readable_toDate", dateDcEnd);
+ setProperty(timeSpan,"is_identified_by_toDate",toDate);
} else {
dateDcEnd=dateDcStart;
}
OWLIndividual timeSpanIdentifier = createInstance("Identifier");
- setProperty(timeSpanIdentifier,"has_readable_id",dateDcStart+"-"+dateDcEnd);
+ setProperty(timeSpanIdentifier,"rdfs:label",dateDcStart+"-"+dateDcEnd);
setProperty(timeSpan,"crm:P48_has_preferred_identifier",timeSpanIdentifier);
return timeSpan;
}
+
+// public OWLIndividual createTimeSpan2(String dateDcStart, String dateDcEnd) {
+// OWLIndividual timeSpan = createInstance("DatesDocument");
+//
+//
+//
+// setProperty(timeSpan, "has_readable_date", dateDcStart);
+//
+// if(!(dateDcEnd==null || dateDcEnd.equals(""))){
+// setProperty(timeSpan, "has_readable_to_date", dateDcEnd);
+// } else {
+// dateDcEnd=dateDcStart;
+// }
+//
+// OWLIndividual timeSpanIdentifier = createInstance("Identifier");
+// setProperty(timeSpanIdentifier,"has_readable_id",dateDcStart+"-"+dateDcEnd);
+//
+// setProperty(timeSpan,"crm:P48_has_preferred_identifier",timeSpanIdentifier);
+//
+// return timeSpan;
+// }
+
public OWLIndividual getIndividualByReadableId(String className,String identifier){
- return getIndividual(className, "crm:P48_has_preferred_identifier", "Identifier", "has_readable_id", identifier, true);
+ return getIndividual(className, "crm:P48_has_preferred_identifier", "Identifier", "rdfs:label", identifier, true);
}
public OWLIndividual getIndividualByReadableId(String className,String identifier, String classNameIdentifier,boolean subclassedIdentifier){
- return getIndividual(className, "crm:P48_has_preferred_identifier", classNameIdentifier, "has_readable_id", identifier,subclassedIdentifier);
+ return getIndividual(className, "crm:P48_has_preferred_identifier", classNameIdentifier, "rdfs:label", identifier,subclassedIdentifier);
}
public String getClassNameFromTypeId(String typeId) {
@@ -333,11 +483,14 @@
public OWLIndividual createOrGetInstanceWithIdentifier(String classNameInstance,
String classNameIdentifier, String identifier,boolean followSubclasses) {
+
+ identifier=org.apache.commons.lang.StringUtils.strip(identifier);
+
OWLIndividual ind = getIndividualByReadableId(classNameInstance, identifier,classNameIdentifier,followSubclasses);
if(ind==null){
ind = createInstance(classNameInstance);
OWLIndividual identifierInd = createInstance(classNameIdentifier);
- setProperty(identifierInd, "has_readable_id", identifier);
+ setProperty(identifierInd, "rdfs:label", identifier);
try {
fh.write(classNameInstance+" --" +classNameIdentifier+"---"+identifier+"\n");
fh.flush();
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/stanford/Analyse.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/Analyse.java Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,182 @@
+package de.mpiwg.dwinter.duomo.stanford;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import edu.stanford.nlp.io.EncodingPrintWriter.out;
+import edu.stanford.nlp.ling.CyclicCoreLabel;
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+
+public class Analyse {
+
+ public void analyse(String filename) throws IOException {
+
+ LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
+ // This option shows loading, sentence-segmenting and tokenizing
+ // a file using DocumentPreprocessor
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+ // You could also create a tokenizer here (as below) and pass it
+ // to DocumentPreprocessor
+
+ int count=0;
+ Map<String, Integer> tuple = new HashMap<String, Integer>();
+ Map<String, Integer> tupleLong = new HashMap<String, Integer>();
+ Map<String, Integer> words = new HashMap<String, Integer>();
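+ // tuple: "reln\tgovernor\tdependent" -> frequency
+ // tupleLong: the same key, additionally including the specific preposition/conjunction -> frequency
+ // words: lower-cased token -> frequency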
+
+ FileInputStream fstream = new FileInputStream(filename);
+ // Get the object of DataInputStream
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ //Read File Line By Line
+ while ((strLine = br.readLine()) != null) {
+
+ // correct line needs to be completed to a sentence
+ strLine=strLine.replace("\"", "");
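+ // the regest strings are typically bare phrases rather than full sentences, so prefix them to give the parser a complete sentence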
+ strLine="This is a "+strLine;
+
+
+ Reader dr = DocumentReader.getReader(strLine);
+
+
+
+ for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
+ Tree parse = lp.apply(sentence);
+ //parse.pennPrint();
+ //System.out.println();
+
+ for (HasWord word: sentence)
+ {
+ Word wd = (Word)word;
+
+ String st= wd.value().toLowerCase();
+
+ if (words.containsKey(st)){
+ words.put(st, words.get(st)+1);
+ } else {
+ words.put(st, 1);
+ }
+
+ }
+
+
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+ Collection tdl = gs.typedDependenciesCCprocessed(true);
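+ // count only prepositional (prep) and conjunction (conj) dependencies, once without and once with the specific preposition/conjunction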
+
+ for (Object t: tdl){
+ if (TypedDependency.class.isInstance(t)){
+
+
+ TypedDependency td = (TypedDependency)t;
+
+ GrammaticalRelation reln = td.reln();
+ if (reln.getShortName().equals("prep") || reln.getShortName().equals("conj") ){
+
+ String st = reln.getShortName()
+ +"\t";
+
+ st +=td.gov().label().value()+"\t";
+
+ st+=td.dep().label().value();
+
+ st=st.toLowerCase();
+ if (tuple.containsKey(st)){
+ tuple.put(st, tuple.get(st)+1);
+ } else {
+ tuple.put(st, 1);
+ }
+
+ st = reln.getShortName()+"\t"+reln.getSpecific()+"\t";
+
+ st +=td.gov().label().value()+"\t";
+
+ st+=td.dep().label().value();
+
+ st=st.toLowerCase();
+
+ if (tupleLong.containsKey(st)){
+ tupleLong.put(st, tupleLong.get(st)+1);
+ } else {
+ tupleLong.put(st, 1);
+ }
+
+ }
+
+ }
+
+ }
+
+ //System.out.println(tdl);
+ //System.out.println();
+ count++;
+ System.out.println(count);
+
+
+ }
+ //if (count > 5)
+ // break;
+ }
+ System.out.println(tuple);
+ System.out.println(tupleLong);
+
+ FileWriter fw = new FileWriter("/tmp/tuple");
+
+ for (String key : tuple.keySet()){
+ fw.write(key+"\t"+String.valueOf(tuple.get(key))+"\n");
+ }
+ fw.close();
+
+
+ fw = new FileWriter("/tmp/tupleLong");
+
+ for (String key : tupleLong.keySet()){
+ fw.write(key+"\t"+String.valueOf(tupleLong.get(key))+"\n");
+ }
+ fw.close();
+
+ fw = new FileWriter("/tmp/words");
+
+ for (String key : words.keySet()){
+ fw.write(key+"\t"+String.valueOf(words.get(key))+"\n");
+ }
+ fw.close();
+
+ }
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ Analyse a = new Analyse();
+ try {
+ a.analyse("/tmp/reges.csv");
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,210 @@
+// Analyses query results from the Virtuoso store, e.g.:
+// "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164","Term of payment for debt for forced loans."
+// select distinct * where { {?x duomo:has_reges ?y} FILTER(lang(?y)="en")}
+
+
+package de.mpiwg.dwinter.duomo.stanford;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import edu.stanford.nlp.io.EncodingPrintWriter.out;
+import edu.stanford.nlp.ling.CyclicCoreLabel;
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+
+public class AnalyseWithEvents {
+
+ public void analyse(String filename) throws IOException {
+
+ LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
+ // This option shows loading, sentence-segmenting and tokenizing
+ // a file using DocumentPreprocessor
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+ // You could also create a tokenizer here (as below) and pass it
+ // to DocumentPreprocessor
+
+ int count=0;
+ Map<String, List<String>> tuple = new HashMap<String, List<String>>();
+ Map<String, List<String>> tupleLong = new HashMap<String, List<String>>();
+ Map<String, List<String>> words = new HashMap<String, List<String>>();
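+ // same keys as in Analyse, but the values are the record URIs in which each token or dependency occurs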
+
+ FileInputStream fstream = new FileInputStream(filename);
+ // Get the object of DataInputStream
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLineFull;
+ //Read File Line By Line
+ while ((strLineFull = br.readLine()) != null) {
+
+ // correct line needs to be completed to a sentence
+ String[] splitted = strLineFull.split(",");
+
+
+ // each line has the form: "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164","Term of payment for debt for forced loans."
+
+ String strLine=splitted[1];
+ String recordURI = splitted[0];
+ strLine=strLine.replace("\"", "");
+ strLine="This is a "+strLine;
+
+
+ Reader dr = DocumentReader.getReader(strLine);
+
+
+
+ for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
+ Tree parse = lp.apply(sentence);
+ //parse.pennPrint();
+ //System.out.println();
+
+ for (HasWord word: sentence)
+ {
+ Word wd = (Word)word;
+
+ String st= wd.value().toLowerCase();
+
+ if (words.containsKey(st)){
+ words.get(st).add(recordURI);
+ } else {
+ List ls =new ArrayList();
+ ls.add(recordURI);
+ words.put(st, ls);
+ }
+
+ }
+
+
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+ Collection tdl = gs.typedDependenciesCCprocessed(true);
+
+ for (Object t: tdl){
+ if (TypedDependency.class.isInstance(t)){
+
+
+ TypedDependency td = (TypedDependency)t;
+
+ GrammaticalRelation reln = td.reln();
+ if (reln.getShortName().equals("prep") || reln.getShortName().equals("conj") ){
+
+ String st = reln.getShortName()
+ +"\t";
+
+ st +=td.gov().label().value()+"\t";
+
+ st+=td.dep().label().value();
+
+ st=st.toLowerCase();
+
+ if (tuple.containsKey(st)){
+ tuple.get(st).add(recordURI);
+ } else {
+ List ls =new ArrayList();
+ ls.add(recordURI);
+ tuple.put(st, ls);
+ }
+
+
+ st = reln.getShortName()+"\t"+reln.getSpecific()+"\t";
+
+ st +=td.gov().label().value()+"\t";
+
+ st+=td.dep().label().value();
+
+ st=st.toLowerCase();
+
+ if (tupleLong.containsKey(st)){
+ tupleLong.get(st).add(recordURI);
+ } else {
+ List ls =new ArrayList();
+ ls.add(recordURI);
+ tupleLong.put(st, ls);
+ }
+
+
+ }
+
+ }
+
+ }
+
+ //System.out.println(tdl);
+ //System.out.println();
+ count++;
+ System.out.println(count);
+
+
+ }
+ //if (count > 5)
+ // break;
+ }
+ System.out.println(tuple);
+ System.out.println(tupleLong);
+
+ FileWriter fw = new FileWriter("/tmp/tuple");
+
+ for (String key : tuple.keySet()){
+ List val = tuple.get(key);
+ fw.write(key+"\t"+String.valueOf(val.size())+"\t"+val.toString()+"\n");
+
+ }
+ fw.close();
+
+
+ fw = new FileWriter("/tmp/tupleLong");
+
+ for (String key : tupleLong.keySet()){
+ List val = tupleLong.get(key);
+
+ fw.write(key+"\t"+String.valueOf(val.size())+"\t"+val.toString()+"\n");
+ }
+ fw.close();
+
+ fw = new FileWriter("/tmp/words");
+
+ for (String key : words.keySet()){
+
+ List val = words.get(key);
+ fw.write(key+"\t"+String.valueOf(val.size())+"\t"+val.toString()+"\n");
+ }
+ fw.close();
+
+ }
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ AnalyseWithEvents a = new AnalyseWithEvents();
+ try {
+ a.analyse("/tmp/reges.csv");
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/stanford/ParserDemo.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/ParserDemo.java Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,112 @@
+package de.mpiwg.dwinter.duomo.stanford;
+
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+
+import edu.stanford.nlp.objectbank.TokenizerFactory;
+import edu.stanford.nlp.process.CoreLabelTokenFactory;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+
+class ParserDemo {
+
+ public static void main(String[] args) {
+ LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
+ if (args.length > 0) {
+ try {
+ demoDP(lp, args[0]);
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ } else {
+ demoAPI(lp);
+ }
+ }
+
+ public static void demoDP(LexicalizedParser lp, String filename) throws IOException {
+ // This option shows loading, sentence-segmenting and tokenizing
+ // a file using DocumentPreprocessor
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+ // You could also create a tokenizer here (as below) and pass it
+ // to DocumentPreprocessor
+
+ FileInputStream fstream = new FileInputStream(filename);
+ // Get the object of DataInputStream
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ //Read File Line By Line
+ while ((strLine = br.readLine()) != null) {
+
+ // correct line needs to be completed to a sentence
+ strLine=strLine.replace("\"", "");
+ strLine="This is a "+strLine;
+
+
+ Reader dr = DocumentReader.getReader(strLine);
+
+ for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
+ Tree parse = lp.apply(sentence);
+ parse.pennPrint();
+ System.out.println();
+
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+ Collection tdl = gs.typedDependenciesCCprocessed(true);
+ System.out.println(tdl);
+ System.out.println();
+ }
+ }
+ }
+
+ public static void demoAPI(LexicalizedParser lp) {
+ // This option shows parsing a list of correctly tokenized words
+ String[] sent = { "This", "is", "an", "easy", "sentence", "." };
+ List<CoreLabel> rawWords = new ArrayList<CoreLabel>();
+ for (String word : sent) {
+ CoreLabel l = new CoreLabel();
+ l.setWord(word);
+ rawWords.add(l);
+ }
+ Tree parse = lp.apply(rawWords);
+ parse.pennPrint();
+ System.out.println();
+
+ // This option shows loading and using an explicit tokenizer
+ String sent2 = "This is another sentence.";
+ TokenizerFactory<CoreLabel> tokenizerFactory =
+ PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
+ List<CoreLabel> rawWords2 =
+ tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
+ parse = lp.apply(rawWords2);
+
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+ List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
+ System.out.println(tdl);
+ System.out.println();
+
+ TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
+ tp.printTree(parse);
+ }
+
+ private ParserDemo() {} // static methods only
+
+}
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/stanford/ParserDemo2.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/ParserDemo2.java Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,72 @@
+package de.mpiwg.dwinter.duomo.stanford;
+
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.ling.Sentence;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.process.Tokenizer;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+
+class ParserDemo2 {
+
+ /** Usage: ParserDemo2 [[grammar] textFile] */
+ public static void main(String[] args) throws IOException {
+ String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
+ String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
+ LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+
+
+ Iterable<List<? extends HasWord>> sentences;
+ if (args.length > 1) {
+ DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
+ List<List<? extends HasWord>> tmp =
+ new ArrayList<List<? extends HasWord>>();
+ for (List<HasWord> sentence : dp) {
+ tmp.add(sentence);
+ }
+ sentences = tmp;
+ } else {
+ // Showing tokenization and parsing in code a couple of different ways.
+ String[] sent = { "This", "is", "an", "easy", "sentence", "." };
+ List<HasWord> sentence = new ArrayList<HasWord>();
+ for (String word : sent) {
+ sentence.add(new Word(word));
+ }
+ String sent2 = ("This is a slightly longer and more complex " +
+ "sentence requiring tokenization.");
+ Tokenizer<? extends HasWord> toke =
+ tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
+ List<? extends HasWord> sentence2 = toke.tokenize();
+ List<List<? extends HasWord>> tmp =
+ new ArrayList<List<? extends HasWord>>();
+ tmp.add(sentence);
+ tmp.add(sentence2);
+ sentences = tmp;
+ }
+
+ for (List<? extends HasWord> sentence : sentences) {
+ Tree parse = lp.apply(sentence);
+ parse.pennPrint();
+ System.out.println();
+ System.out.println(parse.taggedYield());
+ System.out.println();
+
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+ Collection tdl = gs.typedDependenciesCCprocessed(true);
+ System.out.println(tdl);
+ System.out.println();
+ }
+
+ String sent3 = "This is one last test!";
+ lp.apply(sent3).pennPrint();
+ }
+
+}
\ No newline at end of file
diff -r 19e40abb3e8a -r 919e9f3b5efd src/de/mpiwg/dwinter/duomo/stanford/TokenWithEvent.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/TokenWithEvent.java Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,137 @@
+package de.mpiwg.dwinter.duomo.stanford;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import edu.stanford.nlp.io.EncodingPrintWriter.out;
+import edu.stanford.nlp.ling.CyclicCoreLabel;
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.PennTreebankTokenizer;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+import edu.stanford.nlp.trees.international.negra.NegraPennTokenizer;
+
+public class TokenWithEvent {
+
+ public void analyse(String filename) throws IOException {
+
+ LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
+ // This option shows loading, sentence-segmenting and tokenizing
+ // a file using DocumentPreprocessor
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+ // You could also create a tokenizer here (as below) and pass it
+ // to DocumentPreprocessor
+
+ int count=0;
+
+
+ Map<String, List<String>> words = new HashMap<String, List<String>>();
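+ // lower-cased token (with punctuation stripped) -> record URIs in which it occurs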
+
+ FileInputStream fstream = new FileInputStream(filename);
+ // Get the object of DataInputStream
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLineFull;
+ //Read File Line By Line
+ while ((strLineFull = br.readLine()) != null) {
+
+ // correct line needs to be completed to a sentence
+
+
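+ // each CSV line has the form "<record URI>","<regest text>"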
+ String[] splitted = strLineFull.split("\",\"");
+ String strLine=splitted[1];
+ String recordURI = splitted[0];
+ strLine=strLine.replace("\"", "");
+ //strLine="This is a "+strLine;
+
+
+ Reader dr = DocumentReader.getReader(strLine);
+
+ //PennTreebankTokenizer tk = new PennTreebankTokenizer(dr);
+ NegraPennTokenizer tk = new NegraPennTokenizer(dr);
+
+ while (tk.hasNext()){
+
+
+ String t = tk.next();
+
+ String st= t.toLowerCase();
+ st= st.replace(".", "");
+ st= st.replace(",", "");
+ st= st.replace(":","");
+ st= st.replace(";","");
+ st= st.replace("!","");
+
+ if (st.length()<2)
+ continue;
+
+ if (words.containsKey(st)){
+ words.get(st).add(recordURI);
+ } else {
+ List ls =new ArrayList();
+ ls.add(recordURI);
+ words.put(st, ls);
+ }
+
+
+
+
+ //System.out.println(tdl);
+ //System.out.println();
+ count++;
+ System.out.println(count);
+
+
+ }
+ //if (count > 100)
+ // break;
+ }
+
+
+
+ FileWriter fw = new FileWriter("/tmp/words2");
+
+ for (String key : words.keySet()){
+ List val = words.get(key);
+ fw.write(key+"\t"+String.valueOf(val.size())+"\t"+val.toString()+"\n");
+ }
+ fw.close();
+
+ }
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ TokenWithEvent a = new TokenWithEvent();
+ try {
+ a.analyse("/tmp/reges.csv");
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ }
+
+}