Mercurial > hg > duomoOWLProject
view src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java @ 9:4392a6adf85a default tip
new version of the label with language tag
author | dwinter |
---|---|
date | Thu, 16 Aug 2012 11:40:17 +0200 |
parents | 919e9f3b5efd |
children |
line wrap: on
line source
// Analyse calls from the virtuoso store
// "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164","Term of payment for debt for forced loans."
// select distinct * where { {?x duomo:has_reges ?y} FILTER(lang(?y)="en")}
package de.mpiwg.dwinter.duomo.stanford;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.io.EncodingPrintWriter.out;
import edu.stanford.nlp.ling.CyclicCoreLabel;
import edu.stanford.nlp.ling.DocumentReader;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;

/**
 * Parses short English "regest" sentences exported from the Virtuoso store
 * (CSV lines of the form {@code "<recordURI>","<regest text>"}), extracts
 * prep/conj typed dependencies with the Stanford parser, and writes:
 * <ul>
 *   <li>{@code /tmp/words}      — word → record-URI index</li>
 *   <li>{@code /tmp/tuple}      — (relation, governor, dependent) → record-URI index</li>
 *   <li>{@code /tmp/tupleLong}  — same, including the relation's specific (e.g. the preposition)</li>
 *   <li>{@code /tmp/tupleLong.nt3.rdf} — an RDF dump of the extracted prepositions</li>
 * </ul>
 */
public class AnalyseWithEvents {

    /** Running counter used to mint unique URIs for extracted prep/conj resources. */
    private int prepcount = 0;
    /** Namespace for generated entity (instance) URIs. */
    private String prep_ent = "http://entities.mpiwg-berlin.mpg.de/research/duomo/prep/";
    /** Namespace for ontology (class/property) URIs. */
    private String prep_ont = "http://ontologies.mpiwg-berlin.mpg.de/research/duomo/prep/";

    /**
     * Runs the full analysis over the given CSV export and writes the
     * index/RDF files described on the class.
     *
     * @param filename path to the CSV file ("recordURI","regest text" per line)
     * @throws IOException if the input cannot be read or an output file written
     */
    public void analyse(String filename) throws IOException {
        LexicalizedParser lp = LexicalizedParser
                .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

        int count = 0;
        Map<String, List<String>> tuple = new HashMap<String, List<String>>();
        Map<String, List<String>> tupleLong = new HashMap<String, List<String>>();
        Map<String, List<String>> words = new HashMap<String, List<String>>();

        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)));
        try {
            String strLineFull;
            while ((strLineFull = br.readLine()) != null) {
                // Line has the form:
                // "http://...RecordedEvent_41164","Term of payment, for forced loans."
                // Split on the FIRST comma only — the regest text itself may
                // contain commas, which an unbounded split would truncate.
                String[] splitted = strLineFull.split(",", 2);
                if (splitted.length < 2) {
                    continue; // skip malformed lines instead of throwing AIOOBE
                }
                String recordURI = splitted[0];
                String strLine = splitted[1].replace("\"", "");
                // Regests are bare noun phrases; prepend a stub subject so the
                // parser sees a complete sentence.
                strLine = "This is a " + strLine;

                Reader dr = DocumentReader.getReader(strLine);
                for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
                    Tree parse = lp.apply(sentence);

                    // Build the word -> record index.
                    for (HasWord word : sentence) {
                        Word wd = (Word) word;
                        addOccurrence(words, wd.value().toLowerCase(), recordURI);
                    }

                    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                    for (Object t : gs.typedDependenciesCCprocessed(true)) {
                        if (!(t instanceof TypedDependency)) {
                            continue;
                        }
                        TypedDependency td = (TypedDependency) t;
                        GrammaticalRelation reln = td.reln();
                        // Only prepositional and conjunction relations are of interest.
                        if (reln.getShortName().equals("prep")
                                || reln.getShortName().equals("conj")) {
                            String shortKey = (reln.getShortName() + "\t"
                                    + td.gov().label().value() + "\t"
                                    + td.dep().label().value()).toLowerCase();
                            addOccurrence(tuple, shortKey, recordURI);

                            // Long key additionally records the relation's
                            // specific (e.g. the actual preposition word).
                            String longKey = (reln.getShortName() + "\t"
                                    + reln.getSpecific() + "\t"
                                    + td.gov().label().value() + "\t"
                                    + td.dep().label().value()).toLowerCase();
                            addOccurrence(tupleLong, longKey, recordURI);
                        }
                    }
                    count++;
                    System.out.println(count); // progress indicator
                }
            }
        } finally {
            br.close(); // was leaked on exception in the original
        }

        System.out.println(tuple);
        System.out.println(tupleLong);

        writeIndex("/tmp/tuple", tuple);

        // tupleLong additionally gets an RDF serialisation.
        FileWriter fw = new FileWriter("/tmp/tupleLong");
        FileWriter fw2 = new FileWriter("/tmp/tupleLong.nt3.rdf");
        try {
            for (String key : tupleLong.keySet()) {
                List<String> val = tupleLong.get(key);
                fw.write(key + "\t" + String.valueOf(val.size()) + "\t" + val.toString() + "\n");
                String res = writePrepAsTriple(fw2, key);
                writeEventsToRes(fw2, res, val);
            }
        } finally {
            fw.close();
            fw2.close();
        }

        writeIndex("/tmp/words", words);
    }

    /**
     * Appends {@code recordURI} to the posting list of {@code key}, creating
     * the list on first use.
     */
    private void addOccurrence(Map<String, List<String>> index, String key, String recordURI) {
        List<String> ls = index.get(key);
        if (ls == null) {
            ls = new ArrayList<String>();
            index.put(key, ls);
        }
        ls.add(recordURI);
    }

    /**
     * Writes one {@code key \t count \t [uri, ...]} line per index entry.
     *
     * @param path  output file path
     * @param index posting-list index to dump
     * @throws IOException if the file cannot be written
     */
    private void writeIndex(String path, Map<String, List<String>> index) throws IOException {
        FileWriter fw = new FileWriter(path);
        try {
            for (String key : index.keySet()) {
                List<String> val = index.get(key);
                fw.write(key + "\t" + String.valueOf(val.size()) + "\t" + val.toString() + "\n");
            }
        } finally {
            fw.close();
        }
    }

    /**
     * Writes a {@code <record> <prep_ont:contains> <prepUri>} triple for every
     * record in which the preposition occurs.
     *
     * @param fw2     open writer for the RDF output
     * @param prepUri URI of the preposition resource (as minted by
     *                {@link #writePrepAsTriple(FileWriter, String)})
     * @param val     record URIs (may still carry surrounding quotes, stripped here)
     * @throws IOException if writing fails
     */
    private void writeEventsToRes(FileWriter fw2, String prepUri, List<String> val)
            throws IOException {
        for (String res : val) {
            fw2.write("<" + res.replace("\"", "") + "><" + prep_ont + "contains> <" + prepUri + ">.\n");
        }
        fw2.flush();
    }

    /**
     * Serialises one tupleLong key as RDF triples and returns the URI minted
     * for the preposition resource.
     *
     * <p>Key layout (tab-separated): {@code shortName \t specific \t governor \t dependent}.
     *
     * <p>NOTE(review): {@code rdf:type} / {@code rdfs:label} are emitted as
     * prefixed names, which is not legal in strict N-Triples; the downstream
     * loader must accept Turtle-style prefixes — confirm.
     *
     * @param fw2  open writer for the RDF output
     * @param prep tab-separated dependency key
     * @return the URI of the newly minted preposition resource
     * @throws IOException if writing fails
     */
    private String writePrepAsTriple(FileWriter fw2, String prep) throws IOException {
        String[] splitted = prep.split("\t");
        prepcount += 1;
        String resUri = String.format(prep_ent + "prep_%s", prepcount);
        fw2.write("<" + resUri + "> rdf:type " + "<" + prep_ont + "Preposition>.\n");
        // Governor word (length guard added: a trailing empty field would
        // previously have thrown ArrayIndexOutOfBoundsException).
        if (splitted.length > 2 && !splitted[2].equals("")) {
            String wd = URLEncoder.encode(splitted[2], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "main> <" + prep_ent + "Word_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdfs:label \"" + splitted[2] + "\"@en .\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdf:type " + "<" + prep_ont + "Word>.\n");
        }
        // Dependent word.
        if (splitted.length > 3 && !splitted[3].equals("")) {
            String wd = URLEncoder.encode(splitted[3], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "specification> <" + prep_ent + "Word_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdfs:label \"" + splitted[3] + "\"@en .\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdf:type " + "<" + prep_ont + "Word>.\n");
        }
        // The relation's specific (e.g. the preposition itself).
        if (splitted.length > 1 && !splitted[1].equals("")) {
            String wd = URLEncoder.encode(splitted[1], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "prepType> <" + prep_ent + "Type_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Type_" + wd + "> rdfs:label \"" + splitted[1] + "\"@en .\n");
            // BUG FIX: the original wrote "Word_"+wd here, leaving the Type_
            // resource untyped and mistyping an unrelated Word_ URI.
            fw2.write("<" + prep_ent + "Type_" + wd + "> rdf:type " + "<" + prep_ont + "Type>.\n");
        }
        fw2.flush();
        return resUri;
    }

    /**
     * Entry point: analyses the fixed export at {@code /tmp/reges.csv}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        AnalyseWithEvents a = new AnalyseWithEvents();
        try {
            a.analyse("/tmp/reges.csv");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}