diff src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java @ 8:919e9f3b5efd
New classes for text analysis (Stanford parser integrated).
All has_readable_label datatype properties replaced by rdfs:label (a sketch of this kind of rewrite follows the changeset metadata below).
author | dwinter
date | Thu, 21 Jun 2012 17:08:22 +0200
parents |
children | 4392a6adf85a
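The rdfs:label migration mentioned in the commit message happens in the ontology, not in the file below. As a rough illustration only, here is a minimal sketch of how such a property rewrite could look, assuming Apache Jena (2.x package names) and guessing the property URI from the duomoAnalysis.owl namespace that appears in this file; none of this is taken from the changeset itself:

// Hypothetical sketch: copy every has_readable_label value to rdfs:label,
// then drop the old datatype property statements. Assumes Apache Jena 2.x;
// the property URI is a guess based on the duomoAnalysis.owl namespace.
import java.util.List;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.vocabulary.RDFS;

public class LabelMigration {

    public static void migrate(Model model) {
        Property oldProp = model.getProperty(
                "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/has_readable_label");
        // Materialize the statement list first so we do not remove while iterating.
        List<Statement> old = model.listStatements(null, oldProp, (RDFNode) null).toList();
        for (Statement s : old) {
            model.add(s.getSubject(), RDFS.label, s.getObject()); // keep the literal value
        }
        model.remove(old); // discard the old has_readable_label statements
    }
}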
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java	Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,210 @@
+// Analyse reges records fetched from the Virtuoso store.
+// Each input line pairs a record URI with its English reges text, e.g.:
+// "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164","Term of payment for debt for forced loans."
+// The lines are exported from the store with:
+// select distinct * where { {?x duomo:has_reges ?y} FILTER(lang(?y)="en")}
+
+package de.mpiwg.dwinter.duomo.stanford;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+
+public class AnalyseWithEvents {
+
+    public void analyse(String filename) throws IOException {
+
+        LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
+        // Sentence segmenting and tokenizing is left to DocumentPreprocessor;
+        // a custom tokenizer could be created here and passed to it instead.
+        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+
+        int count = 0;
+        // Each map goes from a key to the list of record URIs it occurs in.
+        Map<String, List<String>> tuple = new HashMap<String, List<String>>();
+        Map<String, List<String>> tupleLong = new HashMap<String, List<String>>();
+        Map<String, List<String>> words = new HashMap<String, List<String>>();
+
+        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
+        String strLineFull;
+        // Read the file line by line.
+        while ((strLineFull = br.readLine()) != null) {
+
+            // Each line has the form: "<record URI>","<reges text>".
+            // Split only at the first comma so commas inside the text survive.
+            String[] splitted = strLineFull.split(",", 2);
+            if (splitted.length < 2) {
+                continue; // skip malformed lines
+            }
+
+            String recordURI = splitted[0];
+            String strLine = splitted[1].replace("\"", "");
+            // The reges is only a noun phrase; complete it to a sentence for the parser.
+            strLine = "This is a " + strLine;
+
+            Reader dr = DocumentReader.getReader(strLine);
+
+            for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
+                Tree parse = lp.apply(sentence);
+
+                // Index every token by the record it occurs in.
+                for (HasWord word : sentence) {
+                    addOccurrence(words, word.word().toLowerCase(), recordURI);
+                }
+
+                GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+                Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);
+
+                for (TypedDependency td : tdl) {
+                    GrammaticalRelation reln = td.reln();
+                    // Only prepositional and conjunction relations are of interest.
+                    if (reln.getShortName().equals("prep") || reln.getShortName().equals("conj")) {
+
+                        String gov = td.gov().label().value();
+                        String dep = td.dep().label().value();
+
+                        // Short key: relation, governor, dependent.
+                        String st = (reln.getShortName() + "\t" + gov + "\t" + dep).toLowerCase();
+                        addOccurrence(tuple, st, recordURI);
+
+                        // Long key: additionally the specific, e.g. the preposition itself.
+                        st = (reln.getShortName() + "\t" + reln.getSpecific() + "\t" + gov + "\t" + dep).toLowerCase();
+                        addOccurrence(tupleLong, st, recordURI);
+                    }
+                }
+
+                count++;
+                System.out.println(count); // progress output
+            }
+        }
+        br.close();
+
+        writeIndex(tuple, "/tmp/tuple");
+        writeIndex(tupleLong, "/tmp/tupleLong");
+        writeIndex(words, "/tmp/words");
+    }
+
+    // Record that key occurs in the record identified by recordURI.
+    private void addOccurrence(Map<String, List<String>> index, String key, String recordURI) {
+        List<String> ls = index.get(key);
+        if (ls == null) {
+            ls = new ArrayList<String>();
+            index.put(key, ls);
+        }
+        ls.add(recordURI);
+    }
+
+    // Write one line per key: key, number of occurrences, list of record URIs.
+    private void writeIndex(Map<String, List<String>> index, String filename) throws IOException {
+        FileWriter fw = new FileWriter(filename);
+        for (Map.Entry<String, List<String>> entry : index.entrySet()) {
+            List<String> val = entry.getValue();
+            fw.write(entry.getKey() + "\t" + val.size() + "\t" + val.toString() + "\n");
+        }
+        fw.close();
+    }
+
+    public static void main(String[] args) {
+        AnalyseWithEvents a = new AnalyseWithEvents();
+        try {
+            a.analyse("/tmp/reges.csv");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
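For reference, each of the three output files has one key per line, with the key, the number of records it occurs in, and the list of record URIs, all separated by tabs. An illustrative /tmp/tuple line follows; the dependency triple and the count are made up for illustration (which relations actually appear depends on the parses), while the URI is the example from the comment at the top of the file:

prep	payment	debt	1	["http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164"]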