diff src/de/mpiwg/dwinter/duomo/stanford/Analyse.java @ 8:919e9f3b5efd

New classes for text analysis (Stanford parser integrated); replaced all has_readable_label datatype properties with rdfs:label.
author dwinter
date Thu, 21 Jun 2012 17:08:22 +0200
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/duomo/stanford/Analyse.java	Thu Jun 21 17:08:22 2012 +0200
@@ -0,0 +1,149 @@
+package de.mpiwg.dwinter.duomo.stanford;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.stanford.nlp.ling.DocumentReader;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+
+public class Analyse {
+
+	public void analyse(String filename) throws IOException {
+
+		LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
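+		// (The englishPCFG model is resolved from the classpath; it is
+		// packaged with the Stanford parser distribution.)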
+		// Sentence splitting and tokenization are handled below by
+		// DocumentPreprocessor; a custom tokenizer could be passed to it instead.
+		TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+		GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+
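+		// Frequency tables: tuple = prep/conj relation + governor + dependent;
+		// tupleLong = the same plus the specific preposition/conjunction;
+		// words = plain token counts.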
+		int count=0;
+		Map<String,Integer> tuple = new HashMap<String,Integer>(); 
+		Map<String,Integer> tupleLong = new HashMap<String,Integer>(); 
+		Map<String,Integer> words = new HashMap<String,Integer>(); 
+
+		// read the input file line by line
+		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
+		String strLine;
+		while ((strLine = br.readLine()) != null) {
+
+			// A line holds only a phrase, not a complete sentence: strip the
+			// quotation marks and prefix "This is a " so the parser gets one.
+			strLine = strLine.replace("\"", "");
+			strLine = "This is a " + strLine;
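+			// Illustrative example: a line such as `payment to the masters`
+			// becomes "This is a payment to the masters".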
+
+			Reader dr = DocumentReader.getReader(strLine);
+
+			for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
+				Tree parse = lp.apply(sentence);
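+				// full constituency parse; converted to typed dependencies below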
+				//parse.pennPrint();
+				//System.out.println();
+
+				// count token frequencies (lower-cased)
+				for (HasWord word : sentence) {
+					String st = word.word().toLowerCase();
+					if (words.containsKey(st)) {
+						words.put(st, words.get(st) + 1);
+					} else {
+						words.put(st, 1);
+					}
+				}
+
+
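+				// CC-processed ("collapsed") dependencies fold the preposition or
+				// conjunction into the relation: for "payment of wax" the parser
+				// yields prep(payment, wax) with getSpecific() == "of", i.e. prep_of.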
+				GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+				Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);
+
+				for (TypedDependency td : tdl) {
+					GrammaticalRelation reln = td.reln();
+					if (reln.getShortName().equals("prep") || reln.getShortName().equals("conj")) {
+
+						// short key: relation, governor, dependent
+						String st = reln.getShortName() + "\t"
+								+ td.gov().label().value() + "\t"
+								+ td.dep().label().value();
+						st = st.toLowerCase();
+						if (tuple.containsKey(st)) {
+							tuple.put(st, tuple.get(st) + 1);
+						} else {
+							tuple.put(st, 1);
+						}
+
+						// long key: additionally records the specific
+						// preposition or conjunction (reln.getSpecific())
+						st = reln.getShortName() + "\t" + reln.getSpecific() + "\t"
+								+ td.gov().label().value() + "\t"
+								+ td.dep().label().value();
+						st = st.toLowerCase();
+						if (tupleLong.containsKey(st)) {
+							tupleLong.put(st, tupleLong.get(st) + 1);
+						} else {
+							tupleLong.put(st, 1);
+						}
+					}
+				}
+
+				// progress output: one line per parsed sentence
+				count++;
+				System.out.println(count);
+			}
+			// if (count > 5) break; // uncomment to test on a few lines only
+		}
+		br.close();
+		System.out.println(tuple);
+		System.out.println(tupleLong);
+
+		// write each frequency table as tab-separated "key <TAB> count" lines
+		writeCounts(tuple, "/tmp/tuple");
+		writeCounts(tupleLong, "/tmp/tupleLong");
+		writeCounts(words, "/tmp/words");
+	}
+
+	private void writeCounts(Map<String, Integer> counts, String path) throws IOException {
+		FileWriter fw = new FileWriter(path);
+		for (Map.Entry<String, Integer> entry : counts.entrySet()) {
+			fw.write(entry.getKey() + "\t" + entry.getValue() + "\n");
+		}
+		fw.close();
+	}
+
+	/**
+	 * @param args unused; the input file path is hard-coded
+	 */
+	public static void main(String[] args) {
+		Analyse a = new Analyse();
+		try {
+			a.analyse("/tmp/reges.csv");
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+
+	}
+
+}