Mercurial > hg > duomoOWLProject
view src/de/mpiwg/dwinter/duomo/stanford/AnalyseWithEvents.java @ 9:4392a6adf85a default tip
new version of the label with language tag
author | dwinter |
---|---|
date | Thu, 16 Aug 2012 11:40:17 +0200 |
parents | 919e9f3b5efd |
children |
line wrap: on
line source
// Analyse calls from the virtuoso store
// "http://ontologies.mpiwg-berlin.mpg.de/research/duomoAnalysis.owl/RecordedEvent_41164","Term of payment for debt for forced loans."
// select distinct * where { {?x duomo:has_reges ?y} FILTER(lang(?y)="en")}
package de.mpiwg.dwinter.duomo.stanford;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.io.EncodingPrintWriter.out;
import edu.stanford.nlp.ling.CyclicCoreLabel;
import edu.stanford.nlp.ling.DocumentReader;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;

/**
 * Parses short English "regest" sentences exported from the Virtuoso store
 * (CSV lines of the form {@code "<recordURI>","<regest text>"}), extracts
 * prep/conj typed dependencies with the Stanford parser, and writes:
 * <ul>
 *   <li>{@code /tmp/words}      — word → record-URI index</li>
 *   <li>{@code /tmp/tuple}      — (relation, governor, dependent) → record-URI index</li>
 *   <li>{@code /tmp/tupleLong}  — same, including the relation's specific (e.g. the preposition)</li>
 *   <li>{@code /tmp/tupleLong.nt3.rdf} — an RDF dump of the extracted prepositions</li>
 * </ul>
 */
public class AnalyseWithEvents {

    /** Running counter used to mint unique URIs for extracted prep/conj resources. */
    private int prepcount = 0;
    /** Namespace for generated entity (instance) URIs. */
    private String prep_ent = "http://entities.mpiwg-berlin.mpg.de/research/duomo/prep/";
    /** Namespace for ontology (class/property) URIs. */
    private String prep_ont = "http://ontologies.mpiwg-berlin.mpg.de/research/duomo/prep/";

    /**
     * Runs the full analysis over the given CSV export and writes the
     * index/RDF files described on the class.
     *
     * @param filename path to the CSV file ("recordURI","regest text" per line)
     * @throws IOException if the input cannot be read or an output file written
     */
    public void analyse(String filename) throws IOException {
        LexicalizedParser lp = LexicalizedParser
                .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

        int count = 0;
        Map<String, List<String>> tuple = new HashMap<String, List<String>>();
        Map<String, List<String>> tupleLong = new HashMap<String, List<String>>();
        Map<String, List<String>> words = new HashMap<String, List<String>>();

        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)));
        try {
            String strLineFull;
            while ((strLineFull = br.readLine()) != null) {
                // Line has the form:
                // "http://...RecordedEvent_41164","Term of payment, for forced loans."
                // Split on the FIRST comma only — the regest text itself may
                // contain commas, which an unbounded split would truncate.
                String[] splitted = strLineFull.split(",", 2);
                if (splitted.length < 2) {
                    continue; // skip malformed lines instead of throwing AIOOBE
                }
                String recordURI = splitted[0];
                String strLine = splitted[1].replace("\"", "");
                // Regests are bare noun phrases; prepend a stub subject so the
                // parser sees a complete sentence.
                strLine = "This is a " + strLine;

                Reader dr = DocumentReader.getReader(strLine);
                for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
                    Tree parse = lp.apply(sentence);

                    // Build the word -> record index.
                    for (HasWord word : sentence) {
                        Word wd = (Word) word;
                        addOccurrence(words, wd.value().toLowerCase(), recordURI);
                    }

                    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                    for (Object t : gs.typedDependenciesCCprocessed(true)) {
                        if (!(t instanceof TypedDependency)) {
                            continue;
                        }
                        TypedDependency td = (TypedDependency) t;
                        GrammaticalRelation reln = td.reln();
                        // Only prepositional and conjunction relations are of interest.
                        if (reln.getShortName().equals("prep")
                                || reln.getShortName().equals("conj")) {
                            String shortKey = (reln.getShortName() + "\t"
                                    + td.gov().label().value() + "\t"
                                    + td.dep().label().value()).toLowerCase();
                            addOccurrence(tuple, shortKey, recordURI);

                            // Long key additionally records the relation's
                            // specific (e.g. the actual preposition word).
                            String longKey = (reln.getShortName() + "\t"
                                    + reln.getSpecific() + "\t"
                                    + td.gov().label().value() + "\t"
                                    + td.dep().label().value()).toLowerCase();
                            addOccurrence(tupleLong, longKey, recordURI);
                        }
                    }
                    count++;
                    System.out.println(count); // progress indicator
                }
            }
        } finally {
            br.close(); // was leaked on exception in the original
        }

        System.out.println(tuple);
        System.out.println(tupleLong);

        writeIndex("/tmp/tuple", tuple);

        // tupleLong additionally gets an RDF serialisation.
        FileWriter fw = new FileWriter("/tmp/tupleLong");
        FileWriter fw2 = new FileWriter("/tmp/tupleLong.nt3.rdf");
        try {
            for (String key : tupleLong.keySet()) {
                List<String> val = tupleLong.get(key);
                fw.write(key + "\t" + String.valueOf(val.size()) + "\t" + val.toString() + "\n");
                String res = writePrepAsTriple(fw2, key);
                writeEventsToRes(fw2, res, val);
            }
        } finally {
            fw.close();
            fw2.close();
        }

        writeIndex("/tmp/words", words);
    }

    /**
     * Appends {@code recordURI} to the posting list of {@code key}, creating
     * the list on first use.
     */
    private void addOccurrence(Map<String, List<String>> index, String key, String recordURI) {
        List<String> ls = index.get(key);
        if (ls == null) {
            ls = new ArrayList<String>();
            index.put(key, ls);
        }
        ls.add(recordURI);
    }

    /**
     * Writes one {@code key \t count \t [uri, ...]} line per index entry.
     *
     * @param path  output file path
     * @param index posting-list index to dump
     * @throws IOException if the file cannot be written
     */
    private void writeIndex(String path, Map<String, List<String>> index) throws IOException {
        FileWriter fw = new FileWriter(path);
        try {
            for (String key : index.keySet()) {
                List<String> val = index.get(key);
                fw.write(key + "\t" + String.valueOf(val.size()) + "\t" + val.toString() + "\n");
            }
        } finally {
            fw.close();
        }
    }

    /**
     * Writes a {@code <record> <prep_ont:contains> <prepUri>} triple for every
     * record in which the preposition occurs.
     *
     * @param fw2     open writer for the RDF output
     * @param prepUri URI of the preposition resource (as minted by
     *                {@link #writePrepAsTriple(FileWriter, String)})
     * @param val     record URIs (may still carry surrounding quotes, stripped here)
     * @throws IOException if writing fails
     */
    private void writeEventsToRes(FileWriter fw2, String prepUri, List<String> val)
            throws IOException {
        for (String res : val) {
            fw2.write("<" + res.replace("\"", "") + "><" + prep_ont + "contains> <" + prepUri + ">.\n");
        }
        fw2.flush();
    }

    /**
     * Serialises one tupleLong key as RDF triples and returns the URI minted
     * for the preposition resource.
     *
     * <p>Key layout (tab-separated): {@code shortName \t specific \t governor \t dependent}.
     *
     * <p>NOTE(review): {@code rdf:type} / {@code rdfs:label} are emitted as
     * prefixed names, which is not legal in strict N-Triples; the downstream
     * loader must accept Turtle-style prefixes — confirm.
     *
     * @param fw2  open writer for the RDF output
     * @param prep tab-separated dependency key
     * @return the URI of the newly minted preposition resource
     * @throws IOException if writing fails
     */
    private String writePrepAsTriple(FileWriter fw2, String prep) throws IOException {
        String[] splitted = prep.split("\t");
        prepcount += 1;
        String resUri = String.format(prep_ent + "prep_%s", prepcount);
        fw2.write("<" + resUri + "> rdf:type " + "<" + prep_ont + "Preposition>.\n");
        // Governor word (length guard added: a trailing empty field would
        // previously have thrown ArrayIndexOutOfBoundsException).
        if (splitted.length > 2 && !splitted[2].equals("")) {
            String wd = URLEncoder.encode(splitted[2], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "main> <" + prep_ent + "Word_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdfs:label \"" + splitted[2] + "\"@en .\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdf:type " + "<" + prep_ont + "Word>.\n");
        }
        // Dependent word.
        if (splitted.length > 3 && !splitted[3].equals("")) {
            String wd = URLEncoder.encode(splitted[3], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "specification> <" + prep_ent + "Word_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdfs:label \"" + splitted[3] + "\"@en .\n");
            fw2.write("<" + prep_ent + "Word_" + wd + "> rdf:type " + "<" + prep_ont + "Word>.\n");
        }
        // The relation's specific (e.g. the preposition itself).
        if (splitted.length > 1 && !splitted[1].equals("")) {
            String wd = URLEncoder.encode(splitted[1], "utf-8");
            fw2.write("<" + resUri + "> " + "<" + prep_ont + "prepType> <" + prep_ent + "Type_" + wd + ">.\n");
            fw2.write("<" + prep_ent + "Type_" + wd + "> rdfs:label \"" + splitted[1] + "\"@en .\n");
            // BUG FIX: the original wrote "Word_"+wd here, leaving the Type_
            // resource untyped and mistyping an unrelated Word_ URI.
            fw2.write("<" + prep_ent + "Type_" + wd + "> rdf:type " + "<" + prep_ont + "Type>.\n");
        }
        fw2.flush();
        return resUri;
    }

    /**
     * Entry point: analyses the fixed export at {@code /tmp/reges.csv}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        AnalyseWithEvents a = new AnalyseWithEvents();
        try {
            a.analyse("/tmp/reges.csv");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}