view src/de/mpiwg/dwinter/duomo/stanford/TokenWithEvent.java @ 8:919e9f3b5efd

New classes for text analysis (Stanford parser integrated); all has_readable_labe datatype properties replaced by rdfs:label.
author dwinter
date Thu, 21 Jun 2012 17:08:22 +0200

package de.mpiwg.dwinter.duomo.stanford;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import edu.stanford.nlp.ling.DocumentReader;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.negra.NegraPennTokenizer;

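/**
 * Builds a simple word index from a CSV export: each input line is expected to
 * hold a record URI and a text fragment ("recordURI","text ..."). The text is
 * tokenized with the Stanford NegraPennTokenizer, tokens are lower-cased and
 * stripped of punctuation, and a token -> record-URI index is written to
 * /tmp/words2 (one line per token: token, occurrence count, list of record URIs).
 */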
public class TokenWithEvent {

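	/**
	 * Reads the CSV file line by line, tokenizes the text column and records,
	 * for every token, the URIs of the records it occurs in. The resulting
	 * index is written to /tmp/words2.
	 *
	 * @param filename path to the CSV input ("recordURI","text" per line)
	 * @throws IOException if the input cannot be read or the output cannot be written
	 */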
	public void analyse(String filename) throws IOException {

		// Load the English PCFG parser and a grammatical-structure factory.
		// Neither is used in the token-counting pass below; they are only set
		// up here (e.g. for a later dependency-parsing step).
		LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
		TreebankLanguagePack tlp = new PennTreebankLanguagePack();
		GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

		int count = 0;

		// token -> list of record URIs in which the token occurs
		// (a URI is added once per occurrence, so the list length is the occurrence count)
		Map<String, List<String>> words = new HashMap<String, List<String>>();

		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
		String strLineFull;
		// Read the file line by line
		while ((strLineFull = br.readLine()) != null) {

			// Each line is expected to look like: "recordURI","text ..."
			String[] splitted = strLineFull.split("\",\"");
			if (splitted.length < 2) {
				continue; // skip lines that do not match the expected format
			}
			String recordURI = splitted[0].replace("\"", "");
			String strLine = splitted[1].replace("\"", "");
			// To parse the fragment as a full sentence it could be prefixed:
			// strLine = "This is a " + strLine;

			Reader dr = DocumentReader.getReader(strLine);

			// Tokenize the text fragment.
			// PennTreebankTokenizer tk = new PennTreebankTokenizer(dr);
			NegraPennTokenizer tk = new NegraPennTokenizer(dr);

			while (tk.hasNext()) {

				String t = tk.next();

				// Normalize: lower-case and strip punctuation.
				String st = t.toLowerCase();
				st = st.replace(".", "");
				st = st.replace(",", "");
				st = st.replace(":", "");
				st = st.replace(";", "");
				st = st.replace("!", "");

				// Ignore empty tokens and single characters.
				if (st.length() < 2)
					continue;
	
				// Record that this token occurs in the current record.
				if (words.containsKey(st)) {
					words.get(st).add(recordURI);
				} else {
					List<String> ls = new ArrayList<String>();
					ls.add(recordURI);
					words.put(st, ls);
				}

				count++;
				System.out.println(count);
			}
			// if (count > 100)
			//     break;
		}
		br.close();

		// Write the index to /tmp/words2: token <TAB> occurrence count <TAB> record URIs
		FileWriter fw = new FileWriter("/tmp/words2");
		for (String key : words.keySet()) {
			List<String> val = words.get(key);
			fw.write(key + "\t" + String.valueOf(val.size()) + "\t" + val.toString() + "\n");
		}
		fw.close();

	}
	/**
	 * @param args unused; the input file path is hard-coded to /tmp/reges.csv
	 */
	public static void main(String[] args) {
		TokenWithEvent a = new TokenWithEvent();
		try {
			a.analyse("/tmp/reges.csv");
		} catch (IOException e) {
			e.printStackTrace();
		}

	}

}