view src/de/mpiwg/dwinter/duomo/stanford/ParserDemo.java @ 8:919e9f3b5efd

neue klassen zur textanalyse (stanford parser eingebaut) alle has_readable_labe Datatype properties durch rdfs:label ersetzt.
author dwinter
date Thu, 21 Jun 2012 17:08:22 +0200
parents
children
line wrap: on
line source

package de.mpiwg.dwinter.duomo.stanford;


import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;  
import edu.stanford.nlp.ling.DocumentReader;
import edu.stanford.nlp.ling.HasWord;  
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

class ParserDemo {

  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    if (args.length > 0) {
      try {
		demoDP(lp, args[0]);
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
    } else {
      demoAPI(lp);
    }
  }

  public static void demoDP(LexicalizedParser lp, String filename) throws IOException {
    // This option shows loading and sentence-segment and tokenizing
    // a file using DocumentPreprocessor
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    // You could also create a tokenier here (as below) and pass it
    // to DocumentPreprocessor
    
    FileInputStream fstream = new FileInputStream(filename);
    // Get the object of DataInputStream
    DataInputStream in = new DataInputStream(fstream);
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String strLine;
    //Read File Line By Line
    while ((strLine = br.readLine()) != null)   {
    
    	// correct line needs to be completed to a sentence
    	strLine=strLine.replace("\"", "");
    	strLine="This is a "+strLine;
    			
    			
    	Reader dr = DocumentReader.getReader(strLine);
    	
	    for (List<HasWord> sentence : new DocumentPreprocessor(dr)) {
	      Tree parse = lp.apply(sentence);
	      parse.pennPrint();
	      System.out.println();
	      
	      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
	      Collection tdl = gs.typedDependenciesCCprocessed(true);
	      System.out.println(tdl);
	      System.out.println();
	    }
    }
  }

  public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = new ArrayList<CoreLabel>();
    for (String word : sent) {
      CoreLabel l = new CoreLabel();
      l.setWord(word);
      rawWords.add(l);
    }
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = 
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> rawWords2 = 
      tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
  }

  private ParserDemo() {} // static methods only

}