view src/de/mpiwg/dwinter/fulltext/ticket/LanguageWriter.java @ 0:72a015318a6d

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:20 +0100
parents
children
line wrap: on
line source

package de.mpiwg.dwinter.fulltext.ticket;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;

import sun.security.krb5.internal.PAEncTSEnc;

import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
import de.mpiwg.dwinter.lucencetools.documents.FileDocument;

/**
 * Writes the per-language artefacts of a search ticket (result list, result
 * summary, matched OCR pages and the executed query) into a language-specific
 * sub-folder of the ticket folder.
 *
 * <p>The ticket folder ({@code ticketFile}), the path separator constants and
 * the {@code logger} are inherited from {@link TicketWriter}.</p>
 *
 * <p>All files are written as UTF-8 and every writer is closed even when
 * writing fails.</p>
 */
public class LanguageWriter extends TicketWriter{

	/** Charset name used for every file written by this class. */
	private static final String FILE_ENCODING = "utf-8";

	/** Folder {@code <ticket folder>/<language>} that receives all output. */
	private File languageFile;
	/** Language identifier; used as folder name and in the result info. */
	private String language;

	/**
	 * Creates the writer and makes sure the language folder exists.
	 *
	 * @param ticket   ticket identifier, passed on to {@link TicketWriter}
	 * @param language language identifier; becomes the name of the sub-folder
	 * @throws IOException if the language folder cannot be created
	 */
	public LanguageWriter(String ticket, String language) throws IOException{
		
		super(ticket);
		
		languageFile = new File(ticketFile.getAbsolutePath()+PATHSEPARATOR+language);
		ensureDirectory(languageFile, "Create Languagefolder:");
		 
		this.language=language;
	}

	/**
	 * Writes the result of a search into the file {@code result} inside the
	 * language folder, one XML fragment per line.
	 *
	 * <p>Each value of the {@code textId} field is written at most once, even
	 * if several hits carry the same id.</p>
	 *
	 * @param docs   hits of the search
	 * @param reader index reader used to load the hit documents
	 * @return number of hits that were actually written
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException if the folder or file cannot be created or written
	 */
	public Integer writeResults(TopDocs docs, IndexReader reader) throws CorruptIndexException, IOException {
		// The folder has to exist before the result file is opened inside it.
		ensureDirectory(languageFile, "Create Languagefolder:");
		
		File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"result");
		
		// TODO: for some reason a document can occur several times in the hits
		Set<String> idsAlreadyDone = new HashSet<String>();
		int counter = 0;
		
		OutputStreamWriter rw = openUtf8Writer(resultFile);
		try {
			for (ScoreDoc sd : docs.scoreDocs) {
				Document d = reader.document(sd.doc);
				
				// add() returns false for an id that was already seen, so
				// every hit ends up in the file only once.
				if (idsAlreadyDone.add(d.get("textId"))) {
					rw.write(FileDocument.toXML(d)+"\n");
					counter++;
				}
			}
		} finally {
			rw.close();
		}
		return counter;
	}

	
	/**
	 * Writes a small XML summary of the search into the file
	 * {@code resultInfo} inside the language folder.
	 *
	 * <p>NOTE(review): the language string is embedded without XML escaping,
	 * exactly as before; confirm that language identifiers never contain
	 * markup characters.</p>
	 *
	 * @param docs    hits of the search; only {@code totalHits} is read
	 * @param counter number of hits actually written (see {@link #writeResults})
	 * @throws IOException if the file cannot be created or written
	 */
	public void writeResultInfo(TopDocs docs, Integer counter) throws IOException {
		File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"resultInfo");
		
		StringBuilder ret = new StringBuilder("<resultInfo>");
		ret.append("<lang>").append(language).append("</lang>");
		ret.append("<hits>").append(counter).append("</hits>");
		ret.append("<totalHits>").append(docs.totalHits).append("</totalHits>");
		ret.append("</resultInfo>");
		
		OutputStreamWriter rw = openUtf8Writer(resultFile);
		try {
			rw.write(ret.toString());
		} finally {
			rw.close();
		}
	}
	

	/**
	 * Writes the lines of an OCR document to disk: one folder per document
	 * and, inside it, one file per page with one line of text per OCR line.
	 *
	 * <p>Path separator characters in the document id and in the page names
	 * are replaced by {@code ':'} so that each name stays a single path
	 * element.</p>
	 *
	 * @param ocrDoc document whose pages are written
	 * @throws IOException if the folder or a page file cannot be created or written
	 */
	public void writeDoc(OCRDoc ocrDoc) throws IOException {
		// create one folder per document
		File docFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+ocrDoc.textId.replace(PATHSEPARATORCHAR, ':'));
		ensureDirectory(docFile, "Create Docfolder:");
		
		// now one file per page
		for (String page:ocrDoc.linesInPage.keySet()){
			File pageFile = new File(docFile.getAbsolutePath()+PATHSEPARATOR+page.replace(PATHSEPARATORCHAR, ':'));
			
			// Written as UTF-8 (not the platform default) so OCR text is
			// stored the same way as the result files.
			OutputStreamWriter pageFileWriter = openUtf8Writer(pageFile);
			try {
				for (OCRLine line: ocrDoc.linesInPage.get(page)){
					pageFileWriter.write(line.toString()+"\n");
				}
			} finally {
				pageFileWriter.close();
			}
		}
	}

	/**
	 * Stores the string form of the executed query in the file {@code query}
	 * inside the language folder.
	 *
	 * @param query query to store
	 * @throws IOException if the file cannot be created or written
	 */
	public void saveQuery(Query query) throws IOException {
	
		File qf = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"query");
		OutputStreamWriter fw = openUtf8Writer(qf);
		try {
			fw.write(query.toString());
		} finally {
			fw.close();
		}
	}

	/**
	 * Creates {@code dir} (including missing parents) if it does not exist yet.
	 *
	 * @param dir       folder that has to exist afterwards
	 * @param logPrefix text logged in front of the canonical path when the
	 *                  folder is created
	 * @throws IOException if the folder cannot be created
	 */
	private void ensureDirectory(File dir, String logPrefix) throws IOException {
		if (dir.exists()) {
			return;
		}
		logger.debug(logPrefix+dir.getCanonicalPath());
		if (!dir.mkdirs()) {
			throw new IOException("Could not create directory: "+dir.getAbsolutePath());
		}
	}

	/**
	 * Opens {@code target} for writing as UTF-8 text, replacing existing content.
	 *
	 * @param target file to write
	 * @return writer that the caller has to close
	 * @throws IOException if the file cannot be opened
	 */
	private static OutputStreamWriter openUtf8Writer(File target) throws IOException {
		FileOutputStream stream = new FileOutputStream(target);
		try {
			return new OutputStreamWriter(stream, FILE_ENCODING);
		} catch (IOException e) {
			// Do not leak the stream when the writer cannot be created.
			stream.close();
			throw e;
		}
	}

}