comparison src/de/mpiwg/dwinter/fulltext/ticket/LanguageWriter.java @ 0:72a015318a6d

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:20 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:72a015318a6d
1 package de.mpiwg.dwinter.fulltext.ticket;
2
3 import java.io.File;
4 import java.io.FileNotFoundException;
5 import java.io.FileOutputStream;
6 import java.io.FileWriter;
7 import java.io.IOException;
8 import java.io.OutputStreamWriter;
9 import java.io.UnsupportedEncodingException;
10 import java.util.HashSet;
11 import java.util.Set;
12
13 import org.apache.lucene.document.Document;
14 import org.apache.lucene.index.CorruptIndexException;
15 import org.apache.lucene.index.IndexReader;
16 import org.apache.lucene.search.Query;
17 import org.apache.lucene.search.ScoreDoc;
18 import org.apache.lucene.search.TopDocs;
19 import org.apache.lucene.search.TopScoreDocCollector;
20
21 import sun.security.krb5.internal.PAEncTSEnc;
22
23 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
24 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
25 import de.mpiwg.dwinter.lucencetools.documents.FileDocument;
26
27 public class LanguageWriter extends TicketWriter{
28
29 private File languageFile;
30 private String language;
31
32 public LanguageWriter(String ticket, String language) throws IOException{
33
34 super(ticket);
35
36 languageFile = new File(ticketFile.getAbsolutePath()+PATHSEPARATOR+language);
37 if(!languageFile.exists()){
38 logger.debug("Create Languagefolder:"+languageFile.getCanonicalPath());
39 if(!languageFile.mkdirs())
40 throw new IOException();
41 }
42
43 this.language=language;
44 }
45
46 /** Schreibt das Ergebnis einer Suche in eine Datei im Ticket.
47 * @param docs
48 * @param reader
49 * @return Anzahl der tatsaechlich geschriebenen Treffer
50 * @throws CorruptIndexException
51 * @throws IOException
52 */
53 public Integer writeResults(TopDocs docs, IndexReader reader) throws CorruptIndexException, IOException {
54 File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"result");
55 FileOutputStream fs = new FileOutputStream(resultFile);
56 OutputStreamWriter rw = new OutputStreamWriter(fs,"utf-8");
57
58 Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus irgendwelche gruenden gibt es ein Dokument mehrfach in den Fundstellen
59
60 //FileWriter rw = new FileWriter(resultFile);
61
62 if(!languageFile.exists()){
63 logger.debug("Create Languagefolder:"+languageFile.getCanonicalPath());
64 if(!languageFile.mkdirs())
65 throw new IOException();
66 }
67 Integer counter =0;
68 for (ScoreDoc sd:docs.scoreDocs)
69 {
70 Document d = reader.document(sd.doc);
71
72 String id = d.get("textId");
73 if (!idsAlreadyDone.contains(id)){ // stelle sicher das alle treffer nur einmal in die date geschrieben werden.
74 idsAlreadyDone.add(id);
75 rw.write(FileDocument.toXML(d)+"\n");
76 counter ++;
77 }
78 }
79 rw.close();
80 return counter;
81 }
82
83
84 public void writeResultInfo(TopDocs docs, Integer counter) throws IOException {
85 File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"resultInfo");
86 FileOutputStream fs = new FileOutputStream(resultFile);
87 OutputStreamWriter rw = new OutputStreamWriter(fs,"utf-8");
88 String ret = "<resultInfo>";
89 int hits = docs.totalHits;
90 ret +="<lang>"+language+"</lang>";
91 ret +="<hits>"+counter+"</hits>";
92 ret +="<totalHits>"+hits+"</totalHits>";
93
94 ret+= "</resultInfo>";
95 rw.write(ret);
96 rw.close();
97 }
98
99
100 public void writeDoc(OCRDoc ocrDoc) throws IOException {
101 // erzeuge fuer jedes document einen ordner
102
103 File docFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+ocrDoc.textId.replace(PATHSEPARATORCHAR, ':'));
104 if(!docFile.exists()){
105 logger.debug("Create Docfolder:"+docFile.getCanonicalPath());
106 if(!docFile.mkdirs())
107 throw new IOException();
108 }
109
110 // jetzt fuer jese seite ein file
111 for (String page:ocrDoc.linesInPage.keySet()){
112 File pageFile = new File(docFile.getAbsolutePath()+PATHSEPARATOR+page.replace(PATHSEPARATORCHAR, ':'));
113 FileWriter pageFileWriter = new FileWriter(pageFile);
114
115 for (OCRLine line: ocrDoc.linesInPage.get(page)){
116 pageFileWriter.write(line.toString()+"\n");
117 }
118 pageFileWriter.close();
119 }
120 }
121
122 public void saveQuery(Query query) throws IOException {
123
124 File qf = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"query");
125 FileWriter fw = new FileWriter(qf);
126 fw.write(query.toString());
127 fw.close();
128 }
129
130 }