Mercurial > hg > fulltextSearch
comparison src/de/mpiwg/dwinter/fulltext/ticket/LanguageWriter.java @ 0:72a015318a6d
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:20 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:72a015318a6d |
---|---|
1 package de.mpiwg.dwinter.fulltext.ticket; | |
2 | |
3 import java.io.File; | |
4 import java.io.FileNotFoundException; | |
5 import java.io.FileOutputStream; | |
6 import java.io.FileWriter; | |
7 import java.io.IOException; | |
8 import java.io.OutputStreamWriter; | |
9 import java.io.UnsupportedEncodingException; | |
10 import java.util.HashSet; | |
11 import java.util.Set; | |
12 | |
13 import org.apache.lucene.document.Document; | |
14 import org.apache.lucene.index.CorruptIndexException; | |
15 import org.apache.lucene.index.IndexReader; | |
16 import org.apache.lucene.search.Query; | |
17 import org.apache.lucene.search.ScoreDoc; | |
18 import org.apache.lucene.search.TopDocs; | |
19 import org.apache.lucene.search.TopScoreDocCollector; | |
20 | |
21 import sun.security.krb5.internal.PAEncTSEnc; | |
22 | |
23 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; | |
24 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; | |
25 import de.mpiwg.dwinter.lucencetools.documents.FileDocument; | |
26 | |
27 public class LanguageWriter extends TicketWriter{ | |
28 | |
29 private File languageFile; | |
30 private String language; | |
31 | |
32 public LanguageWriter(String ticket, String language) throws IOException{ | |
33 | |
34 super(ticket); | |
35 | |
36 languageFile = new File(ticketFile.getAbsolutePath()+PATHSEPARATOR+language); | |
37 if(!languageFile.exists()){ | |
38 logger.debug("Create Languagefolder:"+languageFile.getCanonicalPath()); | |
39 if(!languageFile.mkdirs()) | |
40 throw new IOException(); | |
41 } | |
42 | |
43 this.language=language; | |
44 } | |
45 | |
46 /** Schreibt das Ergebnis einer Suche in eine Datei im Ticket. | |
47 * @param docs | |
48 * @param reader | |
49 * @return Anzahl der tatsaechlich geschriebenen Treffer | |
50 * @throws CorruptIndexException | |
51 * @throws IOException | |
52 */ | |
53 public Integer writeResults(TopDocs docs, IndexReader reader) throws CorruptIndexException, IOException { | |
54 File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"result"); | |
55 FileOutputStream fs = new FileOutputStream(resultFile); | |
56 OutputStreamWriter rw = new OutputStreamWriter(fs,"utf-8"); | |
57 | |
58 Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus irgendwelche gruenden gibt es ein Dokument mehrfach in den Fundstellen | |
59 | |
60 //FileWriter rw = new FileWriter(resultFile); | |
61 | |
62 if(!languageFile.exists()){ | |
63 logger.debug("Create Languagefolder:"+languageFile.getCanonicalPath()); | |
64 if(!languageFile.mkdirs()) | |
65 throw new IOException(); | |
66 } | |
67 Integer counter =0; | |
68 for (ScoreDoc sd:docs.scoreDocs) | |
69 { | |
70 Document d = reader.document(sd.doc); | |
71 | |
72 String id = d.get("textId"); | |
73 if (!idsAlreadyDone.contains(id)){ // stelle sicher das alle treffer nur einmal in die date geschrieben werden. | |
74 idsAlreadyDone.add(id); | |
75 rw.write(FileDocument.toXML(d)+"\n"); | |
76 counter ++; | |
77 } | |
78 } | |
79 rw.close(); | |
80 return counter; | |
81 } | |
82 | |
83 | |
84 public void writeResultInfo(TopDocs docs, Integer counter) throws IOException { | |
85 File resultFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"resultInfo"); | |
86 FileOutputStream fs = new FileOutputStream(resultFile); | |
87 OutputStreamWriter rw = new OutputStreamWriter(fs,"utf-8"); | |
88 String ret = "<resultInfo>"; | |
89 int hits = docs.totalHits; | |
90 ret +="<lang>"+language+"</lang>"; | |
91 ret +="<hits>"+counter+"</hits>"; | |
92 ret +="<totalHits>"+hits+"</totalHits>"; | |
93 | |
94 ret+= "</resultInfo>"; | |
95 rw.write(ret); | |
96 rw.close(); | |
97 } | |
98 | |
99 | |
100 public void writeDoc(OCRDoc ocrDoc) throws IOException { | |
101 // erzeuge fuer jedes document einen ordner | |
102 | |
103 File docFile = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+ocrDoc.textId.replace(PATHSEPARATORCHAR, ':')); | |
104 if(!docFile.exists()){ | |
105 logger.debug("Create Docfolder:"+docFile.getCanonicalPath()); | |
106 if(!docFile.mkdirs()) | |
107 throw new IOException(); | |
108 } | |
109 | |
110 // jetzt fuer jese seite ein file | |
111 for (String page:ocrDoc.linesInPage.keySet()){ | |
112 File pageFile = new File(docFile.getAbsolutePath()+PATHSEPARATOR+page.replace(PATHSEPARATORCHAR, ':')); | |
113 FileWriter pageFileWriter = new FileWriter(pageFile); | |
114 | |
115 for (OCRLine line: ocrDoc.linesInPage.get(page)){ | |
116 pageFileWriter.write(line.toString()+"\n"); | |
117 } | |
118 pageFileWriter.close(); | |
119 } | |
120 } | |
121 | |
122 public void saveQuery(Query query) throws IOException { | |
123 | |
124 File qf = new File(languageFile.getAbsolutePath()+PATHSEPARATOR+"query"); | |
125 FileWriter fw = new FileWriter(qf); | |
126 fw.write(query.toString()); | |
127 fw.close(); | |
128 } | |
129 | |
130 } |