Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/HarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester; | |
2 /* */ | |
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; | |
4 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; | |
5 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; | |
6 /* */ import java.io.BufferedReader; | |
7 /* */ import java.io.File; | |
8 /* */ import java.io.FileNotFoundException; | |
9 /* */ import java.io.FileReader; | |
10 /* */ import java.io.IOException; | |
11 /* */ import java.io.PrintStream; | |
12 /* */ import java.util.ArrayList; | |
13 /* */ import java.util.Arrays; | |
14 /* */ import java.util.Date; | |
15 /* */ import java.util.HashMap; | |
16 /* */ import java.util.List; | |
17 /* */ import org.apache.lucene.analysis.de.GermanAnalyzer; | |
18 /* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer; | |
19 /* */ import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
20 /* */ import org.apache.lucene.index.CorruptIndexException; | |
21 /* */ import org.apache.lucene.store.LockObtainFailedException; | |
22 /* */ import org.apache.lucene.util.Version; | |
23 /* */ import org.jdom.Document; | |
24 /* */ import org.jdom.Element; | |
25 /* */ import org.jdom.JDOMException; | |
26 /* */ import org.jdom.input.SAXBuilder; | |
27 /* */ import org.jdom.xpath.XPath; | |
28 /* */ | |
29 /* */ public class HarvesterThreaded | |
30 /* */ { | |
31 /* */ private static final boolean DEBUG = false; | |
32 private static final int MAXFILES = 100; // only used if DEBUG is true | |
33 /* 75 */ protected static ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" })); | |
34 /* */ | |
35 /* 77 */ protected static ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" })); | |
36 /* 78 */ protected static boolean indexMetaPriority = false; | |
37 /* */ | |
38 /* 81 */ private static String specialMode = ""; | |
39 /* 82 */ protected static int maxThread = 30; | |
40 /* */ protected File docDir; | |
41 /* */ protected File index_dir; | |
42 /* 88 */ protected HashMap<String, String> textLanguage = null; | |
43 /* 89 */ protected HashMap<String, String> languageToISO = new HashMap(); | |
44 /* 90 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); | |
45 /* */ | |
46 /* 92 */ private int counter = 0; | |
47 /* */ protected String languageFileName; | |
48 /* 99 */ protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread]; | |
49 /* 100 */ private int filecount = 0; | |
50 /* */ | |
51 /* 102 */ protected String mdProviderUrl = null; | |
52 /* */ private String preferedLanguage; | |
53 /* 106 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap(); | |
54 /* */ | |
55 /* */ public HarvesterThreaded() | |
56 /* */ { | |
57 /* */ } | |
58 /* */ | |
59 /* */ public HarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang) | |
60 /* */ throws CorruptIndexException, LockObtainFailedException, IOException | |
61 /* */ { | |
62 /* 119 */ this.docDir = docDir; | |
63 /* 120 */ this.languageFileName = languageFileName; | |
64 /* 121 */ this.preferedLanguage = lang; | |
65 /* */ | |
66 /* 133 */ this.mdProviderUrl = mdProviderUrl; | |
67 /* */ | |
68 /* 135 */ this.index_dir = index_dir; | |
69 /* */ | |
70 /* 137 */ for (int i = 0; i < maxThread; ++i) | |
71 /* */ { | |
72 /* 139 */ this.mythreads[i] = null; | |
73 /* */ } | |
74 /* */ | |
75 /* 142 */ init_languages(); | |
76 /* */ } | |
77 /* */ | |
78 /* */ private void init_languages() { | |
79 /* 146 */ this.languageToISO.put("German", "de"); | |
80 /* 147 */ this.languageToISO.put("French", "fr"); | |
81 /* 148 */ this.languageToISO.put("English", "en"); | |
82 /* 149 */ this.languageToISO.put("German-f", "de-f"); | |
83 /* */ | |
84 /* 151 */ this.supportedLanguageFolder.put("deu", "de"); | |
85 /* 152 */ this.supportedLanguageFolder.put("deu-f", "de"); | |
86 /* 153 */ this.supportedLanguageFolder.put("fra", "fr"); | |
87 /* 154 */ this.supportedLanguageFolder.put("eng", "en"); | |
88 /* 155 */ this.supportedLanguageFolder.put("lic", "la"); | |
89 /* */ try | |
90 /* */ { | |
91 /* 158 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); | |
92 /* 159 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); | |
93 /* 160 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
94 /* 161 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); | |
95 /* 162 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
96 /* */ | |
97 /* 164 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
98 /* 165 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); | |
99 /* */ } catch (CorruptIndexException e) { | |
100 /* 167 */ e.printStackTrace(); | |
101 /* 168 */ System.exit(1); | |
102 /* */ } catch (LockObtainFailedException e) { | |
103 /* 170 */ e.printStackTrace(); | |
104 /* 171 */ System.exit(1); | |
105 /* */ } catch (IOException e) { | |
106 /* 173 */ e.printStackTrace(); | |
107 /* 174 */ System.exit(1); | |
108 /* */ } | |
109 /* */ } | |
110 /* */ | |
111 /* */ public HarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException | |
112 /* */ { | |
113 /* 180 */ this(docDir, index_dir, null, mdProviderUrl, null); | |
114 /* */ } | |
115 /* */ | |
116 /* */ protected HashMap<String, String> loadLanguages() | |
117 /* */ { | |
118 /* 187 */ File languageFile = new File(this.docDir + "/" + this.languageFileName); | |
119 /* 188 */ String languageFilePath = this.docDir + "/" + this.languageFileName; | |
120 /* 189 */ HashMap languages = new HashMap(); | |
121 /* 190 */ boolean relativ = true; | |
122 /* 191 */ if (this.languageFileName == null) | |
123 /* 192 */ return null; | |
124 /* 193 */ if (!languageFile.exists()) | |
125 /* */ { | |
126 /* 195 */ languageFile = new File(this.languageFileName); | |
127 /* 196 */ languageFilePath = this.languageFileName; | |
128 /* 197 */ relativ = false; | |
129 /* 198 */ if (!languageFile.exists()) | |
130 /* 199 */ return null; | |
131 /* */ } | |
132 /* */ BufferedReader in; | |
133 /* */ try { | |
134 /* 203 */ in = new BufferedReader(new FileReader(languageFilePath)); | |
135 /* */ } catch (FileNotFoundException e) { | |
136 /* 205 */ return null; | |
137 /* */ } | |
138 /* */ | |
139 /* 208 */ String zeile = null; | |
140 /* */ try { | |
141 /* 210 */ while ((zeile = in.readLine()) != null) { | |
142 /* 211 */ String[] splitted = zeile.replace("\"", "").split("[,]"); | |
143 /* 212 */ if (splitted.length == 2) | |
144 /* 213 */ if (relativ) | |
145 /* 214 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]); | |
146 /* */ else | |
147 /* 216 */ languages.put(splitted[0], splitted[1]); | |
148 /* */ } | |
149 /* */ } | |
150 /* */ catch (IOException e) { | |
151 /* 220 */ e.printStackTrace(); | |
152 /* 221 */ return null; | |
153 /* */ } | |
154 /* */ | |
155 /* 224 */ return languages; | |
156 /* */ } | |
157 /* */ | |
158 /* */ public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException { | |
159 /* 228 */ Date start = new Date(); | |
160 /* 229 */ boolean create = true; | |
161 /* */ try | |
162 /* */ { | |
163 /* 240 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); | |
164 /* 241 */ ArrayList files = getFileListFromRDF(rdffilepath); | |
165 /* 242 */ indexDocs(files); | |
166 /* 243 */ System.out.println("Optimizing..."); | |
167 /* 244 */ this.languageAnalyzers.optimize(); | |
168 /* 245 */ this.languageAnalyzers.close(); | |
169 /* */ | |
170 /* 247 */ Date end = new Date(); | |
171 /* 248 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); | |
172 /* */ } | |
173 /* */ catch (IOException e) { | |
174 /* 251 */ System.out.println(" caught a " + e.getClass() + | |
175 /* 252 */ "\n with message: " + e.getMessage()); | |
176 /* */ } | |
177 /* */ } | |
178 /* */ | |
179 /* */ private ArrayList<String> getFileListFromRDF(String rdffilepath) | |
180 /* */ throws JDOMException, IOException | |
181 /* */ { | |
182 /* 260 */ ArrayList ret = new ArrayList(); | |
183 /* 261 */ SAXBuilder builder = new SAXBuilder(); | |
184 /* */ | |
185 /* 263 */ Document doc = builder.build(rdffilepath); | |
186 /* */ | |
187 /* 265 */ Element el = doc.getRootElement(); | |
188 /* */ | |
189 /* 267 */ XPath xpath = XPath.newInstance("//MPIWG:archive-path"); | |
190 /* 268 */ xpath.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"); | |
191 /* 269 */ List<Element> paths = xpath.selectNodes(el); | |
192 /* 270 */ for (Element path : paths) { | |
193 /* 271 */ ret.add(path.getText()); | |
194 /* */ } | |
195 /* */ | |
196 /* 274 */ return ret; | |
197 /* */ } | |
198 /* */ | |
199 /* */ public void harvestFolder() throws InterruptedException { | |
200 /* 278 */ Date start = new Date(); | |
201 /* 279 */ boolean create = true; | |
202 /* */ try | |
203 /* */ { | |
204 /* 290 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); | |
205 /* 291 */ indexDocs(this.docDir); | |
206 /* 292 */ System.out.println("Optimizing..."); | |
207 /* 293 */ this.languageAnalyzers.optimize(); | |
208 /* 294 */ this.languageAnalyzers.close(); | |
209 /* */ | |
210 /* 296 */ Date end = new Date(); | |
211 /* 297 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); | |
212 /* */ } | |
213 /* */ catch (IOException e) { | |
214 /* 300 */ System.out.println(" caught a " + e.getClass() + | |
215 /* 301 */ "\n with message: " + e.getMessage()); | |
216 /* */ } | |
217 /* */ } | |
218 /* */ | |
219 /* */ private void indexDocs(ArrayList<String> files) | |
220 /* */ throws IOException, InterruptedException | |
221 /* */ { | |
222 /* 308 */ for (String filename : files) | |
223 /* */ { | |
224 /* 310 */ indexDocs(new File(this.docDir.getAbsolutePath() + filename)); | |
225 if ((DEBUG==true) & (this.filecount>MAXFILES)) | |
226 break; | |
227 /* */ } | |
228 /* */ } | |
229 /* */ | |
230 /* */ void indexDocs(File file) | |
231 /* */ throws IOException, InterruptedException | |
232 /* */ { | |
233 /* 317 */ if (!file.canRead()) | |
234 /* */ return; | |
235 /* 319 */ if (file.isDirectory()) | |
236 /* */ { | |
237 /* 321 */ if ((DEBUG==true) && (this.filecount>MAXFILES)) | |
238 return; | |
239 /* 325 */ String[] files = file.list(); | |
240 /* */ | |
241 /* 327 */ String folderName = file.getName(); | |
242 /* 328 */ if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 : 1)) != 0) | |
243 /* 329 */ for (int i = 0; i < files.length; ++i) | |
244 /* */ { | |
245 /* 332 */ indexDocs(new File(file, files[i])); | |
246 if ((DEBUG==true) && (this.filecount>MAXFILES)) | |
247 break; | |
248 /* */ } | |
249 /* */ } | |
250 /* 335 */ else if (isTextFile(file)) | |
251 /* */ { | |
252 /* 338 */ processFile(file); | |
253 /* */ } | |
254 /* */ else | |
255 /* */ { | |
256 /* 342 */ System.out.println("not adding " + file); | |
257 /* */ } | |
258 /* */ } | |
259 /* */ | |
260 /* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException | |
261 /* */ { | |
262 /* 348 */ int freeThread = -1; | |
263 /* 349 */ while (freeThread == -1) | |
264 /* */ { | |
265 /* 351 */ freeThread = waitForFreeThread(); | |
266 /* */ } | |
267 /* */ | |
268 /* 355 */ if (this.textLanguage == null) | |
269 /* 356 */ this.textLanguage = loadLanguages(); | |
270 /* 357 */ this.mythreads[freeThread] = new ProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder); | |
271 /* 358 */ this.mythreads[freeThread].start(); | |
272 /* 359 */ System.out.println("New process started:" + freeThread); | |
273 /* */ } | |
274 /* */ | |
275 /* */ protected int waitForFreeThread() | |
276 /* */ { | |
277 /* 367 */ for (int i = 0; i < maxThread; ++i) | |
278 /* */ { | |
279 /* 369 */ if (this.mythreads[i] == null) | |
280 /* 370 */ return i; | |
281 /* 371 */ if (!this.mythreads[i].done) | |
282 /* */ continue; | |
283 /* 373 */ this.filecount += 1; | |
284 /* 374 */ System.out.println("filecount:" + this.filecount); | |
285 /* 375 */ return i; | |
286 /* */ } | |
287 /* */ | |
288 /* 378 */ return -1; | |
289 /* */ } | |
290 /* */ | |
291 /* */ private boolean isTextFile(File file) | |
292 /* */ { | |
293 /* 392 */ String fn = file.getName(); | |
294 /* */ | |
295 /* 394 */ String[] splitted = fn.split("[.]"); | |
296 /* */ | |
297 /* 396 */ String ext = ""; | |
298 /* */ | |
299 /* 398 */ if (splitted.length > 1) | |
300 /* */ { | |
301 /* 400 */ ext = splitted[(splitted.length - 1)]; | |
302 /* */ } | |
303 /* */ | |
304 /* 403 */ return fileTypesToIndex.contains(ext); | |
305 /* */ } | |
306 /* */ } | |
307 | |
308 /* Location: /private/tmp/fulltextIndexer.jar | |
309 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded | |
310 * JD-Core Version: 0.5.4 | |
311 */ |