annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/HarvesterThreaded.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
dc7622afcfea initial
dwinter
parents:
diff changeset
2 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
5 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
dc7622afcfea initial
dwinter
parents:
diff changeset
6 /* */ import java.io.BufferedReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 /* */ import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 /* */ import java.io.FileNotFoundException;
dc7622afcfea initial
dwinter
parents:
diff changeset
9 /* */ import java.io.FileReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
10 /* */ import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 /* */ import java.io.PrintStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
12 /* */ import java.util.ArrayList;
dc7622afcfea initial
dwinter
parents:
diff changeset
13 /* */ import java.util.Arrays;
dc7622afcfea initial
dwinter
parents:
diff changeset
14 /* */ import java.util.Date;
dc7622afcfea initial
dwinter
parents:
diff changeset
15 /* */ import java.util.HashMap;
dc7622afcfea initial
dwinter
parents:
diff changeset
16 /* */ import java.util.List;
dc7622afcfea initial
dwinter
parents:
diff changeset
17 /* */ import org.apache.lucene.analysis.de.GermanAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
18 /* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
19 /* */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
20 /* */ import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
21 /* */ import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
22 /* */ import org.apache.lucene.util.Version;
dc7622afcfea initial
dwinter
parents:
diff changeset
23 /* */ import org.jdom.Document;
dc7622afcfea initial
dwinter
parents:
diff changeset
24 /* */ import org.jdom.Element;
dc7622afcfea initial
dwinter
parents:
diff changeset
25 /* */ import org.jdom.JDOMException;
dc7622afcfea initial
dwinter
parents:
diff changeset
26 /* */ import org.jdom.input.SAXBuilder;
dc7622afcfea initial
dwinter
parents:
diff changeset
27 /* */ import org.jdom.xpath.XPath;
dc7622afcfea initial
dwinter
parents:
diff changeset
28 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
29 /* */ public class HarvesterThreaded
dc7622afcfea initial
dwinter
parents:
diff changeset
30 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
31 /* */ private static final boolean DEBUG = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
32 private static final int MAXFILES = 100; // only used if DEBUG is true
dc7622afcfea initial
dwinter
parents:
diff changeset
33 /* 75 */ protected static ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
34 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
35 /* 77 */ protected static ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
36 /* 78 */ protected static boolean indexMetaPriority = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
37 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
38 /* 81 */ private static String specialMode = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
39 /* 82 */ protected static int maxThread = 30;
dc7622afcfea initial
dwinter
parents:
diff changeset
40 /* */ protected File docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
41 /* */ protected File index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
42 /* 88 */ protected HashMap<String, String> textLanguage = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
43 /* 89 */ protected HashMap<String, String> languageToISO = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
44 /* 90 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
dc7622afcfea initial
dwinter
parents:
diff changeset
45 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
46 /* 92 */ private int counter = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
47 /* */ protected String languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
48 /* 99 */ protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
dc7622afcfea initial
dwinter
parents:
diff changeset
49 /* 100 */ private int filecount = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
50 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
51 /* 102 */ protected String mdProviderUrl = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
52 /* */ private String preferedLanguage;
dc7622afcfea initial
dwinter
parents:
diff changeset
53 /* 106 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
54 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
55 /* */ public HarvesterThreaded()
dc7622afcfea initial
dwinter
parents:
diff changeset
56 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
57 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
58 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
59 /* */ public HarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
60 /* */ throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
61 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
62 /* 119 */ this.docDir = docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
63 /* 120 */ this.languageFileName = languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
64 /* 121 */ this.preferedLanguage = lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
65 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
66 /* 133 */ this.mdProviderUrl = mdProviderUrl;
dc7622afcfea initial
dwinter
parents:
diff changeset
67 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
68 /* 135 */ this.index_dir = index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
69 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
70 /* 137 */ for (int i = 0; i < maxThread; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
71 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
72 /* 139 */ this.mythreads[i] = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
73 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
74 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
75 /* 142 */ init_languages();
dc7622afcfea initial
dwinter
parents:
diff changeset
76 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
77 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
78 /* */ private void init_languages() {
dc7622afcfea initial
dwinter
parents:
diff changeset
79 /* 146 */ this.languageToISO.put("German", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
80 /* 147 */ this.languageToISO.put("French", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
81 /* 148 */ this.languageToISO.put("English", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
82 /* 149 */ this.languageToISO.put("German-f", "de-f");
dc7622afcfea initial
dwinter
parents:
diff changeset
83 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
84 /* 151 */ this.supportedLanguageFolder.put("deu", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
85 /* 152 */ this.supportedLanguageFolder.put("deu-f", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
86 /* 153 */ this.supportedLanguageFolder.put("fra", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
87 /* 154 */ this.supportedLanguageFolder.put("eng", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
88 /* 155 */ this.supportedLanguageFolder.put("lic", "la");
dc7622afcfea initial
dwinter
parents:
diff changeset
89 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
90 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
91 /* 158 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
92 /* 159 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
93 /* 160 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
94 /* 161 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
95 /* 162 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
96 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
97 /* 164 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
98 /* 165 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
99 /* */ } catch (CorruptIndexException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
100 /* 167 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
101 /* 168 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
102 /* */ } catch (LockObtainFailedException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
103 /* 170 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
104 /* 171 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
105 /* */ } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
106 /* 173 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
107 /* 174 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
108 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
109 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
110 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
111 /* */ public HarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
112 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
113 /* 180 */ this(docDir, index_dir, null, mdProviderUrl, null);
dc7622afcfea initial
dwinter
parents:
diff changeset
114 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
115 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
116 /* */ protected HashMap<String, String> loadLanguages()
dc7622afcfea initial
dwinter
parents:
diff changeset
117 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
118 /* 187 */ File languageFile = new File(this.docDir + "/" + this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
119 /* 188 */ String languageFilePath = this.docDir + "/" + this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
120 /* 189 */ HashMap languages = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
121 /* 190 */ boolean relativ = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
122 /* 191 */ if (this.languageFileName == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
123 /* 192 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
124 /* 193 */ if (!languageFile.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
125 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
126 /* 195 */ languageFile = new File(this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
127 /* 196 */ languageFilePath = this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
128 /* 197 */ relativ = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
129 /* 198 */ if (!languageFile.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
130 /* 199 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
131 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
132 /* */ BufferedReader in;
dc7622afcfea initial
dwinter
parents:
diff changeset
133 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
134 /* 203 */ in = new BufferedReader(new FileReader(languageFilePath));
dc7622afcfea initial
dwinter
parents:
diff changeset
135 /* */ } catch (FileNotFoundException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
136 /* 205 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
137 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
138 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
139 /* 208 */ String zeile = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
140 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
141 /* 210 */ while ((zeile = in.readLine()) != null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
142 /* 211 */ String[] splitted = zeile.replace("\"", "").split("[,]");
dc7622afcfea initial
dwinter
parents:
diff changeset
143 /* 212 */ if (splitted.length == 2)
dc7622afcfea initial
dwinter
parents:
diff changeset
144 /* 213 */ if (relativ)
dc7622afcfea initial
dwinter
parents:
diff changeset
145 /* 214 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
146 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
147 /* 216 */ languages.put(splitted[0], splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
148 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
149 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
150 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
151 /* 220 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
152 /* 221 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
153 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
154 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
155 /* 224 */ return languages;
dc7622afcfea initial
dwinter
parents:
diff changeset
156 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
157 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
158 /* */ public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException {
dc7622afcfea initial
dwinter
parents:
diff changeset
159 /* 228 */ Date start = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
160 /* 229 */ boolean create = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
161 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
162 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
163 /* 240 */ System.out.println("Indexing to directory '" + this.index_dir + "'...");
dc7622afcfea initial
dwinter
parents:
diff changeset
164 /* 241 */ ArrayList files = getFileListFromRDF(rdffilepath);
dc7622afcfea initial
dwinter
parents:
diff changeset
165 /* 242 */ indexDocs(files);
dc7622afcfea initial
dwinter
parents:
diff changeset
166 /* 243 */ System.out.println("Optimizing...");
dc7622afcfea initial
dwinter
parents:
diff changeset
167 /* 244 */ this.languageAnalyzers.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
168 /* 245 */ this.languageAnalyzers.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
169 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
170 /* 247 */ Date end = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
171 /* 248 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds");
dc7622afcfea initial
dwinter
parents:
diff changeset
172 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
173 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
174 /* 251 */ System.out.println(" caught a " + e.getClass() +
dc7622afcfea initial
dwinter
parents:
diff changeset
175 /* 252 */ "\n with message: " + e.getMessage());
dc7622afcfea initial
dwinter
parents:
diff changeset
176 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
177 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
178 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
179 /* */ private ArrayList<String> getFileListFromRDF(String rdffilepath)
dc7622afcfea initial
dwinter
parents:
diff changeset
180 /* */ throws JDOMException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
181 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
182 /* 260 */ ArrayList ret = new ArrayList();
dc7622afcfea initial
dwinter
parents:
diff changeset
183 /* 261 */ SAXBuilder builder = new SAXBuilder();
dc7622afcfea initial
dwinter
parents:
diff changeset
184 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
185 /* 263 */ Document doc = builder.build(rdffilepath);
dc7622afcfea initial
dwinter
parents:
diff changeset
186 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
187 /* 265 */ Element el = doc.getRootElement();
dc7622afcfea initial
dwinter
parents:
diff changeset
188 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
189 /* 267 */ XPath xpath = XPath.newInstance("//MPIWG:archive-path");
dc7622afcfea initial
dwinter
parents:
diff changeset
190 /* 268 */ xpath.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
dc7622afcfea initial
dwinter
parents:
diff changeset
191 /* 269 */ List<Element> paths = xpath.selectNodes(el);
dc7622afcfea initial
dwinter
parents:
diff changeset
192 /* 270 */ for (Element path : paths) {
dc7622afcfea initial
dwinter
parents:
diff changeset
193 /* 271 */ ret.add(path.getText());
dc7622afcfea initial
dwinter
parents:
diff changeset
194 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
195 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
196 /* 274 */ return ret;
dc7622afcfea initial
dwinter
parents:
diff changeset
197 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
198 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
199 /* */ public void harvestFolder() throws InterruptedException {
dc7622afcfea initial
dwinter
parents:
diff changeset
200 /* 278 */ Date start = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
201 /* 279 */ boolean create = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
202 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
203 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
204 /* 290 */ System.out.println("Indexing to directory '" + this.index_dir + "'...");
dc7622afcfea initial
dwinter
parents:
diff changeset
205 /* 291 */ indexDocs(this.docDir);
dc7622afcfea initial
dwinter
parents:
diff changeset
206 /* 292 */ System.out.println("Optimizing...");
dc7622afcfea initial
dwinter
parents:
diff changeset
207 /* 293 */ this.languageAnalyzers.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
208 /* 294 */ this.languageAnalyzers.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
209 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
210 /* 296 */ Date end = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
211 /* 297 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds");
dc7622afcfea initial
dwinter
parents:
diff changeset
212 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
213 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
214 /* 300 */ System.out.println(" caught a " + e.getClass() +
dc7622afcfea initial
dwinter
parents:
diff changeset
215 /* 301 */ "\n with message: " + e.getMessage());
dc7622afcfea initial
dwinter
parents:
diff changeset
216 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
217 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
218 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
219 /* */ private void indexDocs(ArrayList<String> files)
dc7622afcfea initial
dwinter
parents:
diff changeset
220 /* */ throws IOException, InterruptedException
dc7622afcfea initial
dwinter
parents:
diff changeset
221 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
222 /* 308 */ for (String filename : files)
dc7622afcfea initial
dwinter
parents:
diff changeset
223 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
224 /* 310 */ indexDocs(new File(this.docDir.getAbsolutePath() + filename));
dc7622afcfea initial
dwinter
parents:
diff changeset
225 if ((DEBUG==true) & (this.filecount>MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
226 break;
dc7622afcfea initial
dwinter
parents:
diff changeset
227 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
228 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
229 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
230 /* */ void indexDocs(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
231 /* */ throws IOException, InterruptedException
dc7622afcfea initial
dwinter
parents:
diff changeset
232 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
233 /* 317 */ if (!file.canRead())
dc7622afcfea initial
dwinter
parents:
diff changeset
234 /* */ return;
dc7622afcfea initial
dwinter
parents:
diff changeset
235 /* 319 */ if (file.isDirectory())
dc7622afcfea initial
dwinter
parents:
diff changeset
236 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
237 /* 321 */ if ((DEBUG==true) && (this.filecount>MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
238 return;
dc7622afcfea initial
dwinter
parents:
diff changeset
239 /* 325 */ String[] files = file.list();
dc7622afcfea initial
dwinter
parents:
diff changeset
240 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
241 /* 327 */ String folderName = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
242 /* 328 */ if ((((files != null) ? 1 : 0) & ((excludeFolders.contains(folderName)) ? 0 : 1)) != 0)
dc7622afcfea initial
dwinter
parents:
diff changeset
243 /* 329 */ for (int i = 0; i < files.length; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
244 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
245 /* 332 */ indexDocs(new File(file, files[i]));
dc7622afcfea initial
dwinter
parents:
diff changeset
246 if ((DEBUG==true) && (this.filecount>MAXFILES))
dc7622afcfea initial
dwinter
parents:
diff changeset
247 break;
dc7622afcfea initial
dwinter
parents:
diff changeset
248 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
249 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
250 /* 335 */ else if (isTextFile(file))
dc7622afcfea initial
dwinter
parents:
diff changeset
251 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
252 /* 338 */ processFile(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
253 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
254 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
255 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
256 /* 342 */ System.out.println("not adding " + file);
dc7622afcfea initial
dwinter
parents:
diff changeset
257 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
258 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
259 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
260 /* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
261 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
262 /* 348 */ int freeThread = -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
263 /* 349 */ while (freeThread == -1)
dc7622afcfea initial
dwinter
parents:
diff changeset
264 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
265 /* 351 */ freeThread = waitForFreeThread();
dc7622afcfea initial
dwinter
parents:
diff changeset
266 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
267 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
268 /* 355 */ if (this.textLanguage == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
269 /* 356 */ this.textLanguage = loadLanguages();
dc7622afcfea initial
dwinter
parents:
diff changeset
270 /* 357 */ this.mythreads[freeThread] = new ProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
dc7622afcfea initial
dwinter
parents:
diff changeset
271 /* 358 */ this.mythreads[freeThread].start();
dc7622afcfea initial
dwinter
parents:
diff changeset
272 /* 359 */ System.out.println("New process started:" + freeThread);
dc7622afcfea initial
dwinter
parents:
diff changeset
273 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
274 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
275 /* */ protected int waitForFreeThread()
dc7622afcfea initial
dwinter
parents:
diff changeset
276 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
277 /* 367 */ for (int i = 0; i < maxThread; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
278 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
279 /* 369 */ if (this.mythreads[i] == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
280 /* 370 */ return i;
dc7622afcfea initial
dwinter
parents:
diff changeset
281 /* 371 */ if (!this.mythreads[i].done)
dc7622afcfea initial
dwinter
parents:
diff changeset
282 /* */ continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
283 /* 373 */ this.filecount += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
284 /* 374 */ System.out.println("filecount:" + this.filecount);
dc7622afcfea initial
dwinter
parents:
diff changeset
285 /* 375 */ return i;
dc7622afcfea initial
dwinter
parents:
diff changeset
286 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
287 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
288 /* 378 */ return -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
289 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
290 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
291 /* */ private boolean isTextFile(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
292 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
293 /* 392 */ String fn = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
294 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
295 /* 394 */ String[] splitted = fn.split("[.]");
dc7622afcfea initial
dwinter
parents:
diff changeset
296 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
297 /* 396 */ String ext = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
298 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
299 /* 398 */ if (splitted.length > 1)
dc7622afcfea initial
dwinter
parents:
diff changeset
300 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
301 /* 400 */ ext = splitted[(splitted.length - 1)];
dc7622afcfea initial
dwinter
parents:
diff changeset
302 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
303 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
304 /* 403 */ return fileTypesToIndex.contains(ext);
dc7622afcfea initial
dwinter
parents:
diff changeset
305 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
306 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
307
dc7622afcfea initial
dwinter
parents:
diff changeset
308 /* Location: /private/tmp/fulltextIndexer.jar
dc7622afcfea initial
dwinter
parents:
diff changeset
309 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded
dc7622afcfea initial
dwinter
parents:
diff changeset
310 * JD-Core Version: 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
311 */