Mercurial > hg > nutch-mpiwg-plugins
comparison conf/schema-solr4.xml @ 0:3b37d71af924 default tip
iniitial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b37d71af924 |
---|---|
1 <?xml version="1.0" encoding="UTF-8" ?> | |
2 <!-- | |
3 Licensed to the Apache Software Foundation (ASF) under one or more | |
4 contributor license agreements. See the NOTICE file distributed with | |
5 this work for additional information regarding copyright ownership. | |
6 The ASF licenses this file to You under the Apache License, Version 2.0 | |
7 (the "License"); you may not use this file except in compliance with | |
8 the License. You may obtain a copy of the License at | |
9 | |
10 http://www.apache.org/licenses/LICENSE-2.0 | |
11 | |
12 Unless required by applicable law or agreed to in writing, software | |
13 distributed under the License is distributed on an "AS IS" BASIS, | |
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 See the License for the specific language governing permissions and | |
16 limitations under the License. | |
17 --> | |
18 <!-- | |
19 Description: This document contains Solr 4.x schema definition to | |
20 be used with Solr integration currently built into Nutch. | |
21 This schema is not minimal: some useful field type definitions are left in, | |
22 and the set of fields and their flags (indexed/stored/term vectors) can be | |
23 further optimized depending on needs. See | |
24 http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup | |
25 for more info. | |
26 --> | |
27 | |
28 <schema name="nutch" version="1.5"> | |
29 | |
30 <types> | |
31 | |
32 <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> | |
33 <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> | |
34 | |
35 | |
36 <!-- | |
37 Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. | |
38 --> | |
39 <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | |
40 <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | |
41 <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | |
42 <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | |
43 | |
44 <!-- | |
45 Numeric field types that index each value at various levels of precision | |
46 to accelerate range queries when the number of values between the range | |
47 endpoints is large. See the javadoc for NumericRangeQuery for internal | |
48 implementation details. | |
49 | |
50 Smaller precisionStep values (specified in bits) will lead to more tokens | |
51 indexed per value, slightly larger index size, and faster range queries. | |
52 A precisionStep of 0 disables indexing at different precision levels. | |
53 --> | |
54 <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> | |
55 <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> | |
56 <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> | |
57 <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> | |
58 | |
59 <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and | |
60 is a more restricted form of the canonical representation of dateTime | |
61 http://www.w3.org/TR/xmlschema-2/#dateTime | |
62 The trailing "Z" designates UTC time and is mandatory. | |
63 Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z | |
64 All other components are mandatory. | |
65 | |
66 Expressions can also be used to denote calculations that should be | |
67 performed relative to "NOW" to determine the value, e.g.: | |
68 | |
69 NOW/HOUR | |
70 ... Round to the start of the current hour | |
71 NOW-1DAY | |
72 ... Exactly 1 day prior to now | |
73 NOW/DAY+6MONTHS+3DAYS | |
74 ... 6 months and 3 days in the future from the start of | |
75 the current day | |
76 | |
77 Consult the DateField javadocs for more information. | |
78 | |
79 Note: For faster range queries, consider the tdate type | |
80 --> | |
81 <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> | |
82 | |
83 <!-- A Trie based date field for faster date range queries and date faceting. --> | |
84 <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/> | |
85 | |
86 | |
87 <!-- solr.TextField allows the specification of custom text analyzers | |
88 specified as a tokenizer and a list of token filters. Different | |
89 analyzers may be specified for indexing and querying. | |
90 | |
91 The optional positionIncrementGap puts space between multiple fields of | |
92 this type on the same document, with the purpose of preventing false phrase | |
93 matching across fields. | |
94 | |
95 For more info on customizing your analyzer chain, please see | |
96 http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters | |
97 --> | |
98 | |
99 <!-- A general text field that has reasonable, generic | |
100 cross-language defaults: it tokenizes with StandardTokenizer, | |
101 removes stop words from case-insensitive "stopwords.txt" | |
102 (empty by default), and down cases. At query time only, it | |
103 also applies synonyms. --> | |
104 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> | |
105 <analyzer type="index"> | |
106 <tokenizer class="solr.StandardTokenizerFactory"/> | |
107 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> | |
108 <!-- in this example, we will only use synonyms at query time | |
109 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> | |
110 --> | |
111 <filter class="solr.LowerCaseFilterFactory"/> | |
112 </analyzer> | |
113 <analyzer type="query"> | |
114 <tokenizer class="solr.StandardTokenizerFactory"/> | |
115 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> | |
116 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> | |
117 <filter class="solr.LowerCaseFilterFactory"/> | |
118 </analyzer> | |
119 </fieldType> | |
120 | |
121 <!-- A text field with defaults appropriate for English: it | |
122 tokenizes with StandardTokenizer, removes English stop words | |
123 (stopwords.txt), down cases, protects words from protwords.txt, and | |
124 finally applies Porter's stemming. The query time analyzer | |
125 also applies synonyms from synonyms.txt. --> | |
126 <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> | |
127 <analyzer type="index"> | |
128 <tokenizer class="solr.StandardTokenizerFactory"/> | |
129 <!-- in this example, we will only use synonyms at query time | |
130 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> | |
131 --> | |
132 <!-- Case insensitive stop word removal. | |
133 add enablePositionIncrements=true in both the index and query | |
134 analyzers to leave a 'gap' for more accurate phrase queries. | |
135 --> | |
136 <filter class="solr.StopFilterFactory" | |
137 ignoreCase="true" | |
138 words="stopwords.txt" | |
139 enablePositionIncrements="true" | |
140 /> | |
141 <filter class="solr.LowerCaseFilterFactory"/> | |
142 <filter class="solr.EnglishPossessiveFilterFactory"/> | |
143 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> | |
144 <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: | |
145 <filter class="solr.EnglishMinimalStemFilterFactory"/> | |
146 --> | |
147 <filter class="solr.PorterStemFilterFactory"/> | |
148 </analyzer> | |
149 <analyzer type="query"> | |
150 <tokenizer class="solr.StandardTokenizerFactory"/> | |
151 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> | |
152 <filter class="solr.StopFilterFactory" | |
153 ignoreCase="true" | |
154 words="stopwords.txt" | |
155 enablePositionIncrements="true" | |
156 /> | |
157 <filter class="solr.LowerCaseFilterFactory"/> | |
158 <filter class="solr.EnglishPossessiveFilterFactory"/> | |
159 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> | |
160 <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: | |
161 <filter class="solr.EnglishMinimalStemFilterFactory"/> | |
162 --> | |
163 <filter class="solr.PorterStemFilterFactory"/> | |
164 </analyzer> | |
165 </fieldType> | |
166 | |
167 <!-- A text field with defaults appropriate for English, plus | |
168 aggressive word-splitting and autophrase features enabled. | |
169 This field is just like text_en, except it adds | |
170 WordDelimiterFilter to enable splitting and matching of | |
171 words on case-change, alpha numeric boundaries, and | |
172 non-alphanumeric chars. This means certain compound word | |
173 cases will work, for example query "wi fi" will match | |
174 document "WiFi" or "wi-fi". However, other cases will still | |
175 not match, for example if the query is "wifi" and the | |
176 document is "wi fi" or if the query is "wi-fi" and the | |
177 document is "wifi". | |
178 --> | |
179 <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> | |
180 <analyzer type="index"> | |
181 <tokenizer class="solr.WhitespaceTokenizerFactory"/> | |
182 <!-- in this example, we will only use synonyms at query time | |
183 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> | |
184 --> | |
185 <!-- Case insensitive stop word removal. | |
186 add enablePositionIncrements=true in both the index and query | |
187 analyzers to leave a 'gap' for more accurate phrase queries. | |
188 --> | |
189 <filter class="solr.StopFilterFactory" | |
190 ignoreCase="true" | |
191 words="stopwords.txt" | |
192 enablePositionIncrements="true" | |
193 /> | |
194 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> | |
195 <filter class="solr.LowerCaseFilterFactory"/> | |
196 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> | |
197 <filter class="solr.PorterStemFilterFactory"/> | |
198 </analyzer> | |
199 <analyzer type="query"> | |
200 <tokenizer class="solr.WhitespaceTokenizerFactory"/> | |
201 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> | |
202 <filter class="solr.StopFilterFactory" | |
203 ignoreCase="true" | |
204 words="stopwords.txt" | |
205 enablePositionIncrements="true" | |
206 /> | |
207 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> | |
208 <filter class="solr.LowerCaseFilterFactory"/> | |
209 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> | |
210 <filter class="solr.PorterStemFilterFactory"/> | |
211 </analyzer> | |
212 </fieldType> | |
213 | |
214 <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names, | |
215 but may be good for SKUs. Can insert dashes in the wrong place and still match. --> | |
216 <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> | |
217 <analyzer> | |
218 <tokenizer class="solr.WhitespaceTokenizerFactory"/> | |
219 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> | |
220 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> | |
221 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> | |
222 <filter class="solr.LowerCaseFilterFactory"/> | |
223 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> | |
224 <filter class="solr.EnglishMinimalStemFilterFactory"/> | |
225 <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes | |
226 possible with WordDelimiterFilter in conjunction with stemming. --> | |
227 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> | |
228 </analyzer> | |
229 </fieldType> | |
230 | |
231 <!-- Just like text_general except it reverses the characters of | |
232 each token, to enable more efficient leading wildcard queries. --> | |
233 <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> | |
234 <analyzer type="index"> | |
235 <tokenizer class="solr.StandardTokenizerFactory"/> | |
236 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> | |
237 <filter class="solr.LowerCaseFilterFactory"/> | |
238 <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" | |
239 maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> | |
240 </analyzer> | |
241 <analyzer type="query"> | |
242 <tokenizer class="solr.StandardTokenizerFactory"/> | |
243 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> | |
244 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> | |
245 <filter class="solr.LowerCaseFilterFactory"/> | |
246 </analyzer> | |
247 </fieldType> | |
248 | |
249 <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > | |
250 <analyzer> | |
251 <tokenizer class="solr.StandardTokenizerFactory"/> | |
252 <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> | |
253 </analyzer> | |
254 </fieldtype> | |
255 | |
256 <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" > | |
257 <analyzer> | |
258 <tokenizer class="solr.WhitespaceTokenizerFactory"/> | |
259 <!-- | |
260 The DelimitedPayloadTokenFilter can put payloads on tokens... for example, | |
261 a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f | |
262 Attributes of the DelimitedPayloadTokenFilterFactory : | |
263 "delimiter" - a one character delimiter. Default is | (pipe) | |
264 "encoder" - how to encode the following value into a payload | |
265 float -> org.apache.lucene.analysis.payloads.FloatEncoder, | |
266 integer -> o.a.l.a.p.IntegerEncoder | |
267 identity -> o.a.l.a.p.IdentityEncoder | |
268 Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. | |
269 --> | |
270 <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> | |
271 </analyzer> | |
272 </fieldtype> | |
273 | |
274 <!-- lowercases the entire field value, keeping it as a single token. --> | |
275 <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> | |
276 <analyzer> | |
277 <tokenizer class="solr.KeywordTokenizerFactory"/> | |
278 <filter class="solr.LowerCaseFilterFactory" /> | |
279 </analyzer> | |
280 </fieldType> | |
281 | |
282 <fieldType name="url" class="solr.TextField" positionIncrementGap="100"> | |
283 <analyzer> | |
284 <tokenizer class="solr.StandardTokenizerFactory"/> | |
285 <filter class="solr.LowerCaseFilterFactory"/> | |
286 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/> | |
287 </analyzer> | |
288 </fieldType> | |
289 | |
290 | |
291 <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> | |
292 <analyzer> | |
293 <tokenizer class="solr.PathHierarchyTokenizerFactory"/> | |
294 </analyzer> | |
295 </fieldType> | |
296 | |
297 <!-- since fields of this type are by default not stored or indexed, | |
298 any data added to them will be ignored outright. --> | |
299 <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> | |
300 | |
301 </types> | |
302 | |
303 <fields> | |
304 <field name="id" type="string" stored="true" indexed="true"/> | |
305 | |
306 <!-- core fields --> | |
307 <field name="segment" type="string" stored="true" indexed="false"/> | |
308 <field name="digest" type="string" stored="true" indexed="false"/> | |
309 <field name="boost" type="float" stored="true" indexed="false"/> | |
310 | |
311 <!-- fields for index-basic plugin --> | |
312 <field name="host" type="url" stored="false" indexed="true"/> | |
313 <field name="url" type="url" stored="true" indexed="true" required="true"/> | |
314 <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> | |
315 <field name="content" type="text_general" stored="true" indexed="true"/> | |
316 <field name="title" type="text_general" stored="true" indexed="true"/> | |
317 <field name="cache" type="string" stored="true" indexed="false"/> | |
318 <field name="tstamp" type="date" stored="true" indexed="false"/> | |
319 | |
320 <!-- catch-all field --> | |
321 <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/> | |
322 | |
323 <!-- fields for index-anchor plugin --> | |
324 <field name="anchor" type="text_general" stored="true" indexed="true" | |
325 multiValued="true"/> | |
326 | |
327 <!-- fields for index-more plugin --> | |
328 <field name="type" type="string" stored="true" indexed="true" multiValued="true"/> | |
329 <field name="contentLength" type="string" stored="true" indexed="false"/> | |
330 <field name="lastModified" type="date" stored="true" indexed="false"/> | |
331 <field name="date" type="tdate" stored="true" indexed="true"/> | |
332 | |
333 <!-- fields for languageidentifier plugin --> | |
334 <field name="lang" type="string" stored="true" indexed="true"/> | |
335 | |
336 <!-- fields for subcollection plugin --> | |
337 <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/> | |
338 | |
339 <!-- fields for feed plugin (tag is also used by microformats-reltag)--> | |
340 <field name="author" type="string" stored="true" indexed="true"/> | |
341 <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> | |
342 <field name="feed" type="string" stored="true" indexed="true"/> | |
343 <field name="publishedDate" type="date" stored="true" indexed="true"/> | |
344 <field name="updatedDate" type="date" stored="true" indexed="true"/> | |
345 | |
346 <!-- fields for creativecommons plugin --> | |
347 <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/> | |
348 </fields> | |
349 <uniqueKey>id</uniqueKey> | |
350 <defaultSearchField>text</defaultSearchField> | |
351 <solrQueryParser defaultOperator="OR"/> | |
352 | |
353 <!-- copyField commands copy one field to another at the time a document | |
354 is added to the index. It's used either to index the same field differently, | |
355 or to add multiple fields to the same field for easier/faster searching. --> | |
356 | |
357 <copyField source="content" dest="text"/> | |
358 <copyField source="url" dest="text"/> | |
359 <copyField source="title" dest="text"/> | |
360 <copyField source="anchor" dest="text"/> | |
361 <copyField source="author" dest="text"/> | |
362 | |
363 </schema> |