comparison conf/schema-solr4.xml @ 0:3b37d71af924 default tip

iniitial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3b37d71af924
1 <?xml version="1.0" encoding="UTF-8" ?>
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one or more
4 contributor license agreements. See the NOTICE file distributed with
5 this work for additional information regarding copyright ownership.
6 The ASF licenses this file to You under the Apache License, Version 2.0
7 (the "License"); you may not use this file except in compliance with
8 the License. You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 -->
18 <!--
19 Description: This document contains the Solr 4.x schema definition to
20 be used with the Solr integration currently built into Nutch.
21 This schema is not minimal, there are some useful field type definitions left,
22 and the set of fields and their flags (indexed/stored/term vectors) can be
23 further optimized depending on needs. See
24 http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
25 for more info.
26 -->
27
28 <schema name="nutch" version="1.5">
29
30 <types>
31
32 <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
33 <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
34
35
36 <!--
37 Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
38 -->
39 <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
40 <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
41 <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
42 <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
43
44 <!--
45 Numeric field types that index each value at various levels of precision
46 to accelerate range queries when the number of values between the range
47 endpoints is large. See the javadoc for NumericRangeQuery for internal
48 implementation details.
49
50 Smaller precisionStep values (specified in bits) will lead to more tokens
51 indexed per value, slightly larger index size, and faster range queries.
52 A precisionStep of 0 disables indexing at different precision levels.
53 -->
54 <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
55 <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
56 <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
57 <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
58
59 <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
60 is a more restricted form of the canonical representation of dateTime
61 http://www.w3.org/TR/xmlschema-2/#dateTime
62 The trailing "Z" designates UTC time and is mandatory.
63 Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
64 All other components are mandatory.
65
66 Expressions can also be used to denote calculations that should be
67 performed relative to "NOW" to determine the value, ie...
68
69 NOW/HOUR
70 ... Round to the start of the current hour
71 NOW-1DAY
72 ... Exactly 1 day prior to now
73 NOW/DAY+6MONTHS+3DAYS
74 ... 6 months and 3 days in the future from the start of
75 the current day
76
77 Consult the DateField javadocs for more information.
78
79 Note: For faster range queries, consider the tdate type
80 -->
81 <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
82
83 <!-- A Trie based date field for faster date range queries and date faceting. -->
84 <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
85
86
87 <!-- solr.TextField allows the specification of custom text analyzers
88 specified as a tokenizer and a list of token filters. Different
89 analyzers may be specified for indexing and querying.
90
91 The optional positionIncrementGap puts space between multiple fields of
92 this type on the same document, with the purpose of preventing false phrase
93 matching across fields.
94
95 For more info on customizing your analyzer chain, please see
96 http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
97 -->
98
99 <!-- A general text field that has reasonable, generic
100 cross-language defaults: it tokenizes with StandardTokenizer,
101 removes stop words from case-insensitive "stopwords.txt"
102 (empty by default), and down cases. At query time only, it
103 also applies synonyms. -->
104 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
105 <analyzer type="index">
106 <tokenizer class="solr.StandardTokenizerFactory"/>
107 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
108 <!-- in this example, we will only use synonyms at query time
109 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
110 -->
111 <filter class="solr.LowerCaseFilterFactory"/>
112 </analyzer>
113 <analyzer type="query">
114 <tokenizer class="solr.StandardTokenizerFactory"/>
115 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
116 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
117 <filter class="solr.LowerCaseFilterFactory"/>
118 </analyzer>
119 </fieldType>
120
121 <!-- A text field with defaults appropriate for English: it
122 tokenizes with StandardTokenizer, removes English stop words
123 (stopwords.txt), down cases, protects words from protwords.txt, and
124 finally applies Porter's stemming. The query time analyzer
125 also applies synonyms from synonyms.txt. -->
126 <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
127 <analyzer type="index">
128 <tokenizer class="solr.StandardTokenizerFactory"/>
129 <!-- in this example, we will only use synonyms at query time
130 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
131 -->
132 <!-- Case insensitive stop word removal.
133 add enablePositionIncrements=true in both the index and query
134 analyzers to leave a 'gap' for more accurate phrase queries.
135 -->
136 <filter class="solr.StopFilterFactory"
137 ignoreCase="true"
138 words="stopwords.txt"
139 enablePositionIncrements="true"
140 />
141 <filter class="solr.LowerCaseFilterFactory"/>
142 <filter class="solr.EnglishPossessiveFilterFactory"/>
143 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
144 <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
145 <filter class="solr.EnglishMinimalStemFilterFactory"/>
146 -->
147 <filter class="solr.PorterStemFilterFactory"/>
148 </analyzer>
149 <analyzer type="query">
150 <tokenizer class="solr.StandardTokenizerFactory"/>
151 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
152 <filter class="solr.StopFilterFactory"
153 ignoreCase="true"
154 words="stopwords.txt"
155 enablePositionIncrements="true"
156 />
157 <filter class="solr.LowerCaseFilterFactory"/>
158 <filter class="solr.EnglishPossessiveFilterFactory"/>
159 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
160 <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
161 <filter class="solr.EnglishMinimalStemFilterFactory"/>
162 -->
163 <filter class="solr.PorterStemFilterFactory"/>
164 </analyzer>
165 </fieldType>
166
167 <!-- A text field with defaults appropriate for English, plus
168 aggressive word-splitting and autophrase features enabled.
169 This field is just like text_en, except it adds
170 WordDelimiterFilter to enable splitting and matching of
171 words on case-change, alpha numeric boundaries, and
172 non-alphanumeric chars. This means certain compound word
173 cases will work, for example query "wi fi" will match
174 document "WiFi" or "wi-fi". However, other cases will still
175 not match, for example if the query is "wifi" and the
176 document is "wi fi" or if the query is "wi-fi" and the
177 document is "wifi".
178 -->
179 <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
180 <analyzer type="index">
181 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
182 <!-- in this example, we will only use synonyms at query time
183 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
184 -->
185 <!-- Case insensitive stop word removal.
186 add enablePositionIncrements=true in both the index and query
187 analyzers to leave a 'gap' for more accurate phrase queries.
188 -->
189 <filter class="solr.StopFilterFactory"
190 ignoreCase="true"
191 words="stopwords.txt"
192 enablePositionIncrements="true"
193 />
194 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
195 <filter class="solr.LowerCaseFilterFactory"/>
196 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
197 <filter class="solr.PorterStemFilterFactory"/>
198 </analyzer>
199 <analyzer type="query">
200 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
201 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
202 <filter class="solr.StopFilterFactory"
203 ignoreCase="true"
204 words="stopwords.txt"
205 enablePositionIncrements="true"
206 />
207 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
208 <filter class="solr.LowerCaseFilterFactory"/>
209 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
210 <filter class="solr.PorterStemFilterFactory"/>
211 </analyzer>
212 </fieldType>
213
214 <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
215 but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
216 <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
217 <analyzer>
218 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
219 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
220 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
221 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
222 <filter class="solr.LowerCaseFilterFactory"/>
223 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
224 <filter class="solr.EnglishMinimalStemFilterFactory"/>
225 <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
226 possible with WordDelimiterFilter in conjunction with stemming. -->
227 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
228 </analyzer>
229 </fieldType>
230
231 <!-- Just like text_general except it reverses the characters of
232 each token, to enable more efficient leading wildcard queries. -->
233 <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
234 <analyzer type="index">
235 <tokenizer class="solr.StandardTokenizerFactory"/>
236 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
237 <filter class="solr.LowerCaseFilterFactory"/>
238 <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
239 maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
240 </analyzer>
241 <analyzer type="query">
242 <tokenizer class="solr.StandardTokenizerFactory"/>
243 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
244 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
245 <filter class="solr.LowerCaseFilterFactory"/>
246 </analyzer>
247 </fieldType>
248 <!-- Phonetic matching: StandardTokenizer followed by DoubleMetaphoneFilter (inject="false"); indexed, not stored. -->
249 <fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField" >
250 <analyzer>
251 <tokenizer class="solr.StandardTokenizerFactory"/>
252 <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
253 </analyzer>
254 </fieldType>
255
256 <fieldType name="payloads" stored="false" indexed="true" class="solr.TextField" >
257 <analyzer>
258 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
259 <!--
260 The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
261 a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
262 Attributes of the DelimitedPayloadTokenFilterFactory :
263 "delimiter" - a one character delimiter. Default is | (pipe)
264 "encoder" - how to encode the following value into a payload
265 float -> org.apache.lucene.analysis.payloads.FloatEncoder,
266 integer -> o.a.l.a.p.IntegerEncoder
267 identity -> o.a.l.a.p.IdentityEncoder
268 or the fully qualified name of a class implementing PayloadEncoder; the encoder must have a no-arg constructor.
269 -->
270 <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
271 </analyzer>
272 </fieldType>
273
274 <!-- lowercases the entire field value, keeping it as a single token. -->
275 <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
276 <analyzer>
277 <tokenizer class="solr.KeywordTokenizerFactory"/>
278 <filter class="solr.LowerCaseFilterFactory" />
279 </analyzer>
280 </fieldType>
281
282 <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
283 <analyzer>
284 <tokenizer class="solr.StandardTokenizerFactory"/>
285 <filter class="solr.LowerCaseFilterFactory"/>
286 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
287 </analyzer>
288 </fieldType>
289
290
291 <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
292 <analyzer>
293 <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
294 </analyzer>
295 </fieldType>
296
297 <!-- Fields of this type are neither stored nor indexed by default,
298 so any data added to them is ignored outright. -->
299 <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
300
301 </types>
302
303 <fields>
304 <field name="id" type="string" stored="true" indexed="true"/>
305 <field name="_version_" type="long" stored="true" indexed="true"/> <!-- needed by Solr 4 when solrconfig.xml enables the update log (also SolrCloud / real-time get) -->
306 <!-- core fields -->
307 <field name="segment" type="string" stored="true" indexed="false"/>
308 <field name="digest" type="string" stored="true" indexed="false"/>
309 <field name="boost" type="float" stored="true" indexed="false"/>
310
311 <!-- fields for index-basic plugin -->
312 <field name="host" type="url" stored="false" indexed="true"/>
313 <field name="url" type="url" stored="true" indexed="true" required="true"/>
314 <!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
315 <field name="content" type="text_general" stored="true" indexed="true"/>
316 <field name="title" type="text_general" stored="true" indexed="true"/>
317 <field name="cache" type="string" stored="true" indexed="false"/>
318 <field name="tstamp" type="date" stored="true" indexed="false"/>
319
320 <!-- catch-all field -->
321 <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/>
322
323 <!-- fields for index-anchor plugin -->
324 <field name="anchor" type="text_general" stored="true" indexed="true"
325 multiValued="true"/>
326
327 <!-- fields for index-more plugin -->
328 <field name="type" type="string" stored="true" indexed="true" multiValued="true"/>
329 <field name="contentLength" type="string" stored="true" indexed="false"/>
330 <field name="lastModified" type="date" stored="true" indexed="false"/>
331 <field name="date" type="tdate" stored="true" indexed="true"/>
332
333 <!-- fields for languageidentifier plugin -->
334 <field name="lang" type="string" stored="true" indexed="true"/>
335
336 <!-- fields for subcollection plugin -->
337 <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/>
338
339 <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
340 <field name="author" type="string" stored="true" indexed="true"/>
341 <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
342 <field name="feed" type="string" stored="true" indexed="true"/>
343 <field name="publishedDate" type="date" stored="true" indexed="true"/>
344 <field name="updatedDate" type="date" stored="true" indexed="true"/>
345
346 <!-- fields for creativecommons plugin -->
347 <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
348 </fields>
349 <uniqueKey>id</uniqueKey>
350 <defaultSearchField>text</defaultSearchField>
351 <solrQueryParser defaultOperator="OR"/>
352
353 <!-- copyField commands copy one field to another at the time a document
354 is added to the index. It's used either to index the same field differently,
355 or to add multiple fields to the same field for easier/faster searching. -->
356
357 <copyField source="content" dest="text"/>
358 <copyField source="url" dest="text"/>
359 <copyField source="title" dest="text"/>
360 <copyField source="anchor" dest="text"/>
361 <copyField source="author" dest="text"/>
362
363 </schema>