<?xml version="1.0" encoding="UTF-8"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or
 more contributor license agreements. See the NOTICE file
 distributed with this work for additional information regarding
 copyright ownership. The ASF licenses this file to You under the
 Apache License, Version 2.0 (the "License"); you may not use
 this file except in compliance with the License. You may obtain
 a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0 Unless required by
 applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT
 WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions
 and limitations under the License.
-->
<!--
 Description: This document contains the Solr 3.1 schema definition
 to be used with the Solr integration currently built into Nutch. See
 https://issues.apache.org/jira/browse/NUTCH-442
 https://issues.apache.org/jira/browse/NUTCH-699
 https://issues.apache.org/jira/browse/NUTCH-994
 https://issues.apache.org/jira/browse/NUTCH-997
 https://issues.apache.org/jira/browse/NUTCH-1058
 https://issues.apache.org/jira/browse/NUTCH-1232
 and
 http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?view=markup
 for more info.
-->
<schema name="nutch" version="1.5">
  <!-- Field types referenced by the "type" attribute of the fields below. -->
  <types>
    <!-- Primitive (non-analyzed) types. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true"
               omitNorms="true"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>

    <!--
      General text. One analyzer is declared, so the same chain is used
      at index and query time: whitespace tokenization, stop-word removal
      (stopwords.txt), word-delimiter splitting/catenation, lowercasing,
      English stemming and removal of duplicate tokens.
    -->
    <fieldType name="text" class="solr.TextField"
               positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <!--
          Replaces the deprecated solr.EnglishPorterFilterFactory with its
          documented successor; terms listed in protwords.txt are still
          protected from stemming.
        -->
        <filter class="solr.SnowballPorterFilterFactory"
                language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- URLs: standard tokenization, lowercasing, then splitting into
         word and number parts. -->
    <fieldType name="url" class="solr.TextField"
               positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"/>
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <!-- document key (declared as uniqueKey below) -->
    <field name="id" type="string" stored="true" indexed="true"/>

    <!-- core fields -->
    <field name="segment" type="string" stored="true" indexed="false"/>
    <field name="digest" type="string" stored="true" indexed="false"/>
    <field name="boost" type="float" stored="true" indexed="false"/>

    <!-- fields for index-basic plugin -->
    <field name="host" type="string" stored="false" indexed="true"/>
    <field name="url" type="url" stored="true" indexed="true"
           required="true"/>
    <field name="content" type="text" stored="false" indexed="true"/>
    <field name="title" type="text" stored="true" indexed="true"/>
    <field name="cache" type="string" stored="true" indexed="false"/>
    <field name="tstamp" type="date" stored="true" indexed="false"/>

    <!-- fields for index-anchor plugin -->
    <field name="anchor" type="string" stored="true" indexed="true"
           multiValued="true"/>

    <!-- fields for index-more plugin -->
    <field name="type" type="string" stored="true" indexed="true"
           multiValued="true"/>
    <field name="contentLength" type="long" stored="true"
           indexed="false"/>
    <field name="lastModified" type="date" stored="true"
           indexed="false"/>
    <field name="date" type="date" stored="true" indexed="true"/>

    <!-- fields for languageidentifier plugin -->
    <field name="lang" type="string" stored="true" indexed="true"/>

    <!-- fields for subcollection plugin -->
    <field name="subcollection" type="string" stored="true"
           indexed="true" multiValued="true"/>

    <!-- fields for feed plugin (tag is also used by microformats-reltag) -->
    <field name="author" type="string" stored="true" indexed="true"/>
    <field name="tag" type="string" stored="true" indexed="true"
           multiValued="true"/>
    <field name="feed" type="string" stored="true" indexed="true"/>
    <field name="publishedDate" type="date" stored="true"
           indexed="true"/>
    <field name="updatedDate" type="date" stored="true"
           indexed="true"/>

    <!-- fields for creativecommons plugin -->
    <field name="cc" type="string" stored="true" indexed="true"
           multiValued="true"/>
  </fields>

  <!-- "id" identifies a document; queries that name no field search
       "content", and query terms are combined with OR by default. -->
  <uniqueKey>id</uniqueKey>
  <defaultSearchField>content</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>
</schema>