Mercurial > hg > nutch-mpiwg-plugins
diff conf/schema-solr4.xml @ 0:3b37d71af924 default tip
iniitial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/conf/schema-solr4.xml Tue Feb 26 15:50:30 2013 +0100 @@ -0,0 +1,363 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + Description: This document contains Solr 4.x schema definition to + be used with Solr integration currently build into Nutch. + This schema is not minimal, there are some useful field type definitions left, + and the set of fields and their flags (indexed/stored/term vectors) can be + further optimized depending on needs. See + http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup + for more info. +--> + +<schema name="nutch" version="1.5"> + + <types> + + <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> + <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> + + + <!-- + Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. + --> + <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + + <!-- + Numeric field types that index each value at various levels of precision + to accelerate range queries when the number of values between the range + endpoints is large. See the javadoc for NumericRangeQuery for internal + implementation details. + + Smaller precisionStep values (specified in bits) will lead to more tokens + indexed per value, slightly larger index size, and faster range queries. + A precisionStep of 0 disables indexing at different precision levels. + --> + <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + + <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and + is a more restricted form of the canonical representation of dateTime + http://www.w3.org/TR/xmlschema-2/#dateTime + The trailing "Z" designates UTC time and is mandatory. + Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z + All other components are mandatory. + + Expressions can also be used to denote calculations that should be + performed relative to "NOW" to determine the value, ie... + + NOW/HOUR + ... Round to the start of the current hour + NOW-1DAY + ... Exactly 1 day prior to now + NOW/DAY+6MONTHS+3DAYS + ... 6 months and 3 days in the future from the start of + the current day + + Consult the DateField javadocs for more information. + + Note: For faster range queries, consider the tdate type + --> + <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> + + <!-- A Trie based date field for faster date range queries and date faceting. --> + <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/> + + + <!-- solr.TextField allows the specification of custom text analyzers + specified as a tokenizer and a list of token filters. Different + analyzers may be specified for indexing and querying. + + The optional positionIncrementGap puts space between multiple fields of + this type on the same document, with the purpose of preventing false phrase + matching across fields. + + For more info on customizing your analyzer chain, please see + http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters + --> + + <!-- A general text field that has reasonable, generic + cross-language defaults: it tokenizes with StandardTokenizer, + removes stop words from case-insensitive "stopwords.txt" + (empty by default), and down cases. At query time only, it + also applies synonyms. --> + <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English: it + tokenizes with StandardTokenizer, removes English stop words + (stopwords.txt), down cases, protects words from protwords.txt, and + finally applies Porter's stemming. The query time analyzer + also applies synonyms from synonyms.txt. --> + <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. + add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English, plus + aggressive word-splitting and autophrase features enabled. + This field is just like text_en, except it adds + WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and + non-alphanumeric chars. This means certain compound word + cases will work, for example query "wi fi" will match + document "WiFi" or "wi-fi". However, other cases will still + not match, for example if the query is "wifi" and the + document is "wi fi" or if the query is "wi-fi" and the + document is "wifi". + --> + <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. + add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Less flexible matching, but less false matches. Probably not ideal for product names, + but may be good for SKUs. Can insert dashes in the wrong place and still match. --> + <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.EnglishMinimalStemFilterFactory"/> + <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes + possible with WordDelimiterFilter in conjuncton with stemming. --> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Just like text_general except it reverses the characters of + each token, to enable more efficient leading wildcard queries. --> + <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" + maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> + </analyzer> + </fieldtype> + + <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- + The DelimitedPayloadTokenFilter can put payloads on tokens... for example, + a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f + Attributes of the DelimitedPayloadTokenFilterFactory : + "delimiter" - a one character delimiter. Default is | (pipe) + "encoder" - how to encode the following value into a playload + float -> org.apache.lucene.analysis.payloads.FloatEncoder, + integer -> o.a.l.a.p.IntegerEncoder + identity -> o.a.l.a.p.IdentityEncoder + Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. + --> + <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> + </analyzer> + </fieldtype> + + <!-- lowercases the entire field value, keeping it as a single token. --> + <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory" /> + </analyzer> + </fieldType> + + <fieldType name="url" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/> + </analyzer> + </fieldType> + + + <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.PathHierarchyTokenizerFactory"/> + </analyzer> + </fieldType> + + <!-- since fields of this type are by default not stored or indexed, + any data added to them will be ignored outright. --> + <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> + + </types> + + <fields> + <field name="id" type="string" stored="true" indexed="true"/> + + <!-- core fields --> + <field name="segment" type="string" stored="true" indexed="false"/> + <field name="digest" type="string" stored="true" indexed="false"/> + <field name="boost" type="float" stored="true" indexed="false"/> + + <!-- fields for index-basic plugin --> + <field name="host" type="url" stored="false" indexed="true"/> + <field name="url" type="url" stored="true" indexed="true" required="true"/> + <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> + <field name="content" type="text_general" stored="true" indexed="true"/> + <field name="title" type="text_general" stored="true" indexed="true"/> + <field name="cache" type="string" stored="true" indexed="false"/> + <field name="tstamp" type="date" stored="true" indexed="false"/> + + <!-- catch-all field --> + <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/> + + <!-- fields for index-anchor plugin --> + <field name="anchor" type="text_general" stored="true" indexed="true" + multiValued="true"/> + + <!-- fields for index-more plugin --> + <field name="type" type="string" stored="true" indexed="true" multiValued="true"/> + <field name="contentLength" type="string" stored="true" indexed="false"/> + <field name="lastModified" type="date" stored="true" indexed="false"/> + <field name="date" type="tdate" stored="true" indexed="true"/> + + <!-- fields for languageidentifier plugin --> + <field name="lang" type="string" stored="true" indexed="true"/> + + <!-- fields for subcollection plugin --> + <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/> + + <!-- fields for feed plugin (tag is also used by microformats-reltag)--> + <field name="author" type="string" stored="true" indexed="true"/> + <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> + <field name="feed" type="string" stored="true" indexed="true"/> + <field name="publishedDate" type="date" stored="true" indexed="true"/> + <field name="updatedDate" type="date" stored="true" indexed="true"/> + + <!-- fields for creativecommons plugin --> + <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/> + </fields> + <uniqueKey>id</uniqueKey> + <defaultSearchField>text</defaultSearchField> + <solrQueryParser defaultOperator="OR"/> + + <!-- copyField commands copy one field to another at the time a document + is added to the index. It's used either to index the same field differently, + or to add multiple fields to the same field for easier/faster searching. --> + + <copyField source="content" dest="text"/> + <copyField source="url" dest="text"/> + <copyField source="title" dest="text"/> + <copyField source="anchor" dest="text"/> + <copyField source="author" dest="text"/> + +</schema>