nutch-mpiwg-plugins: conf/schema-solr4.xml comparison

comparison conf/schema-solr4.xml @ 0:3b37d71af924 default tip

iniitial

author	dwinter
date	Tue, 26 Feb 2013 15:50:30 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:3b37d71af924
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<!--
+Description: This document contains Solr 4.x schema definition to
+be used with the Solr integration currently built into Nutch.
+This schema is not minimal, there are some useful field type definitions left,
+and the set of fields and their flags (indexed/stored/term vectors) can be
+further optimized depending on needs.  See
+http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+for more info.
+-->
+<schema name="nutch" version="1.5">
+<types>
+<!-- "string": backed by solr.StrField; values are not analyzed, but indexed/stored verbatim. -->
+<fieldType name="string"
+           class="solr.StrField"
+           omitNorms="true"
+           sortMissingLast="true"/>
+<!--
+Default numeric field types, all declared with precisionStep="0".
+When faster range queries are needed, consider the tint/tfloat/tlong/tdouble types instead.
+-->
+<fieldType name="int"    class="solr.TrieIntField"    precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="float"  class="solr.TrieFloatField"  precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="long"   class="solr.TrieLongField"   precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+<!--
+Numeric field types that index every value at several levels of precision.
+This accelerates range queries when the number of values between the range
+endpoints is large; the javadoc for NumericRangeQuery describes the internal
+implementation.
+precisionStep is given in bits: smaller values produce more tokens per value,
+a slightly larger index, and faster range queries. A precisionStep of 0
+disables indexing at different precision levels.
+-->
+<fieldType name="tint"    class="solr.TrieIntField"    precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="tfloat"  class="solr.TrieFloatField"  precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="tlong"   class="solr.TrieLongField"   precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+is a more restricted form of the canonical representation of dateTime
+http://www.w3.org/TR/xmlschema-2/#dateTime
+The trailing "Z" designates UTC time and is mandatory.
+Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+All other components are mandatory.
+Expressions can also be used to denote calculations that should be
+performed relative to "NOW" to determine the value, ie...
+NOW/HOUR
+... Round to the start of the current hour
+NOW-1DAY
+... Exactly 1 day prior to now
+NOW/DAY+6MONTHS+3DAYS
+... 6 months and 3 days in the future from the start of
+the current day
+Consult the DateField javadocs for more information.
+Note: For faster range queries, consider the tdate type
+-->
+<fieldType name="date" class="solr.TrieDateField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+<!-- "tdate": the same Trie based date class, declared with precisionStep="6" for faster date range queries and date faceting. -->
+<fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" omitNorms="true" positionIncrementGap="0"/>
+<!-- solr.TextField allows the specification of custom text analyzers
+specified as a tokenizer and a list of token filters. Different
+analyzers may be specified for indexing and querying.
+The optional positionIncrementGap puts space between multiple fields of
+this type on the same document, with the purpose of preventing false phrase
+matching across fields.
+For more info on customizing your analyzer chain, please see
+http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+-->
+<!-- A general text field that has reasonable, generic
+cross-language defaults: it tokenizes with StandardTokenizer,
+	 removes stop words from case-insensitive "stopwords.txt"
+	 (empty by default), and down cases.  At query time only, it
+	 also applies synonyms. -->
+<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+  <!-- Index-time chain: standard tokenizer, stop word removal, lowercasing. -->
+  <analyzer type="index">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" enablePositionIncrements="true"/>
+    <!-- Synonyms are only used at query time in this example; the index-time variant is kept here, disabled:
+    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+    -->
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+  <!-- Query-time chain: same as above, with synonym expansion between stop word removal and lowercasing. -->
+  <analyzer type="query">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" enablePositionIncrements="true"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- A text field with defaults appropriate for English: it
+tokenizes with StandardTokenizer, removes English stop words
+(stopwords.txt), down cases, protects words from protwords.txt, and
+finally applies Porter's stemming.  The query time analyzer
+also applies synonyms from synonyms.txt. -->
+<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+  <analyzer type="index">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <!-- Synonyms are only used at query time in this example; the index-time variant is kept here, disabled:
+    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+    -->
+    <!-- Case insensitive stop word removal.
+         enablePositionIncrements=true is set in both the index and query
+         analyzers to leave a 'gap' for more accurate phrase queries. -->
+    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- A less aggressive stemmer can optionally be used instead of PorterStemFilterFactory:
+    <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+    <filter class="solr.PorterStemFilterFactory"/>
+  </analyzer>
+  <!-- The query chain mirrors the index chain, with synonym expansion added right after the tokenizer. -->
+  <analyzer type="query">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- A less aggressive stemmer can optionally be used instead of PorterStemFilterFactory:
+    <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+    <filter class="solr.PorterStemFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- A text field with defaults appropriate for English, plus
+	 aggressive word-splitting and autophrase features enabled.
+	 This field is just like text_en, except it adds
+	 WordDelimiterFilter to enable splitting and matching of
+	 words on case-change, alpha numeric boundaries, and
+	 non-alphanumeric chars.  This means certain compound word
+	 cases will work, for example query "wi fi" will match
+	 document "WiFi" or "wi-fi".  However, other cases will still
+	 not match, for example if the query is "wifi" and the
+	 document is "wi fi" or if the query is "wi-fi" and the
+	 document is "wifi".
+-->
+<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+  <analyzer type="index">
+    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+    <!-- Synonyms are only used at query time in this example; the index-time variant is kept here, disabled:
+    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+    -->
+    <!-- Case insensitive stop word removal.
+         enablePositionIncrements=true is set in both the index and query
+         analyzers to leave a 'gap' for more accurate phrase queries. -->
+    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+    <!-- Index side: catenateWords and catenateNumbers are both switched on. -->
+    <filter class="solr.WordDelimiterFilterFactory"
+            generateWordParts="1"
+            generateNumberParts="1"
+            catenateWords="1"
+            catenateNumbers="1"
+            catenateAll="0"
+            splitOnCaseChange="1"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <filter class="solr.PorterStemFilterFactory"/>
+  </analyzer>
+  <analyzer type="query">
+    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+    <!-- Query side: catenateWords and catenateNumbers are both switched off. -->
+    <filter class="solr.WordDelimiterFilterFactory"
+            generateWordParts="1"
+            generateNumberParts="1"
+            catenateWords="0"
+            catenateNumbers="0"
+            catenateAll="0"
+            splitOnCaseChange="1"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <filter class="solr.PorterStemFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- Less flexible matching, but less false matches.  Probably not ideal for product names,
+but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
+<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+  <!-- A single analyzer with no type attribute is declared for this field type. -->
+  <analyzer>
+    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+    <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+    <filter class="solr.WordDelimiterFilterFactory"
+            generateWordParts="0"
+            generateNumberParts="0"
+            catenateWords="1"
+            catenateNumbers="1"
+            catenateAll="0"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    <!-- This filter can remove any duplicate tokens that appear at the same position,
+         which is sometimes possible with WordDelimiterFilter in conjunction with stemming. -->
+    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- Just like text_general except it reverses the characters of
+	 each token, to enable more efficient leading wildcard queries. -->
+<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+  <!-- Only the index-time chain contains the ReversedWildcardFilterFactory step. -->
+  <analyzer type="index">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" enablePositionIncrements="true"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.ReversedWildcardFilterFactory"
+            withOriginal="true"
+            maxPosAsterisk="3"
+            maxPosQuestion="2"
+            maxFractionAsterisk="0.33"/>
+  </analyzer>
+  <analyzer type="query">
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+    <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true" enablePositionIncrements="true"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- "phonetic": an indexed, non-stored text type whose single analyzer runs
+     StandardTokenizer followed by DoubleMetaphoneFilterFactory (inject="false"). -->
+<fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField">
+  <analyzer>
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+  </analyzer>
+</fieldType>
+<!-- "payloads": an indexed, non-stored text type that splits on whitespace and
+     then runs DelimitedPayloadTokenFilterFactory with the float encoder. -->
+<fieldType name="payloads" stored="false" indexed="true" class="solr.TextField">
+  <analyzer>
+    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+    <!--
+    The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+    a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
+    Attributes of the DelimitedPayloadTokenFilterFactory :
+      "delimiter" - a one character delimiter. Default is | (pipe)
+      "encoder" - how to encode the following value into a payload
+         float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+         integer -> o.a.l.a.p.IntegerEncoder
+         identity -> o.a.l.a.p.IdentityEncoder
+         Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+    -->
+    <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+  </analyzer>
+</fieldType>
+<!-- lowercases the entire field value, keeping it as a single token.  -->
+<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+  <!-- KeywordTokenizer followed by LowerCaseFilter. -->
+  <analyzer>
+    <tokenizer class="solr.KeywordTokenizerFactory"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+</fieldType>
+<!-- "url": standard tokenizer, then lowercasing, then WordDelimiterFilter generating word and number parts.
+     Used by the "host" and "url" fields. -->
+<fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+  <analyzer>
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.WordDelimiterFilterFactory"
+            generateWordParts="1"
+            generateNumberParts="1"/>
+  </analyzer>
+</fieldType>
+<!-- "text_path": text type whose only analysis step is PathHierarchyTokenizerFactory. -->
+<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+  <analyzer>
+    <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+  </analyzer>
+</fieldType>
+<!-- Since fields of this type are by default neither stored nor indexed,
+     any data added to them will be ignored outright. -->
+<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField"/>
+</types>
+<fields>
+  <!-- document identifier; referenced by the uniqueKey element -->
+  <field name="id" type="string" indexed="true" stored="true"/>
+
+  <!-- core fields -->
+  <field name="segment" type="string" indexed="false" stored="true"/>
+  <field name="digest" type="string" indexed="false" stored="true"/>
+  <field name="boost" type="float" indexed="false" stored="true"/>
+
+  <!-- fields for index-basic plugin -->
+  <field name="host" type="url" indexed="true" stored="false"/>
+  <field name="url" type="url" indexed="true" stored="true" required="true"/>
+  <!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
+  <field name="content" type="text_general" indexed="true" stored="true"/>
+  <field name="title" type="text_general" indexed="true" stored="true"/>
+  <field name="cache" type="string" indexed="false" stored="true"/>
+  <field name="tstamp" type="date" indexed="false" stored="true"/>
+
+  <!-- catch-all field -->
+  <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
+
+  <!-- fields for index-anchor plugin -->
+  <field name="anchor" type="text_general" indexed="true" stored="true" multiValued="true"/>
+
+  <!-- fields for index-more plugin -->
+  <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
+  <field name="contentLength" type="string" indexed="false" stored="true"/>
+  <field name="lastModified" type="date" indexed="false" stored="true"/>
+  <field name="date" type="tdate" indexed="true" stored="true"/>
+
+  <!-- fields for languageidentifier plugin -->
+  <field name="lang" type="string" indexed="true" stored="true"/>
+
+  <!-- fields for subcollection plugin -->
+  <field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>
+
+  <!-- fields for feed plugin (tag is also used by microformats-reltag) -->
+  <field name="author" type="string" indexed="true" stored="true"/>
+  <field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
+  <field name="feed" type="string" indexed="true" stored="true"/>
+  <field name="publishedDate" type="date" indexed="true" stored="true"/>
+  <field name="updatedDate" type="date" indexed="true" stored="true"/>
+
+  <!-- fields for creativecommons plugin -->
+  <field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>
+</fields>
+<!-- The "id" field declared in <fields> is the unique key of each document. -->
+<uniqueKey>id</uniqueKey>
+<!-- The default search field is the catch-all "text" field. -->
+<defaultSearchField>text</defaultSearchField>
+<!-- The query parser's default operator is set to OR.
+     NOTE(review): presumably this applies when a query puts no explicit operator
+     between terms; confirm against the Solr version in use. -->
+<solrQueryParser defaultOperator="OR"/>
+<!-- copyField commands copy one field to another at the time a document
+is added to the index.  They are used either to index the same field differently,
+or to add multiple fields to the same field for easier/faster searching.
+Here content, url, title, anchor and author are all copied into "text",
+the field named by defaultSearchField. -->
+<copyField source="content" dest="text"/>
+<copyField source="url" dest="text"/>
+<copyField source="title" dest="text"/>
+<copyField source="anchor" dest="text"/>
+<copyField source="author" dest="text"/>
+</schema>

Mercurial > hg > nutch-mpiwg-plugins

comparison conf/schema-solr4.xml @ 0:3b37d71af924 default tip