comparison conf/nutch-site.xml @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3b37d71af924
1 <?xml version="1.0"?>
2 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3
4 <!-- Put site-specific property overrides in this file. -->
5
6 <configuration>
7 <property>
8 <name>http.agent.name</name>
9 <value>MPIWG crawler2</value>
10 </property>
11 <property>
12 <name>urlmeta.tags</name>
13 <value>description</value>
14 </property>
15 <property>
16 <name>urlmeta.mpiwg</name>
17 <value>first_name,last_name,project_title,project_author,project_author_url,description,main_content,lang,urlNorm</value>
18 </property>
19
20
21 <property>
22 <name>urlmeta.mpiwg-parser</name>
23 <value>mpiwg-parser.xml</value>
24 </property>
25
26
27 <property>
28 <name>urlmeta.mpiwg-dom-parser</name>
29 <value>mpiwg-dom-parser.xml</value>
30 </property>
31
32
33 <property>
34 <name>plugin.includes</name>
35 <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|parse-metatags|parse-mpiwg|parse-MPIWG-metaTag</value>
36 <description>Regular expression naming plugin directory names to
37 include. Any plugin not matching this expression is excluded.
38 In any case you need to at least include the nutch-extensionpoints plugin. By
39 default Nutch includes crawling just HTML and plain text via HTTP,
40 and basic indexing and search plugins.
41 </description>
42 </property>
43
44
45 <property>
46 <name>http.content.limit</name>
47 <value>200000</value>
48 <description>The length limit for downloaded content using the http://
49 protocol, in bytes. If this value is nonnegative (>=0), content longer
50 than it will be truncated; otherwise, no truncation at all. Do not
51 confuse this setting with the file.content.limit setting.
52 </description>
53 </property>
54
55 <property>
56 <name>fetcher.server.delay</name>
57 <value>1</value>
58 <description>The number of seconds the fetcher will delay between
59 successive requests to the same server.</description>
60 </property>
61
62 </configuration>