Mercurial > hg > nutch-mpiwg-plugins
comparison conf/nutch-site.xml @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b37d71af924 |
---|---|
1 <?xml version="1.0"?> | |
2 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
3 | |
4 <!-- Put site-specific property overrides in this file. --> | |
5 | |
6 <configuration> | |
7 <property> | |
8 <name>http.agent.name</name> | |
9 <value>MPIWG crawler2</value> | |
10 </property> | |
11 <property> | |
12 <name>urlmeta.tags</name> | |
13 <value>description</value> | |
14 </property> | |
15 <property> | |
16 <name>urlmeta.mpiwg</name> | |
17 <value>first_name,last_name,project_title,project_author,project_author_url,description,main_content,lang,urlNorm</value> | |
18 </property> | |
19 | |
20 | |
21 <property> | |
22 <name>urlmeta.mpiwg-parser</name> | |
23 <value>mpiwg-parser.xml</value> | |
24 </property> | |
25 | |
26 | |
27 <property> | |
28 <name>urlmeta.mpiwg-dom-parser</name> | |
29 <value>mpiwg-dom-parser.xml</value> | |
30 </property> | |
31 | |
32 | |
33 <property> | |
34 <name>plugin.includes</name> | |
35 <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|parse-metatags|parse-mpiwg|parse-MPIWG-metaTag</value> | |
36 <description>Regular expression naming plugin directory names to | |
37 include. Any plugin not matching this expression is excluded. | |
38 In any case you need to at least include the nutch-extensionpoints plugin. By | |
39 default Nutch includes crawling just HTML and plain text via HTTP, | |
40 and basic indexing and search plugins. | |
41 </description> | |
42 </property> | |
43 | |
44 | |
45 <property> | |
46 <name>http.content.limit</name> | |
47 <value>200000</value> | |
48 <description>The length limit for downloaded content using the http:// | |
49 protocol, in bytes. If this value is nonnegative (>=0), content longer | |
50 than it will be truncated; otherwise, no truncation at all. Do not | |
51 confuse this setting with the file.content.limit setting. | |
52 </description> | |
53 </property> | |
54 | |
55 <property> | |
56 <name>fetcher.server.delay</name> | |
57 <value>1</value> | |
58 <description>The number of seconds the fetcher will delay between | |
59 successive requests to the same server.</description> | |
60 </property> | |
61 | |
62 </configuration> |