Mercurial > hg > nutch-mpiwg-plugins
view conf/nutch-site.xml @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line source
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!--
  Site-specific property overrides.
  Each <property> below is a name/value pair; <description> is for human
  readers only. Values are kept on a single line so that no indentation
  whitespace becomes part of a value.
-->
<configuration>

  <!-- HTTP agent name for this crawler. -->
  <property>
    <name>http.agent.name</name>
    <value>MPIWG crawler2</value>
  </property>

  <!--
    urlmeta.* settings: tag/field lists and parser definition file names.
    NOTE(review): presumably consumed by the parse-mpiwg and
    parse-MPIWG-metaTag plugins enabled in plugin.includes below; confirm.
  -->
  <property>
    <name>urlmeta.tags</name>
    <value>description</value>
  </property>

  <property>
    <name>urlmeta.mpiwg</name>
    <value>first_name,last_name,project_title,project_author,project_author_url,description,main_content,lang,urlNorm</value>
  </property>

  <property>
    <name>urlmeta.mpiwg-parser</name>
    <value>mpiwg-parser.xml</value>
  </property>

  <property>
    <name>urlmeta.mpiwg-dom-parser</name>
    <value>mpiwg-dom-parser.xml</value>
  </property>

  <!--
    Plugin selection. The expression includes parse-metatags, parse-mpiwg
    and parse-MPIWG-metaTag alongside the other listed plugins.
  -->
  <property>
    <name>plugin.includes</name>
    <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|parse-metatags|parse-mpiwg|parse-MPIWG-metaTag</value>
    <description>Regular expression naming plugin directory names to
    include. Any plugin not matching this expression is excluded.
    In any case you need at least include the nutch-extensionpoints plugin.
    By default Nutch includes crawling just HTML and plain text via HTTP,
    and basic indexing and search plugins.
    </description>
  </property>

  <!-- Fetch tuning: downloaded-content size limit and per-server delay. -->
  <property>
    <name>http.content.limit</name>
    <value>200000</value>
    <description>The length limit for downloaded content using the http://
    protocol, in bytes. If this value is nonnegative (>=0), content longer
    than it will be truncated; otherwise, no truncation at all. Do not
    confuse this setting with the file.content.limit setting.
    </description>
  </property>

  <property>
    <name>fetcher.server.delay</name>
    <value>1</value>
    <description>The number of seconds the fetcher will delay between
    successive requests to the same server.</description>
  </property>

</configuration>