view conf/nutch-site.xml @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line source

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- The stylesheet instruction above is a display hint only; its relative
     href expects a configuration.xsl located next to this file. -->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
  <!-- Agent name this crawler announces itself with. Upstream
       nutch-default.xml describes http.agent.name as the HTTP User-Agent
       value and requires it to be set. -->
  <name>http.agent.name</name>
  <value>MPIWG crawler2</value>
</property>
<property>
  <!-- Lists a single tag name, "description". Presumably the comma-delimited
       list of custom URL metadata tags used by the urlmeta feature - TODO
       confirm. NOTE(review): plugin.includes in this file contains no urlmeta
       plugin id, so verify which plugin actually reads this key. -->
  <name>urlmeta.tags</name>
  <value>description</value>
</property>
<property>
  <!-- Comma-separated field names, written without spaces around the commas.
       NOTE(review): looks site-specific; presumably read by the custom
       parse-mpiwg / parse-MPIWG-metaTag plugins enabled in plugin.includes -
       confirm against the plugin source. -->
  <name>urlmeta.mpiwg</name>
  <value>first_name,last_name,project_title,project_author,project_author_url,description,main_content,lang,urlNorm</value>
</property>


<property>
  <!-- Refers to mpiwg-parser.xml by bare file name (no directory part).
       Presumably a rule file that a custom MPIWG parse plugin loads from the
       conf directory or classpath - TODO confirm. -->
  <name>urlmeta.mpiwg-parser</name>
  <value>mpiwg-parser.xml</value>
</property>


<property>
  <!-- Refers to mpiwg-dom-parser.xml by bare file name (no directory part).
       Presumably the DOM-based counterpart of urlmeta.mpiwg-parser, loaded by
       a custom MPIWG parse plugin - TODO confirm. -->
  <name>urlmeta.mpiwg-dom-parser</name>
  <value>mpiwg-dom-parser.xml</value>
</property>


<property>
  <!-- Site override of the plugin selection. The value is one regular
       expression; its last three alternatives (parse-metatags, parse-mpiwg,
       parse-MPIWG-metaTag) are spelled with differing case, so verify each
       one against the actual plugin id. The description below only talks
       about Nutch's defaults and does not mention these additions. -->
  <name>plugin.includes</name>
  <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|parse-metatags|parse-mpiwg|parse-MPIWG-metaTag</value>
  <description>
    Regular expression naming plugin directory names to include.  Any plugin
    not matching this expression is excluded. In any case you need at least
    include the nutch-extensionpoints plugin. By default Nutch includes
    crawling just HTML and plain text via HTTP, and basic indexing and search
    plugins.
  </description>
</property>


<property>
  <!-- Caps each HTTP download at 200000 bytes. Per the description, content
       beyond that length is truncated because the value is not negative. -->
  <name>http.content.limit</name>
  <value>200000</value>
  <description>
    The length limit for downloaded content using the http:// protocol, in
    bytes. If this value is nonnegative (>=0), content longer than it will be
    truncated; otherwise, no truncation at all. Do not confuse this setting
    with the file.content.limit setting.
  </description>
</property>

<property>
  <!-- One second of delay between successive requests to the same server. -->
  <name>fetcher.server.delay</name>
  <value>1</value>
  <description>
    The number of seconds the fetcher will delay between successive requests
    to the same server.
  </description>
</property>

</configuration>