Mercurial > hg > nutch-mpiwg-plugins
view src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.itgroup.indexer.urlmeta;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

/**
 * Indexing filter that copies selected parse-metadata values into the
 * {@link NutchDocument} being indexed.
 *
 * <p>Two configuration properties select the metadata keys:
 * <ul>
 * <li>{@code urlmeta.tags} - keys copied verbatim;</li>
 * <li>{@code urlmeta.mpiwg} - keys that are looked up under their own name
 * first and, if that yields nothing, under {@code "metatag." + key}. Every
 * value found this way is also appended to a debug dump file.</li>
 * </ul>
 *
 * <p>The tag lists are held in static fields, so they are shared by all
 * instances of this class.
 */
public class URLMetaIndexingFilter implements IndexingFilter {

  private static final Log LOG = LogFactory.getLog(URLMetaIndexingFilter.class);

  private static final String CONF_PROPERTY = "urlmeta.tags";
  private static String[] urlMetaTags;

  private static final String CONF_PROPERTY_MPIWG = "urlmeta.mpiwg";
  private static String[] metadDataClasses;

  /** Prefix tried when an MPIWG tag is not found under its plain name. */
  private static final String METATAG_PREFIX = "metatag.";

  /** Debug dump that receives every MPIWG value added to a document. */
  private static final String DUMP_FILE = "/tmp/out2";

  private Configuration conf;

  /** Append-mode writer for {@link #DUMP_FILE}; null until opened or if opening failed. */
  private FileWriter fw;

  /** Set once opening the dump file has failed, so the failure is logged only once. */
  private boolean dumpUnavailable;

  /**
   * Adds the configured metadata values to the document.
   *
   * <p>For every key listed in {@code urlmeta.tags} all values found in the
   * parse metadata are added to {@code doc} under that key. For every key
   * listed in {@code urlmeta.mpiwg} the same is done, falling back to the key
   * prefixed with {@code "metatag."} when the plain key has no values; these
   * values are additionally written to the debug dump.
   *
   * <p>If {@code doc} is null or {@code urlmeta.tags} is not configured, the
   * document is returned untouched and the {@code urlmeta.mpiwg} keys are not
   * processed either.
   *
   * @param doc the document being built; may be null
   * @param parse the parse result whose parse metadata is read
   * @param url the URL of the document, used only in the debug dump
   * @param datum not used
   * @param inlinks not used
   * @return {@code doc}, possibly with additional fields
   * @throws IndexingException declared by the interface; not thrown here
   * @see IndexingFilter#filter
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Re-read the tag lists from this instance's configuration; they live in
    // static fields and may have been overwritten through another instance.
    if (conf != null) {
      this.setConf(conf);
    }

    if (urlMetaTags == null || doc == null) {
      return doc;
    }

    Metadata md = parse.getData().getParseMeta();

    for (String metatag : urlMetaTags) {
      for (String value : md.getValues(metatag)) {
        doc.add(metatag, value);
      }
    }

    // "urlmeta.mpiwg" is optional: getStrings() yields null when it is unset.
    if (metadDataClasses == null) {
      return doc;
    }

    for (String metatag : metadDataClasses) {
      String[] vals = md.getValues(metatag);
      if (vals.length == 0) {
        // Nothing under the plain name: retry with the "metatag." prefix.
        vals = md.getValues(METATAG_PREFIX + metatag);
      }
      for (String value : vals) {
        dumpValue(url, value);
        doc.add(metatag, value);
      }
    }

    return doc;
  }

  /** Returns the configuration last passed to {@link #setConf}. */
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration, makes sure the debug dump writer is open and
   * reads the {@code urlmeta.tags} and {@code urlmeta.mpiwg} properties.
   *
   * <p>This method is invoked for every filtered document, so the dump file is
   * opened at most once per filter instance instead of on every call.
   *
   * @param conf the configuration to use; if null the tag lists are left as they are
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    openDumpWriter();

    if (conf == null) {
      return;
    }

    urlMetaTags = conf.getStrings(CONF_PROPERTY);
    metadDataClasses = conf.getStrings(CONF_PROPERTY_MPIWG);
  }

  /** Opens the debug dump in append mode unless it is already open or known to be unavailable. */
  private void openDumpWriter() {
    if (fw != null || dumpUnavailable) {
      return;
    }
    try {
      fw = new FileWriter(DUMP_FILE, true);
    } catch (IOException e) {
      // The dump is a best-effort debugging aid; indexing continues without it.
      dumpUnavailable = true;
      LOG.warn("Cannot open debug dump " + DUMP_FILE + "; values will not be dumped", e);
    }
  }

  /** Appends one URL/value record to the debug dump; does nothing if the dump is not open. */
  private void dumpValue(Text url, String value) {
    if (fw == null) {
      return;
    }
    try {
      fw.write("-------------\n");
      fw.write("URL:" + url);
      fw.write("-------------\n");
      fw.write(value);
      fw.write("\n");
      fw.flush();
    } catch (IOException e) {
      LOG.warn("Cannot write to debug dump " + DUMP_FILE, e);
    }
  }
}