Mercurial > hg > nutch-mpiwg-plugins
diff src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line diff
// Origin: src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java
// (file added by this changeset, Tue Feb 26 15:50:30 2013 +0100)
package de.mpiwg.itgroup.indexer.urlmeta;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

/**
 * Indexing filter that copies selected parse-metadata values into the
 * {@link NutchDocument} being indexed.
 *
 * <p>Two configuration properties name the metadata keys to copy:
 * <ul>
 * <li>{@code urlmeta.tags} - keys looked up verbatim in the parse metadata;</li>
 * <li>{@code urlmeta.mpiwg} - keys looked up verbatim and, when that yields no
 * value, again with a {@code "metatag."} prefix. Every value found for these
 * keys is additionally appended to a debug dump file.</li>
 * </ul>
 */
public class URLMetaIndexingFilter implements IndexingFilter {

    private static final Log LOG = LogFactory
            .getLog(URLMetaIndexingFilter.class);

    private static final String CONF_PROPERTY = "urlmeta.tags";
    private static String[] urlMetaTags;

    private static final String CONF_PROPERTY_MPIWG = "urlmeta.mpiwg";
    private static String[] metadataClasses;

    /** File that receives the debug dump of the "urlmeta.mpiwg" values. */
    private static final String DEBUG_DUMP_PATH = "/tmp/out2";

    private Configuration conf;

    /**
     * Adds the configured metadata values of the parsed page to the document.
     *
     * <p>The values are read from the parse metadata
     * ({@code parse.getData().getParseMeta()}); {@code datum} and
     * {@code inlinks} are not consulted. Each of the two configured key lists
     * is processed independently, so an unset list is simply skipped.
     *
     * @param doc     document to enrich; returned unchanged when {@code null}
     * @param parse   parse result whose metadata is read
     * @param url     URL of the page, only used in the debug dump
     * @param datum   unused
     * @param inlinks unused
     * @return {@code doc}, with one field value added per metadata value found
     * @throws IndexingException declared by the interface; not thrown here
     * @see IndexingFilter#filter
     */
    public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
            CrawlDatum datum, Inlinks inlinks) throws IndexingException {
        // The key lists are static and may have been overwritten by another
        // instance, so refresh them from this instance's configuration. This
        // deliberately does NOT go through setConf(), so no file is opened
        // per document.
        if (conf != null) {
            loadTagSettings(conf);
        }

        if (doc == null || (urlMetaTags == null && metadataClasses == null)) {
            return doc;
        }

        Metadata md = parse.getData().getParseMeta();

        if (urlMetaTags != null) {
            for (String metatag : urlMetaTags) {
                for (String value : md.getValues(metatag)) {
                    doc.add(metatag, value);
                }
            }
        }

        if (metadataClasses != null) {
            StringBuilder dump = new StringBuilder();
            for (String metatag : metadataClasses) {
                String[] vals = md.getValues(metatag);
                if (vals.length == 0) {
                    // Nothing under the plain key: retry with the "metatag." prefix.
                    vals = md.getValues("metatag." + metatag);
                }
                for (String value : vals) {
                    dump.append("-------------\n");
                    dump.append("URL:").append(url);
                    dump.append("-------------\n");
                    dump.append(value);
                    dump.append("\n");
                    doc.add(metatag, value);
                }
            }
            appendDebugDump(dump);
        }

        return doc;
    }

    /**
     * Returns the configuration last passed to {@link #setConf}.
     *
     * @return the current configuration, possibly {@code null}
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration and reads the "urlmeta.tags" and
     * "urlmeta.mpiwg" key lists from it.
     *
     * @param conf configuration to use; when {@code null} the previously
     *             loaded key lists are left untouched
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        if (conf == null) {
            return;
        }
        loadTagSettings(conf);
    }

    /** Reads both metadata key lists from the given configuration. */
    private static void loadTagSettings(Configuration conf) {
        urlMetaTags = conf.getStrings(CONF_PROPERTY);
        metadataClasses = conf.getStrings(CONF_PROPERTY_MPIWG);
    }

    /**
     * Appends the given text to the debug dump file. The file is opened in
     * append mode and closed again on every call, so no handle stays open
     * between documents. I/O failures are logged and otherwise ignored
     * because the dump is only a debugging aid.
     */
    private void appendDebugDump(CharSequence text) {
        if (text.length() == 0) {
            return;
        }
        FileWriter writer = null;
        try {
            writer = new FileWriter(DEBUG_DUMP_PATH, true);
            writer.write(text.toString());
            writer.flush();
        } catch (IOException e) {
            LOG.warn("Could not append metadata dump to " + DEBUG_DUMP_PATH, e);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    LOG.warn("Could not close " + DEBUG_DUMP_PATH, e);
                }
            }
        }
    }
}