view src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/scoring/urlmeta/URLMetaScoringFilter.java @ 0:3b37d71af924 default tip

iniitial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line source

package de.mpiwg.itgroup.scoring.urlmeta;

import java.util.Collection;
import java.util.Map.Entry;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;


/**
 * For documentation:
 * 
 * @see URLMetaIndexingFilter
 */
public class URLMetaScoringFilter extends Configured implements ScoringFilter {

  private static final Log LOG = LogFactory.getLog(URLMetaScoringFilter.class);
  private static final String CONF_PROPERTY = "urlmeta.tags";
 
  
  private static final String CONF_PROPERTY_MPIWG = "urlmeta.mpiwg";
  
  private static String[] metaTags; // enthaelt alle metadaten relevant fuer MPIWG indexing,i.e. spezielle tags in meta und classen im body
  private Configuration conf;

  /**
   * This will take the metatags that you have listed in your "urlmeta.tags" and  "urlmeta.mpiwg"
   * property, and looks for them inside the parseData object. If they exist,
   * this will be propagated into your 'targets' Collection's ["outlinks"]
   * attributes.
   * 
   * @see ScoringFilter#distributeScoreToOutlinks
   */
  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
      CrawlDatum adjust, int allCount) throws ScoringFilterException {
    if (metaTags == null || targets == null || parseData == null)
      return adjust;

    Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator();

    while (targetIterator.hasNext()) {
      Entry<Text, CrawlDatum> nextTarget = targetIterator.next();

      for (String metatag : metaTags) {
        String metaFromParse = parseData.getMeta(metatag);

        if (metaFromParse == null)
          continue;

        nextTarget.getValue().getMetaData().put(new Text(metatag),
            new Text(metaFromParse));
      }
    }
    return adjust;
  }

  /**
   * Takes the metadata, specified in your "urlmeta.tags"  and "urlmeta.mpiwg" property, from the
   * datum object and injects it into the content. This is transfered to the
   * parseData object.
   * 
   * @see ScoringFilter#passScoreBeforeParsing
   * @see URLMetaScoringFilter#passScoreAfterParsing
   */
  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
    if (metaTags == null || content == null || datum == null)
      return;

    for (String metatag : metaTags) {
      Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag));

      if (metaFromDatum == null)
        continue;

      content.getMetadata().set(metatag, metaFromDatum.toString());
    }
  }

  /**
   * Takes the metadata, which was lumped inside the content, and replicates it
   * within your parse data.
   * 
   * @see URLMetaScoringFilter#passScoreBeforeParsing
   * @see ScoringFilter#passScoreAfterParsing
   */
  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
    if (metaTags == null || content == null || parse == null)
      return;

    for (String metatag : metaTags) {
      String metaFromContent = content.getMetadata().get(metatag);

      if (metaFromContent == null)
        continue;

      parse.getData().getParseMeta().set(metatag, metaFromContent);
    }
  }

  /** Boilerplate */
  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
      throws ScoringFilterException {
    return initSort;
  }

  /** Boilerplate */
  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
      throws ScoringFilterException {
    return initScore;
  }

  /** Boilerplate */
  public void initialScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
    return;
  }

  /** Boilerplate */
  public void injectedScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
    return;
  }

  /** Boilerplate */
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
      List inlinked) throws ScoringFilterException {
    return;
  }

  /**
   * handles conf assignment and pulls the value assignment from the
   * "urlmeta.tags" property
   */
  public void setConf(Configuration conf) {
    super.setConf(conf);

    if (conf == null)
      return;

    //lade alle metadata typen
    String[] urlMetaTagsTmp = conf.getStrings(CONF_PROPERTY);
    //String[] classMetadata = conf.getStrings(CONF_PROPERTY_MPIWG);
     
    //metaTags = (String[]) ArrayUtils.addAll(urlMetaTagsTmp, classMetadata);
    metaTags = urlMetaTagsTmp;
     
    
  }

  /** Boilerplate */
  public Configuration getConf() {
    return conf;
  }
}