view src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line source

package de.mpiwg.itgroup.indexer.urlmeta;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

/**
 * Indexing filter that copies selected values from a page's parse metadata
 * into the {@link NutchDocument} being indexed.
 *
 * <p>Two configuration properties are read in {@link #setConf(Configuration)}:
 * <ul>
 * <li>{@code urlmeta.tags} - metadata keys that are copied as-is;</li>
 * <li>{@code urlmeta.mpiwg} - metadata keys that are looked up under their own
 * name first and, if no value is found, under {@code "metatag." + name}. Every
 * value found for these keys is additionally appended to a debug dump file.</li>
 * </ul>
 *
 * <p>The configured key lists are held in static fields and are therefore
 * shared by all instances of this class.
 */
public class URLMetaIndexingFilter implements IndexingFilter {

        private static final Log LOG = LogFactory
                        .getLog(URLMetaIndexingFilter.class);
        private static final String CONF_PROPERTY = "urlmeta.tags";
        private static String[] urlMetaTags;

        private static final String CONF_PROPERTY_MPIWG = "urlmeta.mpiwg";
        private static String[] metadDataClasses;

        /** Prefix of the fallback key tried when a "urlmeta.mpiwg" key has no values. */
        private static final String METATAG_PREFIX = "metatag.";

        /** File that receives the debug dump of all "urlmeta.mpiwg" values. */
        private static final String DEBUG_DUMP_PATH = "/tmp/out2";

        private Configuration conf;

        /** Debug dump writer; stays null when the dump file could not be opened. */
        private FileWriter fw;

        /** True once opening the debug dump has been attempted (successfully or not). */
        private boolean debugWriterAttempted;

        /**
         * Looks up every key configured in "urlmeta.tags" and "urlmeta.mpiwg" in
         * the parse metadata of the page and adds each value found as a field of
         * the same name to the NutchDocument. The two key lists are handled
         * independently: a missing "urlmeta.tags" property does not disable the
         * "urlmeta.mpiwg" keys, and vice versa.
         *
         * @param doc     document to enrich; returned unchanged when null
         * @param parse   parse result whose parse metadata is read
         * @param url     URL of the page, only used for the debug dump
         * @param datum   not used by this filter
         * @param inlinks not used by this filter
         * @return the (possibly enriched) document passed in as {@code doc}
         * @see IndexingFilter#filter
         */
        public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
                        CrawlDatum datum, Inlinks inlinks) throws IndexingException {
                // Re-read the key lists from this instance's configuration: they
                // live in static fields and may have been set by another instance.
                if (conf != null) {
                        this.setConf(conf);
                }

                if (doc == null || parse == null) {
                        return doc;
                }
                if (urlMetaTags == null && metadDataClasses == null) {
                        return doc; // nothing configured
                }

                Metadata md = parse.getData().getParseMeta();

                if (urlMetaTags != null) {
                        for (String metatag : urlMetaTags) {
                                for (String value : md.getValues(metatag)) {
                                        doc.add(metatag, value);
                                }
                        }
                }

                if (metadDataClasses != null) {
                        for (String metatag : metadDataClasses) {
                                String[] vals = md.getValues(metatag);
                                if (vals.length == 0) {
                                        // Retry with the "metatag." prefix in case
                                        // the value was stored under that key.
                                        vals = md.getValues(METATAG_PREFIX + metatag);
                                }
                                for (String value : vals) {
                                        dumpDebug(url, value);
                                        doc.add(metatag, value);
                                }
                        }
                }

                return doc;
        }

        /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
        public Configuration getConf() {
                return conf;
        }

        /**
         * Stores the configuration, makes sure the debug dump file has been opened
         * (once per instance) and reads the key lists from the "urlmeta.tags" and
         * "urlmeta.mpiwg" properties. With a null configuration the previously
         * read key lists are left untouched.
         */
        public void setConf(Configuration conf) {
                this.conf = conf;

                openDebugWriterOnce();

                if (conf == null) {
                        return;
                }

                urlMetaTags = conf.getStrings(CONF_PROPERTY);
                metadDataClasses = conf.getStrings(CONF_PROPERTY_MPIWG);
        }

        /**
         * Opens the debug dump file in append mode the first time it is called.
         * Later calls do nothing, so repeated setConf() calls neither leak file
         * handles nor repeat the warning after a failed attempt.
         */
        private void openDebugWriterOnce() {
                if (debugWriterAttempted) {
                        return;
                }
                debugWriterAttempted = true;
                try {
                        fw = new FileWriter(DEBUG_DUMP_PATH, true);
                } catch (IOException e) {
                        LOG.warn("Cannot open debug dump file " + DEBUG_DUMP_PATH
                                        + "; debug output is disabled", e);
                }
        }

        /**
         * Appends one URL/value record to the debug dump; does nothing when the
         * dump file is not open. The record layout is kept exactly as before
         * (note that no line break follows the URL).
         */
        private void dumpDebug(Text url, String value) {
                if (fw == null) {
                        return;
                }
                try {
                        fw.write("-------------\n");
                        fw.write("URL:" + url);
                        fw.write("-------------\n");
                        fw.write(value);
                        fw.write("\n");
                        fw.flush();
                } catch (IOException e) {
                        LOG.warn("Cannot write to debug dump file " + DEBUG_DUMP_PATH, e);
                }
        }
}