diff src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugin/parse-MPIWG-metaTag/src/java/de/mpiwg/itgroup/indexer/urlmeta/URLMetaIndexingFilter.java	Tue Feb 26 15:50:30 2013 +0100
@@ -0,0 +1,157 @@
+package de.mpiwg.itgroup.indexer.urlmeta;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexing filter that copies selected parse-metadata values of a page into
+ * its {@link NutchDocument}.
+ * <p>
+ * The metadata keys to copy are read from two configuration properties:
+ * "urlmeta.tags" and "urlmeta.mpiwg". Keys from "urlmeta.mpiwg" are looked up
+ * under the plain key first and under "metatag.&lt;key&gt;" as a fallback, and
+ * every value copied for them is also appended to a debug dump file.
+ */
+public class URLMetaIndexingFilter implements IndexingFilter {
+
+        private static final Log LOG = LogFactory
+                        .getLog(URLMetaIndexingFilter.class);
+        private static final String CONF_PROPERTY = "urlmeta.tags";
+        private static String[] urlMetaTags;
+
+        private static final String CONF_PROPERTY_MPIWG = "urlmeta.mpiwg";
+        private static String[] metadDataClasses;
+
+        /** File that the debug dump of "urlmeta.mpiwg" values is appended to. */
+        private static final String DEBUG_DUMP_PATH = "/tmp/out2";
+
+        private Configuration conf;
+
+        /** Writer of the debug dump; null while the dump file is not open. */
+        private FileWriter fw;
+
+        /**
+         * Adds the values of the configured metadata keys, as found in the parse
+         * metadata of the page, to the document.
+         * <p>
+         * The two key lists are handled independently of each other; a list whose
+         * property is not configured is skipped.
+         *
+         * @param doc the document being indexed; returned as-is if null
+         * @param parse the parse result whose parse metadata is read
+         * @param url the page URL; only written to the debug dump
+         * @param datum not used
+         * @param inlinks not used
+         * @return the document passed in, with the found values added
+         * @see IndexingFilter#filter
+         */
+        @Override
+        public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+                        CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+                if (doc == null
+                                || (urlMetaTags == null && metadDataClasses == null)) {
+                        return doc;
+                }
+
+                Metadata md = parse.getData().getParseMeta();
+
+                if (urlMetaTags != null) {
+                        for (String metatag : urlMetaTags) {
+                                for (String value : md.getValues(metatag)) {
+                                        doc.add(metatag, value);
+                                }
+                        }
+                }
+
+                if (metadDataClasses != null) {
+                        for (String metatag : metadDataClasses) {
+                                String[] vals = md.getValues(metatag);
+                                if (vals.length == 0) {
+                                        // nothing under the plain key: retry with the
+                                        // "metatag." prefix
+                                        vals = md.getValues("metatag." + metatag);
+                                }
+                                for (String value : vals) {
+                                        dumpValue(url, value);
+                                        doc.add(metatag, value);
+                                }
+                        }
+                }
+
+                return doc;
+        }
+
+        /**
+         * Appends one record (separator, URL, separator, value) to the debug dump
+         * and flushes it. Does nothing if the dump file is not open. A write
+         * error is logged and otherwise ignored so that indexing continues.
+         *
+         * @param url the page URL the value belongs to
+         * @param value the metadata value to dump
+         */
+        private void dumpValue(Text url, String value) {
+                if (fw == null) {
+                        return;
+                }
+                try {
+                        fw.write("-------------\n");
+                        // NOTE(review): no line break follows the URL, so the
+                        // second separator lands on the same line; kept unchanged.
+                        fw.write("URL:" + url);
+                        fw.write("-------------\n");
+                        fw.write(value);
+                        fw.write("\n");
+                        fw.flush();
+                } catch (IOException e) {
+                        LOG.warn("Could not write to debug dump " + DEBUG_DUMP_PATH, e);
+                }
+        }
+
+        /** Boilerplate */
+        @Override
+        public Configuration getConf() {
+                return conf;
+        }
+
+        /**
+         * Stores the configuration, opens the debug dump file unless it is
+         * already open, and reads the "urlmeta.tags" and "urlmeta.mpiwg" key
+         * lists.
+         *
+         * @param conf the configuration; if null the key lists are left untouched
+         */
+        @Override
+        public void setConf(Configuration conf) {
+                this.conf = conf;
+
+                if (fw == null) {
+                        try {
+                                fw = new FileWriter(DEBUG_DUMP_PATH, true);
+                        } catch (IOException e) {
+                                // the dump is only a debugging aid: carry on without it
+                                LOG.warn("Could not open debug dump " + DEBUG_DUMP_PATH, e);
+                        }
+                }
+                if (conf == null) {
+                        return;
+                }
+
+                urlMetaTags = conf.getStrings(CONF_PROPERTY);
+                metadDataClasses = conf.getStrings(CONF_PROPERTY_MPIWG);
+        }
+}
\ No newline at end of file