/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.indexer.urlmeta;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;

/**
 * This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655
 * patch by doing two things:
 * <ol>
 * <li>Meta tags that are supplied with your crawl URLs, during injection, will
 * be propagated throughout the outlinks of those crawl URLs.</li>
 * <li>When you index your URLs, the meta tags that you specified with your
 * URLs will be indexed alongside those URLs -- and can be directly queried,
 * assuming you have done everything else correctly.</li>
 * </ol>
 *
 * <p>
 * The flat file of URLs you are injecting should, per NUTCH-655, be
 * tab-delimited in the form of:
 *
 * <pre>
 * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
 * </pre>
 *
 * <p>
 * Be aware that if you collide with keywords that are already in use (such as
 * nutch.score/nutch.fetchInterval) then you are in for some unpredictable
 * behavior.
 *
 * <p>
 * Furthermore, in your nutch-site.xml config, you must specify (1) that this
 * plugin is to be used, as well as (2) which meta tags it should actively look
 * for. This does not mean that you must use these tags for every URL, but it
 * does mean that you must list <em>all</em> of the meta tags that you have
 * specified -- if you want them to be propagated and indexed, that is.
 *
 * <ol>
 * <li>As of Nutch 1.2, the property "plugin.includes" looks as follows:
 * {@code <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>}
 * You should add urlmeta as an OR'ed option (e.g., using |) to the plugin
 * list, in order to call both pieces (the indexer and the scoring parts) of
 * this plugin.</li>
 * <li>You must also specify the property "urlmeta.tags", whose values are
 * comma-delimited: {@code <value>key1, key2, key3</value>}. Whitespace around
 * each key is ignored by this filter.</li>
 * </ol>
 *
 * TODO: It may be ideal to offer two separate properties, to specify what gets
 * indexed versus merely propagated.
 */
public class URLMetaIndexingFilter implements IndexingFilter {

  private static final Log LOG = LogFactory
      .getLog(URLMetaIndexingFilter.class);

  /** Name of the configuration property listing the meta tags to index. */
  private static final String CONF_PROPERTY = "urlmeta.tags";

  /**
   * Tag names read from {@link #CONF_PROPERTY} by {@link #setConf}, or
   * <code>null</code> when the property has not been read or is unset. Kept
   * per instance so that filters configured differently do not overwrite each
   * other's tag list.
   */
  private String[] urlMetaTags;

  private Configuration conf;

  /**
   * Takes the meta tags listed in the "urlmeta.tags" property and looks for
   * them inside the CrawlDatum's metadata. Every tag that is present is added
   * as a field of the same name to the NutchDocument.
   *
   * @param doc
   *          the document being built; returned unchanged when
   *          <code>null</code>
   * @param parse
   *          not used by this filter
   * @param url
   *          not used by this filter
   * @param datum
   *          source of the metadata values; when <code>null</code> the
   *          document is returned unchanged
   * @param inlinks
   *          not used by this filter
   * @return the <code>doc</code> argument, with any matching tags added
   * @see IndexingFilter#filter
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Nothing to do without configured tags, a target document or a datum to
    // read the values from.
    if (urlMetaTags == null || doc == null || datum == null) {
      return doc;
    }

    for (String metatag : urlMetaTags) {
      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));

      if (metadata != null) {
        doc.add(metatag, metadata.toString());
      }
    }

    return doc;
  }

  /**
   * Tells the LuceneWriter that the configured meta tags should be part of its
   * indexing process: each tag is registered as a stored, tokenized field.
   *
   * @param conf
   *          configuration handed to
   *          <code>LuceneWriter.addFieldOptions</code>; when non-null it also
   *          replaces this filter's configuration via {@link #setConf}
   * @see IndexingFilter#addIndexBackendOptions
   */
  public void addIndexBackendOptions(Configuration conf) {
    if (conf != null) {
      this.setConf(conf);
    }

    if (urlMetaTags == null) {
      return;
    }

    for (String metatag : urlMetaTags) {
      LuceneWriter.addFieldOptions(metatag, LuceneWriter.STORE.YES,
          LuceneWriter.INDEX.TOKENIZED, conf);
    }
  }

  /**
   * Returns the configuration last passed to {@link #setConf}.
   *
   * @return the current configuration, possibly <code>null</code>
   */
  public Configuration getConf() {
    return conf;
  }

  /**
   * Handles conf assignment and loads the tag list from the "urlmeta.tags"
   * property. Each entry is trimmed and blank entries are dropped, so a value
   * written as <code>key1, key2</code> yields the tags <code>key1</code> and
   * <code>key2</code>.
   *
   * @param conf
   *          the new configuration; when <code>null</code> it is stored but
   *          the previously loaded tag list is left as it was
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    if (conf == null) {
      return;
    }

    String[] rawTags = conf.getStrings(CONF_PROPERTY);
    if (rawTags == null) {
      urlMetaTags = null;
      return;
    }

    // The values are split on commas only; strip the surrounding whitespace
    // here so that the tag text matches the metadata key exactly.
    List<String> tags = new ArrayList<String>(rawTags.length);
    for (String rawTag : rawTags) {
      String tag = rawTag.trim();
      if (tag.length() > 0) {
        tags.add(tag);
      }
    }
    urlMetaTags = tags.toArray(new String[tags.size()]);
  }
}