/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.indexer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.lucene.LuceneWriter; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; /** * Indexes field description and keywords provided by the metataparser Note that * we limit ourselves to these 2 types of metadata as we must specify the exact * values in addIndexBackendOptions **/ public class MetaTagsIndexer implements IndexingFilter { public static final Log LOG = LogFactory.getLog(MetaTagsIndexer.class .getName()); private Configuration conf; public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); String description = metadata.get("metatag.description"); String keywords = metadata.get("metatag.keywords"); if (description != null) { doc.add("description", description); LOG.debug(url.toString() + " : added " + description + " to the description Field"); } if (keywords != null) { // split the keywords and send them as separate fields // in SOLR this will allow us to specify a gap in order to prevent // cross keywords matching String[] kws = keywords.split(" *, *"); for (String kw : kws) { doc.add("keywords", kw); } LOG.debug(url.toString() + " : added " + kws + " to the keywords Field"); } return doc; } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return this.conf; } public void addIndexBackendOptions(Configuration conf) { LuceneWriter.addFieldOptions("description", LuceneWriter.STORE.YES, LuceneWriter.INDEX.TOKENIZED, conf); LuceneWriter.addFieldOptions("keywords", LuceneWriter.STORE.NO, LuceneWriter.INDEX.TOKENIZED, conf); } }