/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
/**
* Indexes the description and keywords fields provided by the metatags parser.
* Note that we limit ourselves to these 2 types of metadata as we must specify
* the exact field names in addIndexBackendOptions.
**/
public class MetaTagsIndexer implements IndexingFilter {

  public static final Log LOG = LogFactory.getLog(MetaTagsIndexer.class
      .getName());

  /** Parse-metadata key holding the page's meta description. */
  private static final String DESCRIPTION_META_KEY = "metatag.description";

  /** Parse-metadata key holding the page's comma-separated meta keywords. */
  private static final String KEYWORDS_META_KEY = "metatag.keywords";

  /** Name of the document field receiving the description. */
  private static final String DESCRIPTION_FIELD = "description";

  /** Name of the document field receiving each individual keyword. */
  private static final String KEYWORDS_FIELD = "keywords";

  /**
   * Separator between keywords: a comma with any number of spaces on either
   * side. Compiled once because {@link #filter} runs for every document.
   */
  private static final Pattern KEYWORD_SEPARATOR = Pattern.compile(" *, *");

  private Configuration conf;

  /**
   * Copies the description and keywords found in the parse metadata into the
   * document being indexed.
   * <p>
   * The value stored under {@code metatag.description} is added as a single
   * {@code description} value. The value stored under
   * {@code metatag.keywords} is split on commas and each piece is added as a
   * separate {@code keywords} value. A metadata entry that is absent
   * ({@code null}) is skipped.
   *
   * @param doc the document to add the fields to
   * @param parse the parse whose parse metadata is read
   * @param url the URL of the page; only used in debug log messages
   * @param datum not used by this filter
   * @param inlinks not used by this filter
   * @return the same {@code doc} instance that was passed in
   * @throws IndexingException not raised explicitly by this method
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    Metadata metadata = parse.getData().getParseMeta();
    String description = metadata.get(DESCRIPTION_META_KEY);
    String keywords = metadata.get(KEYWORDS_META_KEY);

    if (description != null) {
      doc.add(DESCRIPTION_FIELD, description);
      // Guarded so the message is only built when it will be emitted.
      if (LOG.isDebugEnabled()) {
        LOG.debug(url + " : added " + description
            + " to the description Field");
      }
    }

    if (keywords != null) {
      // split the keywords and send them as separate fields
      // in SOLR this will allow us to specify a gap in order to prevent
      // cross keywords matching
      String[] kws = KEYWORD_SEPARATOR.split(keywords);
      for (String kw : kws) {
        doc.add(KEYWORDS_FIELD, kw);
      }
      if (LOG.isDebugEnabled()) {
        // Arrays.toString renders the elements; concatenating the array
        // itself would only print its type and identity hash.
        LOG.debug(url + " : added " + Arrays.toString(kws)
            + " to the keywords Field");
      }
    }
    return doc;
  }

  /**
   * Stores the configuration for later retrieval through {@link #getConf()}.
   *
   * @param conf the configuration to keep
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Returns the configuration last passed to {@link #setConf(Configuration)}.
   *
   * @return the stored configuration, or {@code null} if none was set
   */
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Registers the Lucene field options for the two fields this filter emits:
   * {@code description} is stored and tokenized, {@code keywords} is
   * tokenized but not stored.
   *
   * @param conf the configuration handed to
   *        {@code LuceneWriter.addFieldOptions}
   */
  public void addIndexBackendOptions(Configuration conf) {
    LuceneWriter.addFieldOptions(DESCRIPTION_FIELD, LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.TOKENIZED, conf);
    LuceneWriter.addFieldOptions(KEYWORDS_FIELD, LuceneWriter.STORE.NO,
        LuceneWriter.INDEX.TOKENIZED, conf);
  }
}