/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
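/**
* Map/reduce job that turns segment, CrawlDb and LinkDb data into indexing
* actions. The mapper normalizes and filters URLs (when enabled) and wraps
* each value in a {@link NutchWritable}; the reducer joins all records for a
* URL, runs the indexing and scoring filters, and emits a
* {@link NutchIndexAction} (ADD or DELETE) per document.
*/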
public class IndexerMapReduce extends Configured
implements Mapper<Text, Writable, Text, NutchWritable>,
Reducer<Text, NutchWritable, Text, NutchIndexAction> {
public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
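// Names of the job configuration properties that control indexer behaviour.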
public static final String INDEXER_DELETE = "indexer.delete";
public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
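// Behaviour flags, populated from the job configuration in configure().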
private boolean skip = false;
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
// using normalizers and/or filters
private boolean normalize = false;
private boolean filter = false;
// url normalizers, filters and job configuration
private URLNormalizers urlNormalizers;
private URLFilters urlFilters;
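/**
* Reads the indexing and scoring filters and the deletion, skipping, URL
* filtering and URL normalizing settings from the job configuration.
*/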
public void configure(JobConf job) {
setConf(job);
this.filters = new IndexingFilters(getConf());
this.scfilters = new ScoringFilters(getConf());
this.delete = job.getBoolean(INDEXER_DELETE, false);
this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
filter = job.getBoolean(URL_FILTERING, false);
if (normalize) {
urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
}
if (filter) {
urlFilters = new URLFilters(getConf());
}
}
/**
* Normalizes the given url and trims extra whitespace.
*
* @param url The url to normalize.
*
* @return The normalized url, the url unchanged if normalization is
* disabled, or null if normalization fails.
*/
private String normalizeUrl(String url) {
if (!normalize) {
return url;
}
String normalized = null;
if (urlNormalizers != null) {
try {
// normalize and trim the url
normalized = urlNormalizers.normalize(url,
URLNormalizers.SCOPE_INDEXER);
normalized = normalized.trim();
}
catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
normalized = null;
}
}
return normalized;
}
/**
* Filters the given url.
*
* @param url The url to filter.
*
* @return The filtered url, the url unchanged if filtering is disabled,
* or null if the url is rejected or filtering fails.
*/
private String filterUrl(String url) {
if (!filter) {
return url;
}
try {
url = urlFilters.filter(url);
} catch (Exception e) {
url = null;
}
return url;
}
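/**
* Normalizes and filters the URL key (when enabled), then forwards the
* value, wrapped in a {@link NutchWritable}, to the reducer. Records whose
* URL is rejected are silently dropped.
*/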
public void map(Text key, Writable value,
OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
String urlString = filterUrl(normalizeUrl(key.toString()));
if (urlString == null) {
return;
} else {
key.set(urlString);
}
output.collect(key, new NutchWritable(value));
}
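/**
* Joins the CrawlDb, fetch, parse and inlink records collected for a URL
* and emits either a DELETE action (gone pages, permanent redirects, robots
* noindex) or an ADD action carrying the document built by the indexing and
* scoring filters.
*/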
public void reduce(Text key, Iterator<NutchWritable> values,
OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
throws IOException {
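// Unwrap the records emitted for this URL into their respective slots.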
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
final Writable value = values.next().get(); // unwrap
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
final CrawlDatum datum = (CrawlDatum)value;
if (CrawlDatum.hasDbStatus(datum)) {
dbDatum = datum;
}
else if (CrawlDatum.hasFetchStatus(datum)) {
// don't index unmodified (empty) pages
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
fetchDatum = datum;
// Delete gone pages (e.g. 404 Not Found) and permanent redirects (e.g. 301) if requested.
if (delete) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
}
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
continue;
} else {
throw new RuntimeException("Unexpected status: " + datum.getStatus());
}
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
// Handle the robots meta tag, see https://issues.apache.org/jira/browse/NUTCH-1434
if (deleteRobotsNoIndex) {
// Get the robots meta data
String robotsMeta = parseData.getMeta("robots");
// Does it contain a noindex directive for this URL?
if (robotsMeta != null && robotsMeta.toLowerCase().contains("noindex")) {
// Delete it!
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: "+value.getClass());
}
}
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {
return; // only have inlinks
}
// Skip DB_NOTMODIFIED pages if the indexer is configured to do so
if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
reporter.incrCounter("IndexerStatus", "Skipped", 1);
return;
}
if (!parseData.getStatus().isSuccess() ||
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
return;
}
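// Fetch and parse succeeded: build the document to be indexed.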
NutchDocument doc = new NutchDocument();
final Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
// add digest, used by dedup
doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
final Parse parse = new ParseImpl(parseText, parseData);
try {
// extract information from dbDatum and pass it to
// fetchDatum so that indexing filters can use it
final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (url != null) {
fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
}
// run indexing filters
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (final IndexingException e) {
if (LOG.isWarnEnabled()) { LOG.warn("Error indexing " + key + ": " + e); }
reporter.incrCounter("IndexerStatus", "Errors", 1);
return;
}
// skip documents discarded by indexing filters
if (doc == null) {
reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
return;
}
float boost = 1.0f;
// run scoring filters
try {
boost = this.scfilters.indexerScore(key, doc, dbDatum,
fetchDatum, parse, inlinks, boost);
} catch (final ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Error calculating score " + key + ": " + e);
}
return;
}
// apply boost to all indexed fields.
doc.setWeight(boost);
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
reporter.incrCounter("IndexerStatus", "Documents added", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
output.collect(key, action);
}
public void close() throws IOException { }
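/**
* Configures the given job to run this class as mapper and reducer over the
* fetch and parse directories of each segment, the current CrawlDb and, if
* present, the LinkDb. Typically called by the indexing job before the job
* is submitted.
*/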
public static void initMRJob(Path crawlDb, Path linkDb,
Collection<Path> segments,
JobConf job) {
LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
if (linkDb != null) {
LOG.info("IndexerMapReduce: linkdb: " + linkDb);
}
for (final Path segment : segments) {
LOG.info("IndexerMapReduces: adding segment: " + segment);
FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
}
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
if (linkDb != null) {
FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));
}
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(IndexerMapReduce.class);
job.setReducerClass(IndexerMapReduce.class);
job.setOutputFormat(IndexerOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setMapOutputValueClass(NutchWritable.class);
job.setOutputValueClass(NutchWritable.class);
}
}