package org.apache.nutchbase.indexer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.mapred.TableMap;
import org.apache.hadoop.hbase.mapred.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexerOutputFormat;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexWriterFactory;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutchbase.util.hbase.ImmutableRowPart;
import org.apache.nutchbase.util.hbase.TableColumns;
import org.apache.nutchbase.util.hbase.TableUtil;
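
/**
 * Indexing job for the HBase-backed web table. Scans parsed rows from the
 * given table, runs them through the configured indexing filters and writes
 * the resulting NutchDocuments to a Lucene index via IndexerOutputFormat.
 * Usage: {@code IndexerHbase <index> <webtable>}
 */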
public class IndexerHbase extends MapReduceBase
    implements Tool,
               TableMap<ImmutableBytesWritable, ImmutableRowPart>,
               Reducer<ImmutableBytesWritable, ImmutableRowPart,
                       ImmutableBytesWritable, NutchDocument> {

  public static final Log LOG = LogFactory.getLog(IndexerHbase.class);

  /** Columns always read from the web table, on top of those requested by the filters. */
  private static final Set<String> COLUMNS = new HashSet<String>();

  private Configuration conf;
  private IndexingFiltersHbase filters;

  static {
    COLUMNS.add(TableColumns.SIGNATURE_STR);
    COLUMNS.add(TableColumns.PARSE_STATUS_STR);
    COLUMNS.add(TableColumns.SCORE_STR);
  }

  public Configuration getConf() {
    return conf;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public void configure(JobConf job) {
    filters = new IndexingFiltersHbase(job);
  }
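
  /**
   * Map phase: pass on only rows that were successfully parsed, skipping
   * rows without a parse status and redirects.
   */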
  public void map(ImmutableBytesWritable key, RowResult rowResult,
      OutputCollector<ImmutableBytesWritable, ImmutableRowPart> output,
      Reporter reporter) throws IOException {
    ImmutableRowPart row = new ImmutableRowPart(rowResult);
    if (!row.hasColumn(TableColumns.PARSE_STATUS)) {
      return;
    }
    ParseStatus pstatus = row.getParseStatus();
    if (!pstatus.isSuccess() ||
        pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
      return; // skip urls that were not parsed successfully, and redirects
    }
    output.collect(key, row);
  }
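
  /**
   * Reduce phase: build a NutchDocument from the row, run it through the
   * indexing filters and emit it for the index writer.
   */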
  public void reduce(ImmutableBytesWritable key,
      Iterator<ImmutableRowPart> values,
      OutputCollector<ImmutableBytesWritable, NutchDocument> output,
      Reporter reporter) throws IOException {
    ImmutableRowPart row = values.next();
    NutchDocument doc = new NutchDocument();
    doc.add("digest", StringUtil.toHexString(row.getSignature()));
    String url = TableUtil.unreverseUrl(Bytes.toString(key.get()));
    if (LOG.isTraceEnabled()) {
      LOG.trace("Indexing URL: " + url);
    }
    try {
      doc = filters.filter(doc, url, row);
    } catch (IndexingException e) {
      LOG.warn("Error indexing " + url + ": " + e);
      return;
    }
    // skip documents discarded by indexing filters
    if (doc == null) {
      return;
    }
    float boost = row.getScore();
    doc.setScore(boost);
    // store boost for use by explain and dedup
    doc.add("boost", Float.toString(boost));
    output.collect(key, doc);
  }
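
  /**
   * Returns the set of web table columns to scan: the indexer's own columns
   * plus any columns requested by the active indexing filters.
   */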
  private Set<String> getColumnSet(JobConf job) {
    Set<String> columnSet = new HashSet<String>(COLUMNS);
    IndexingFiltersHbase filters = new IndexingFiltersHbase(job);
    columnSet.addAll(filters.getColumnSet());
    return columnSet;
  }
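
  /**
   * Configures and runs the indexing job over the given web table, writing
   * the Lucene index to indexDir.
   */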
  public void index(Path indexDir, String table) throws IOException {
    LOG.info("IndexerHbase: starting");
    LOG.info("IndexerHbase: table: " + table);

    JobConf job = new NutchJob(getConf());
    job.setJobName("index " + table);
    TableMapReduceUtil.initTableMapJob(table,
        TableUtil.getColumns(getColumnSet(job)),
        IndexerHbase.class, ImmutableBytesWritable.class,
        ImmutableRowPart.class, job);
    job.setReducerClass(IndexerHbase.class);
    FileOutputFormat.setOutputPath(job, indexDir);
    job.setOutputFormat(IndexerOutputFormat.class);

    // bookkeeping fields are stored but not indexed
    LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.NO, job);
    LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.NO, job);
    LuceneWriter.addFieldOptions("boost", LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.NO, job);
    NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);

    JobClient.runJob(job);
    LOG.info("IndexerHbase: done");
  }
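
  /** Command-line entry point: {@code IndexerHbase <index> <webtable>}. */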
  public int run(String[] args) throws Exception {
    String usage = "Usage: IndexerHbase <index> <webtable>";
    if (args.length != 2) {
      System.err.println(usage);
      return -1;
    }
    index(new Path(args[0]), args[1]);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new IndexerHbase(), args);
    System.exit(res);
  }
}