package org.solbase.indexer;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.solbase.SolbaseUtil;
import org.solbase.indexer.mapreduce.SolbaseIndexReducer;
// Example scan query that fetches term vector rows - this is the actual query a search issues for a given term. The term in this example query is "logan".
//scan 'TV', {STARTROW=>"contents\x7F\x7F\x7F\x7Flogan\x7F\x7F\x7F\x7F\x00\x00\x00\x00", ENDROW=>"contents\x7F\x7F\x7F\x7Flogan\x7F\x7F\x7F\x7F\x7F\x7F\x7F\x7F"}
/**
 * Hadoop {@link Tool} that submits the Solbase initial-indexing MapReduce job.
 *
 * <p>The job scans a source HBase table (named by the first command-line
 * argument) using the {@link Scan} supplied by the configured
 * {@link SolbaseIndexUtil}, maps rows with {@code SolbaseInitialIndexMapper},
 * reduces with {@link SolbaseIndexReducer}, and writes through
 * {@link MultiTableOutputFormat}. Before submitting, it creates any Solbase
 * table that is not yet available.
 */
public class SolbaseIndexerTool implements Tool {

    /** Exit code returned when the MapReduce job completes successfully. */
    private static final int EXIT_SUCCESS = 0;

    /** Exit code returned when the MapReduce job reports failure. */
    private static final int EXIT_JOB_FAILED = 1;

    /** Exit code returned when the required command-line argument is missing. */
    private static final int EXIT_USAGE = -1;

    /**
     * Number of reduce tasks requested for the job.
     * Original sizing note: 1.75 * number of node (6) * max tasks (4) = 42, but seems small.
     */
    private static final int NUM_REDUCE_TASKS = 48;

    private Configuration conf;
    private SolbaseIndexUtil indexerUtil;

    /**
     * @param util supplies the scan used to read the source table; its class
     *             name is also published to the job configuration under the
     *             key {@code "indexerUtil"}
     */
    public SolbaseIndexerTool(SolbaseIndexUtil util) {
        this.indexerUtil = util;
    }

    /**
     * Serializes a scan with {@code Scan.write} and Base64-encodes the bytes.
     *
     * @param scan the scan to serialize
     * @return Base64 text of the serialized scan
     * @throws IOException if writing the scan fails
     */
    public String convertScanToString(Scan scan) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(out);
        scan.write(dos);
        return Base64.encodeBytes(out.toByteArray());
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Prepares the Solbase tables, then configures and runs the indexing job,
     * blocking until it finishes.
     *
     * @param arg0 command-line arguments; {@code arg0[0]} is the name of the
     *             source table to scan
     * @return {@code 0} if the job succeeded, {@code 1} if the job failed, or
     *         {@code -1} if the source table argument is missing
     * @throws Exception if table setup or job submission fails
     */
    @Override
    public int run(String[] arg0) throws Exception {
        if (arg0 == null || arg0.length < 1) {
            // Report a usage error instead of failing with an index-out-of-bounds.
            System.err.println("Usage: " + getClass().getSimpleName() + " <source table name>");
            return EXIT_USAGE;
        }

        if (this.conf == null) {
            this.conf = new Configuration();
        }

        String tableName = arg0[0];

        // for debugging, this will run local
        // in 0.90, this only works for mapping phase, after that, it tries to copy blocks to reduce and never finishes that phase
        //conf.set("mapred.job.tracker", "local");

        // set up tables ahead of MR job
        setupTables();

        Scan scan = indexerUtil.getScanner();

        // tablemapreduceutil way of doing it
        Job job = new Job(conf);

        // used to work fine without explicit resource addition of hbase to map/red in .89
        job.getConfiguration().addResource("hbase-site.xml");
        job.getConfiguration().set("indexerUtil", indexerUtil.getClass().getName());

        job.setJarByClass(org.solbase.indexer.mapreduce.SolbaseInitialIndexMapper.class);
        TableMapReduceUtil.initTableMapperJob(tableName, scan,
                org.solbase.indexer.mapreduce.SolbaseInitialIndexMapper.class,
                BytesWritable.class, MapWritable.class, job);

        // NOTE(review): the calls below are kept in their original order; presumably
        // each one can replace the job jar chosen by the previous call - confirm
        // against the Hadoop version in use before pruning any of them.
        job.setJarByClass(org.solbase.indexer.mapreduce.SolbaseIndexReducer.class);
        job.setJarByClass(org.solbase.lucenehbase.TermDocMetadata.class);
        job.setJarByClass(org.solbase.lucenehbase.IndexWriter.class);
        job.setJarByClass(org.solbase.SolbaseUtil.class);
        job.setJarByClass(org.solbase.indexer.writable.TermDocMetadataWritable.class);
        job.setJarByClass(org.solbase.indexer.writable.DocumentPutWritable.class);
        job.setJarByClass(org.apache.lucene.document.EmbeddedSortField.class);

        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(MapWritable.class);
        job.setReducerClass(SolbaseIndexReducer.class);
        job.setOutputFormatClass(MultiTableOutputFormat.class);
        job.setNumReduceTasks(NUM_REDUCE_TASKS);

        // Surface the job outcome to the caller instead of always reporting success.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? EXIT_SUCCESS : EXIT_JOB_FAILED;
    }

    /**
     * Creates each Solbase table that is not currently available. Tables that
     * already exist are left untouched (they are not truncated).
     *
     * @throws IOException if HBase cannot be reached or a table cannot be
     *                     checked or created; the failure is propagated so the
     *                     job is not submitted against missing tables
     */
    private void setupTables() throws IOException {
        // should be running on the cluster that has zoo.cfg or hbase-site.xml on hadoop/hbase classpath already.
        Configuration hbaseConf = HBaseConfiguration.create();
        // NOTE(review): this admin handle is never released; if the HBase version
        // in use exposes a close/release API on HBaseAdmin, call it here - confirm.
        HBaseAdmin admin = new HBaseAdmin(hbaseConf);

        if (!admin.isTableAvailable(SolbaseUtil.termVectorTable)) {
            SolbaseUtil.createTermVectorTable(null, null, null);
        }
        if (!admin.isTableAvailable(SolbaseUtil.termVectorVersionIDTable)) {
            SolbaseUtil.createTermVectorVersionIDTable();
        }
        if (!admin.isTableAvailable(SolbaseUtil.docKeyIdMapTable)) {
            SolbaseUtil.createDocKeyIdMapTable(null, null, null);
        }
        if (!admin.isTableAvailable(SolbaseUtil.docTable)) {
            SolbaseUtil.createDocTable(null, null, null);
        }
        if (!admin.isTableAvailable(SolbaseUtil.sequenceTable)) {
            SolbaseUtil.createSequenceTable();
        }
        if (!admin.isTableAvailable(SolbaseUtil.uniqChecksumUserMediaTable)) {
            SolbaseUtil.createUniqChecksumUserMediaTable(null, null, null);
        }
    }
}