package uk.bl.wa.hadoop.indexer;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import uk.bl.wa.apache.solr.hadoop.Solate;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.solr.WctEnricher;
import uk.bl.wa.solr.WctFields;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
@SuppressWarnings({ "deprecation" })
public class WARCIndexerReducer extends MapReduceBase implements
        Reducer<IntWritable, WritableSolrRecord, Text, Text> {

    private static Log log = LogFactory.getLog(WARCIndexerReducer.class);

    /** Target Solr endpoint: either a remote SolrWebServer or an embedded one per shard. */
    private SolrServer solrServer;
    /** Number of documents to buffer before submitting to Solr. */
    private int batchSize;
    /** When true, documents are logged but never submitted. */
    private boolean dummyRun;
    /** Buffer of documents awaiting submission. */
    private ArrayList<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    /** Consecutive failed submissions; used to decide when to drop a bad batch. */
    private int numberOfSequentialFails = 0;
    /** Back-off between failed Solr submissions, in minutes. */
    private static final int SUBMISSION_PAUSE_MINS = 5;
    /** Consecutive failures after which a batch is assumed to be bad data and dropped. */
    private static final int MAX_SEQUENTIAL_FAILS = 3;

    private FileSystem fs;
    private Path solrHomeDir = null;
    private Path outputDir;
    private String shardPrefix = "shard";
    private boolean useEmbeddedServer = false;
    private boolean exportXml = false;

    /** Hadoop counters reported by this reducer. */
    static enum MyCounters {
        NUM_RECORDS, NUM_ERRORS, NUM_DROPPED_RECORDS
    }

    /**
     * Applies the log4j override configuration, if one is present on the
     * classpath. Missing configuration is logged but is not fatal.
     */
    public WARCIndexerReducer() {
        try {
            // FIX: getResourceAsStream() returns null when the resource is
            // absent; the original passed that straight to Properties.load(),
            // which throws an (uncaught) NullPointerException.
            InputStream in = getClass().getResourceAsStream(
                    "/log4j-override.properties");
            if (in == null) {
                log.warn("No /log4j-override.properties found on the classpath.");
            } else {
                Properties props = new Properties();
                props.load(in);
                PropertyConfigurator.configure(props);
            }
        } catch (IOException e1) {
            // FIX: attach the exception so the root cause is recorded.
            log.error("Failed to load log4j config from properties file.", e1);
        }
    }

    /**
     * Sets up our SolrServer. Presumes the existence of either
     * "warc.solr.zookepers" or "warc.solr.servers" in the config.
     */
    @Override
    public void configure(JobConf job) {
        log.info("Configuring reducer, including Solr connection...");
        // Get config from job property:
        Config conf = ConfigFactory.parseString(job
                .get(WARCIndexerRunner.CONFIG_PROPERTIES));
        this.dummyRun = conf.getBoolean("warc.solr.dummy_run");
        this.batchSize = conf.getInt("warc.solr.batch_size");
        this.useEmbeddedServer = conf.getBoolean("warc.solr.hdfs");
        if (job.get("mapred.output.oai-pmh") != null)
            this.exportXml = Boolean.parseBoolean(job
                    .get("mapred.output.oai-pmh"));
        // Decide between to-HDFS and to-SolrCloud indexing modes:
        if (this.useEmbeddedServer) {
            initEmbeddedServerConfig(job, conf);
        } else {
            solrServer = new SolrWebServer(conf).getSolrServer();
        }
        log.info("Initialisation complete.");
    }

    /**
     * Resolves the filesystem, Solr home and output directory needed by the
     * embedded (to-HDFS) indexing mode. The server itself is created lazily,
     * per reduce key, by {@link #initEmbeddedServer(int)}.
     */
    private void initEmbeddedServerConfig(JobConf job, Config conf) {
        try {
            // Filesystem (cache disabled so each task gets its own instance):
            job.setBoolean("fs.hdfs.impl.disable.cache", true);
            fs = FileSystem.get(job);
            // Input:
            solrHomeDir = Solate.findSolrConfig(job,
                    WARCIndexerRunner.solrHomeZipName);
            log.info("Found solrHomeDir " + solrHomeDir);
        } catch (IOException e) {
            // FIX: route the stack trace through the logger instead of
            // printStackTrace(), so it ends up in the task logs.
            log.error("FAILED in reducer configuration: " + e, e);
        }
        // Output:
        outputDir = new Path(conf.getString(SolrWebServer.HDFS_OUTPUT_PATH));
    }

    /**
     * Fires up an embedded Solr server writing to the shard directory for the
     * given slice.
     *
     * @param slice 1-based shard number, used to name the output directory
     * @throws IOException if the embedded server cannot be created
     */
    private void initEmbeddedServer(int slice) throws IOException {
        // Define the output directory accordingly:
        Path outputShardDir = new Path(fs.getHomeDirectory() + "/" + outputDir,
                this.shardPrefix + slice);
        // Fire up a server:
        solrServer = Solate.createEmbeddedSolrServer(solrHomeDir, fs,
                outputDir, outputShardDir);
    }

    /**
     * Indexes all records for one shard key: enriches WCT records, batches
     * documents into Solr, optionally emits slash-page records as XML, and
     * (in embedded mode) commits and shuts the shard down at the end.
     */
    @Override
    public void reduce(IntWritable key, Iterator<WritableSolrRecord> values,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        WctEnricher wct;
        WritableSolrRecord wsr;
        SolrRecord solr;
        // Get the slice number, but counting from 1 instead of 0:
        int slice = key.get() + 1;
        // For indexing into HDFS, set up a new server per key:
        if (useEmbeddedServer) {
            this.initEmbeddedServer(slice);
        }
        // Go through the documents for this shard:
        long noValues = 0;
        while (values.hasNext()) {
            wsr = values.next();
            solr = wsr.getSolrRecord();
            noValues++;
            // Add additional metadata for WCT Instances.
            if (solr.containsKey(WctFields.WCT_INSTANCE_ID)) {
                wct = new WctEnricher(key.toString());
                wct.addWctMetadata(solr);
            }
            if (!dummyRun) {
                docs.add(solr.getSolrDocument());
                // Have we exceeded the batchSize?
                checkSubmission(docs, batchSize, reporter);
            } else {
                log.info("DUMMY_RUN: Skipping addition of doc: "
                        + solr.getField("id").getFirstValue());
            }
            // Occasionally update application-level status:
            if ((noValues % 1000) == 0) {
                reporter.setStatus(this.shardPrefix
                        + slice
                        + ": processed "
                        + noValues
                        + ", dropped "
                        + reporter.getCounter(MyCounters.NUM_DROPPED_RECORDS)
                                .getValue());
            }
            // Optionally emit slash-page (homepage) records as OAI-PMH XML:
            if (this.exportXml
                    && solr.getSolrDocument().getFieldValue(
                            SolrFields.SOLR_URL_TYPE) != null
                    && solr.getSolrDocument()
                            .getFieldValue(SolrFields.SOLR_URL_TYPE)
                            .equals(
                                    SolrFields.SOLR_URL_TYPE_SLASHPAGE)) {
                output.collect(
                        new Text(""),
                        new Text(MetadataBuilder.SolrDocumentToElement(solr
                                .getSolrDocument())));
            }
        }
        try {
            /**
             * If we have at least one document unsubmitted, make sure we submit
             * it.
             */
            checkSubmission(docs, 1, reporter);
            // If we are indexing to HDFS, shut the shard down:
            if (useEmbeddedServer) {
                // Commit, and block until the changes have been flushed.
                solrServer.commit(true, false);
                // And shut it down.
                solrServer.shutdown();
            }
        } catch (Exception e) {
            // FIX: log the throwable itself rather than printStackTrace().
            log.error("ERROR on commit: " + e, e);
        }
    }

    @Override
    public void close() {
    }

    /**
     * Checks whether a List of docs has exceeded a given limit and if so,
     * submits them. On repeated failure the batch is assumed to be bad data
     * and dropped, so that one poisoned batch cannot block job completion.
     *
     * @param docs buffered documents; cleared on successful submission or drop
     * @param limit submit when the buffer reaches this size (use 1 to flush)
     * @param reporter used for progress and counters
     */
    private void checkSubmission(List<SolrInputDocument> docs, int limit,
            Reporter reporter) {
        if (docs.size() > 0 && docs.size() >= limit) {
            try {
                // Inform that there is progress (still-alive):
                reporter.progress();
                // Add the documents:
                UpdateResponse response = solrServer.add(docs);
                log.info("Submitted " + docs.size() + " docs ["
                        + response.getStatus() + "]");
                // Update document counter:
                reporter.incrCounter(MyCounters.NUM_RECORDS, docs.size());
                // All good:
                docs.clear();
                numberOfSequentialFails = 0;
            } catch (Exception e) {
                // Count up repeated fails:
                numberOfSequentialFails++;
                // If there have been a lot of fails, drop the records
                // (we have seen some "Invalid UTF-8 character 0xfffe at char"
                // so this avoids bad data blocking job completion)
                if (this.numberOfSequentialFails >= MAX_SEQUENTIAL_FAILS) {
                    log.error("Submission has repeatedly failed - assuming bad data and dropping these "
                            + docs.size() + " records.");
                    reporter.incrCounter(MyCounters.NUM_DROPPED_RECORDS,
                            docs.size());
                    docs.clear();
                    // FIX: reset the counter after dropping, otherwise every
                    // subsequent batch would be dropped after a single failure
                    // rather than getting the full number of retries.
                    numberOfSequentialFails = 0;
                }
                // SOLR-5719 possibly hitting us here;
                // CloudSolrServer.RouteException
                log.error("Sleeping for " + SUBMISSION_PAUSE_MINS
                        + " minute(s): " + e.getMessage(), e);
                // Also add a report for this condition:
                reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
                try {
                    Thread.sleep(1000 * 60 * SUBMISSION_PAUSE_MINS);
                } catch (InterruptedException ex) {
                    log.warn("Sleep between Solr submissions was interrupted!");
                    // FIX: restore the interrupt status so the task framework
                    // can observe the interruption.
                    Thread.currentThread().interrupt();
                }
            }
        }
    }
}