package edu.isi.dig.elasticsearch.mapreduce.inputformat;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import net.sf.json.JSONSerializer;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Hadoop RecordReader that streams documents out of an Elasticsearch index by
 * issuing paged {@code _search} range queries over the {@code timestamp} field.
 * Keys are {@link NullWritable}; values are the raw JSON hit as {@link Text}.
 *
 * Not thread-safe: a single instance is driven by one map task.
 */
public class ESRecordReader extends RecordReader<Writable, Writable> {

    private static final Logger LOG = LoggerFactory.getLogger(ESRecordReader.class);

    // Connection / query settings, populated from the job configuration in initialize().
    private String esHost = null;
    private String esPort = null;
    private String esIndex = null;
    private String esDocType = null;
    private String esUser = null;
    private String esPassword = null;
    private String startTimestamp = null;
    private String endTimeStamp = null;
    private String batchSize = null;
    private String esProtocol = null;

    // Current page of "hits" from the last _search call, and the cursor into it.
    private JSONArray results = null;
    private int resultsIndex = -1;

    // Total matching documents reported by ES; -1 until the first query returns.
    private int totalHits = -1;

    // Offset ("from") of the next page to request.
    private int fromIndex = 0;

    CloseableHttpClient httpClient = null;

    /**
     * Converts a UTC timestamp of the form {@code yyyy-MM-dd'T'HH:mm'Z'} to
     * epoch milliseconds.
     *
     * @param timeStamp the timestamp string to parse
     * @return epoch milliseconds, or -1 when the string cannot be parsed
     */
    public long timestampToEpoch(String timeStamp) {
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is
            // created per call rather than shared.
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
            sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
            Date formattedDate = sdf.parse(timeStamp);
            return formattedDate.getTime();
        } catch (ParseException pe) {
            LOG.error("Date Parse Exception:" + pe.getMessage());
            return -1L;
        }
    }

    @Override
    public void close() throws IOException {
        // httpClient is null when initialize() never ran or the configured
        // protocol was unrecognized; guard so close() never throws an NPE.
        if (httpClient != null) {
            httpClient.close();
        }
    }

    @Override
    public Writable getCurrentKey() throws IOException, InterruptedException {
        // Records are keyed by position only; there is no meaningful key.
        return NullWritable.get();
    }

    @Override
    public Writable getCurrentValue() throws IOException, InterruptedException {
        // Only hand back a value when the cursor points at a real hit.
        if (results != null && resultsIndex >= 0 && resultsIndex < results.size()) {
            return new Text(results.get(resultsIndex).toString());
        }
        return null;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Before the first query returns we know nothing about the total.
        if (totalHits <= 0) {
            return 0;
        }
        // fromIndex is the offset of the next page, i.e. an upper bound on the
        // number of hits consumed so far; clamp to 1.0 for the final page.
        return Math.min(1.0f, (float) fromIndex / (float) totalHits);
    }

    /**
     * Reads all connection parameters from the job configuration and builds an
     * HTTP client (trusting self-signed certificates when HTTPS is requested).
     */
    @Override
    public void initialize(InputSplit rawSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        Configuration jobConfig = context.getConfiguration();
        esHost = jobConfig.get("elasticsearch.hostname");
        esPort = jobConfig.get("elasticsearch.port");
        batchSize = jobConfig.get("elasticsearch.batchsize");
        esIndex = jobConfig.get("elasticsearch.index");
        esUser = jobConfig.get("elasticsearch.username");
        esDocType = jobConfig.get("elasticsearch.doctype");
        esPassword = jobConfig.get("elasticsearch.password");
        startTimestamp = jobConfig.get("elasticsearch.starttimestamp");
        endTimeStamp = jobConfig.get("elasticsearch.endtimestamp");
        esProtocol = jobConfig.get("elasticsearch.protocol");
        SSLContextBuilder builder = new SSLContextBuilder();
        SSLConnectionSocketFactory sslsf = null;
        try {
            // Accept self-signed certificates; the ES endpoint may not have a
            // CA-signed certificate in this deployment.
            builder.loadTrustMaterial(null, new TrustSelfSignedStrategy());
            sslsf = new SSLConnectionSocketFactory(builder.build());
        } catch (Exception e) {
            LOG.error(e.getMessage());
        }
        // Constant-first comparison avoids an NPE when the protocol key is
        // absent from the configuration; unknown values fall back to HTTP so
        // httpClient is never left null (close()/nextKeyValue() depend on it).
        if ("https".equalsIgnoreCase(esProtocol)) {
            httpClient = HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } else {
            if (!"http".equalsIgnoreCase(esProtocol)) {
                LOG.warn("Unrecognized elasticsearch.protocol '" + esProtocol
                        + "'; defaulting to http");
                esProtocol = "http";
            }
            httpClient = HttpClients.createDefault();
        }
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Serve the next hit from the current page when one remains.
        if (results != null && resultsIndex < results.size() - 1) {
            resultsIndex++;
            return true;
        }
        // Page exhausted (or never fetched). totalHits == -1 means the first
        // query has not been issued yet. Using '<' (not '<=') avoids one
        // guaranteed-empty request when totalHits is a multiple of batchSize.
        if (totalHits == -1 || fromIndex < totalHits) {
            return fetchNextBatch();
        }
        return false;
    }

    /** Builds the JSON body of the paged range query over "timestamp". */
    private String buildRangeQuery() {
        String startTimeEpoch = String.valueOf(timestampToEpoch(startTimestamp));
        String endTimeEpoch = String.valueOf(timestampToEpoch(endTimeStamp));
        return "{ " +
                "\"query\": {" +
                "\"range\" : {" +
                "\"timestamp\" : {" +
                "\"gte\" :" + startTimeEpoch + "," +
                "\"lte\" : " + endTimeEpoch + "}}}," +
                "\"size\": " + batchSize + "," +
                "\"from\": " + String.valueOf(fromIndex) + "," +
                "\"sort\": [{" + "\"_uid\": {" + "\"order\":" + "\"asc\"}}] " +
                "}";
    }

    /**
     * Issues one paged {@code _search} request and loads the returned hits
     * into {@link #results}, advancing {@link #fromIndex}.
     *
     * @return true when a non-empty page was loaded, false otherwise
     * @throws IOException when the HTTP call fails
     */
    private boolean fetchNextBatch() throws IOException {
        // NOTE(review): credentials are embedded in the request URL; consider
        // an HttpClient CredentialsProvider so the password cannot leak into
        // logs or proxies.
        HttpPost httpPost = new HttpPost(esProtocol + "://" + esUser + ":" + esPassword + "@"
                + esHost + ":" + esPort + "/" + esIndex + "/" + esDocType + "/_search");
        StringEntity entity = new StringEntity(buildRangeQuery(), "UTF-8");
        entity.setContentType("application/json");
        httpPost.setEntity(entity);
        HttpResponse httpResp = httpClient.execute(httpPost);
        if (httpResp.getStatusLine().getStatusCode() != 200) {
            LOG.error("Unable to complete query: " + httpResp.getStatusLine().getReasonPhrase());
            return false;
        }
        String resultEntity;
        // try-with-resources closes the stream so the pooled connection is
        // released (IOUtils.toString does not close its argument).
        try (InputStream in = httpResp.getEntity().getContent()) {
            resultEntity = IOUtils.toString(in, "UTF-8");
        }
        JSONObject termQueryResponse = (JSONObject) JSONSerializer.toJSON(resultEntity);
        if (!termQueryResponse.containsKey("hits")) {
            return false;
        }
        JSONObject jHitsObject = termQueryResponse.getJSONObject("hits");
        totalHits = Integer.parseInt(jHitsObject.get("total").toString());
        if (totalHits > 0 && jHitsObject.containsKey("hits")) {
            JSONArray page = jHitsObject.getJSONArray("hits");
            // Only report success when the page actually contains documents;
            // the previous code returned true on an empty page, which made
            // getCurrentValue() hand back null for a "valid" record.
            if (page.size() > 0) {
                fromIndex += Integer.parseInt(batchSize);
                results = page;
                resultsIndex = 0;
                return true;
            }
        }
        return false;
    }
}