package com.amazonaws.bigdatablog.edba;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsResult;
import com.amazonaws.services.elasticmapreduce.model.ClusterState;
import com.amazonaws.services.elasticmapreduce.model.ClusterSummary;
import com.amazonaws.services.elasticmapreduce.model.DescribeClusterRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeStepRequest;
import com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import com.amazonaws.services.elasticmapreduce.model.ListClustersRequest;
import com.amazonaws.services.elasticmapreduce.model.ListClustersResult;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.elasticmapreduce.model.StepState;
import com.amazonaws.services.elasticmapreduce.model.Tag;
import com.amazonaws.services.lambda.runtime.Context;
import com.amazonaws.services.lambda.runtime.events.S3Event;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.event.S3EventNotification.S3EventNotificationRecord;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.util.StringUtils;
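/**
 * Container for the Lambda functions of the event-driven batch analytics
 * pipeline: input validation/normalization, tracking of validated files,
 * EMR job criteria checking and step submission, and EMR step monitoring.
 * The first two handlers are S3-triggered (S3Event, Context); the latter two
 * take no event and are suited to scheduled invocation.
 */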
public class LambdaContainer {
    // Validation/Conversion Layer function: reads each tab-delimited input file
    // from S3 (lines starting with '-' are treated as comments), drops a header
    // row if one is detected, and writes a normalized CSV copy under the
    // validated/ prefix of the same bucket.
    public void validateAndNormalizeInputData(S3Event event, Context ctx) throws Exception {
        AmazonS3 s3Client = new AmazonS3Client();
        for (S3EventNotificationRecord record : event.getRecords()) {
            String eventFileName = record.getS3().getObject().getKey();
            S3Object s3Object = s3Client.getObject(new GetObjectRequest(
                    record.getS3().getBucket().getName(), eventFileName));
            CSVParser fileParser = new CSVParser(new InputStreamReader(s3Object.getObjectContent()),
                    CSVFormat.TDF.withCommentMarker('-'));
            List<CSVRecord> records = fileParser.getRecords();
            fileParser.close();
            // Heuristic header detection: drop the first record if any of its
            // fields contains a non-numeric character. (Matching against
            // CSVRecord.toString(), as the original code did, always succeeds
            // because toString() includes formatting characters.)
            if (!records.isEmpty()) {
                boolean looksLikeHeader = false;
                for (String field : records.get(0)) {
                    if (field.matches(".*[^0-9].*")) {
                        looksLikeHeader = true;
                        break;
                    }
                }
                if (looksLikeHeader) {
                    records.remove(0);
                }
            }
            StringWriter writer = new StringWriter();
            CSVPrinter printer = new CSVPrinter(writer,
                    CSVFormat.DEFAULT.withRecordSeparator(System.getProperty("line.separator")));
            printer.printRecords(records);
            printer.flush();
            printer.close();
            byte[] outputBytes = writer.toString().getBytes("utf-8");
            ObjectMetadata metadata = new ObjectMetadata();
            metadata.setContentLength(outputBytes.length);
            InputStream readableDataStream = new ByteArrayInputStream(outputBytes);
            s3Client.putObject(record.getS3().getBucket().getName(),
                    "validated/" + eventFileName + ".csv", readableDataStream, metadata);
            readableDataStream.close();
        }
    }
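    // Note: the normalized copy lands under the validated/ prefix of the same
    // bucket, so the S3 trigger for validateAndNormalizeInputData should be
    // scoped to the raw-input prefix (or a separate bucket) to avoid the
    // function re-triggering on its own output.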
    // Tracking/Input Layer function: records each validated file in the audit
    // table. The sql.auditValidatedFile statement is expected to take three
    // parameters: the file URL and two status values.
    public void auditValidatedFile(S3Event event, Context ctx) throws Exception {
        Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
        PreparedStatement ps = conn.prepareStatement(props.getProperty("sql.auditValidatedFile"));
        for (S3EventNotificationRecord record : event.getRecords()) {
            String fileURL = record.getS3().getBucket().getName() + "/" + record.getS3().getObject().getKey();
            ps.setString(1, fileURL);
            ps.setString(2, "VALIDATED");
            ps.setString(3, "VALIDATED");
            ps.addBatch();
        }
        ps.executeBatch();
        ps.close();
        conn.close();
    }
    // EMR Job Criteria Check and Submission function: for each job configuration,
    // checks whether enough new input files have arrived since the last run (and
    // whether any additional SQL criteria pass) and, if so, fires an EMR step on
    // a randomly chosen active tagged cluster.
    public void checkConditionStatusAndFireEMRStep() throws Exception {
        Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
        Statement conditionFetchStmt = conn.createStatement();
        ResultSet rs = conditionFetchStmt.executeQuery(props.getProperty("sql.conditionFetch"));
        PreparedStatement updateJobConfigPS = conn.prepareStatement(props.getProperty("sql.updateJobConfigStatus"));
        Statement jobInputFilesMinTimestampStmt = conn.createStatement();
        Statement updateSubmittedJobsStmt = conn.createStatement();
        List<String> activeClusters = getActiveTaggedClusters();
        Random random = new Random();
        while (rs.next()) {
            System.out.println("job_input_pattern :: " + rs.getString("job_input_pattern"));
            System.out.println("sql.jobInputFilesMinTSAndCount :: " + props.getProperty("sql.jobInputFilesMinTSAndCount"));
            ResultSet conditionQueryResult = jobInputFilesMinTimestampStmt.executeQuery(
                    props.getProperty("sql.jobInputFilesMinTSAndCount") + " " + rs.getString("job_input_pattern"));
            conditionQueryResult.next();
            if (conditionQueryResult.getTimestamp("min_lvt").after(rs.getTimestamp("last_run_timestamp"))
                    && conditionQueryResult.getInt("file_count") >= rs.getInt("job_min_file_count")
                    && isAdditionalCriteriaPassed(rs.getString("job_addl_criteria"), conn)) {
                if (activeClusters.isEmpty()) {
                    throw new IllegalStateException("No active tagged EMR cluster available for job "
                            + rs.getString("job_config_id"));
                }
                String clusterId = activeClusters.get(random.nextInt(activeClusters.size()));
                String jobId = fireEMRJob(rs.getString("job_params"), clusterId);
                updateJobConfigPS.setString(1, clusterId + ":" + jobId);
                updateJobConfigPS.setString(2, rs.getString("job_config_id"));
                updateJobConfigPS.addBatch();
                updateSubmittedJobsStmt.addBatch(props.getProperty("sql.updateSubmittedJobsJSON")
                        .replaceAll("\\?", rs.getString("job_config_id")) + " " + rs.getString("job_input_pattern"));
            }
        }
        updateJobConfigPS.executeBatch();
        updateSubmittedJobsStmt.executeBatch();
        updateSubmittedJobsStmt.close();
        updateJobConfigPS.close();
        conditionFetchStmt.close();
        conn.close();
    }
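    // Column contract assumed by checkConditionStatusAndFireEMRStep above:
    // sql.conditionFetch must return job_config_id, job_input_pattern,
    // last_run_timestamp, job_min_file_count, job_addl_criteria, and job_params;
    // sql.jobInputFilesMinTSAndCount (suffixed with the job's input pattern)
    // must return min_lvt and file_count.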
    // EMR Job Monitor function: polls the state of each submitted step and
    // records terminal states (COMPLETED or FAILED) back in the job table.
    public void monitorEMRStep() throws Exception {
        Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
        ResultSet openStepsRS = conn.createStatement().executeQuery(props.getProperty("sql.retrieveOpenSteps"));
        AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
        DescribeStepRequest stepReq = new DescribeStepRequest();
        PreparedStatement ps = conn.prepareStatement(props.getProperty("sql.updateStepStatus"));
        while (openStepsRS.next()) {
            stepReq.setClusterId(openStepsRS.getString("cluster_id"));
            stepReq.setStepId(openStepsRS.getString("step_id"));
            String stepState = emr.describeStep(stepReq).getStep().getStatus().getState();
            // Only record terminal states; steps still pending or running are
            // skipped and picked up again on the next poll. (Batching a row
            // without setting the status parameter would otherwise fail.)
            if (stepState.equals(StepState.COMPLETED.toString())
                    || stepState.equals(StepState.FAILED.toString())) {
                ps.setString(1, stepState);
                ps.setString(2, openStepsRS.getString("job_config_id"));
                ps.addBatch();
            }
        }
        ps.executeBatch();
        ps.close();
        conn.close();
    }
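    // Illustrative only -- real values come from the job_params column: a value
    // such as "spark-submit,--class,com.example.SparkApp,s3://my-bucket/app.jar"
    // (class and jar names hypothetical) is split on commas by fireEMRJob below
    // and passed verbatim as command-runner.jar arguments.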
    // Adds an EMR step: job_params is a comma-separated argument list passed
    // to command-runner.jar on the target cluster.
    protected String fireEMRJob(String paramsStr, String clusterId) {
        AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
        emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));
        String[] params = paramsStr.split(",");
        HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
                .withJar("command-runner.jar")
                .withArgs(params);
        StepConfig sparkStep = new StepConfig()
                .withName("Spark Step")
                .withActionOnFailure("CONTINUE")
                .withHadoopJarStep(sparkStepConf);
        AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
                .withSteps(sparkStep);
        AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
        return result.getStepIds().get(0);
    }
    // Returns the IDs of WAITING clusters that carry the configured EDBA tag key.
    protected List<String> getActiveTaggedClusters() throws Exception {
        AmazonElasticMapReduceClient emrClient = new AmazonElasticMapReduceClient();
        List<String> waitingClusters = new ArrayList<String>();
        ListClustersResult clusterResult = emrClient.listClusters(
                new ListClustersRequest().withClusterStates(ClusterState.WAITING));
        DescribeClusterRequest describeRequest = new DescribeClusterRequest();
        for (ClusterSummary cluster : clusterResult.getClusters()) {
            System.out.println("list cluster id " + cluster.getId());
            List<Tag> tagList = emrClient.describeCluster(
                    describeRequest.withClusterId(cluster.getId())).getCluster().getTags();
            for (Tag tag : tagList) {
                if (tag.getKey().equals(props.getProperty("edba.cluster.tag.key"))) {
                    waitingClusters.add(cluster.getId());
                    break;
                }
            }
        }
        return waitingClusters;
    }
    /**
     * Checks whether the additional criteria query returns a non-empty result set.
     * A null or empty query counts as passed.
     */
    protected boolean isAdditionalCriteriaPassed(String sql, Connection conn) throws Exception {
        if (StringUtils.isNullOrEmpty(sql)) {
            return true;
        }
        Statement stmt = conn.createStatement();
        try {
            return stmt.executeQuery(sql).next();
        } finally {
            stmt.close();
        }
    }
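    // edba_lambda_config.properties must define every key referenced above; the
    // whole Properties object is also handed to the MySQL driver, so user and
    // password can be supplied as additional keys. A sketch with placeholder
    // values (the SQL statements must match your own schema):
    //
    //   url=jdbc:mysql://<host>:3306/<database>
    //   user=<db user>
    //   password=<db password>
    //   sql.auditValidatedFile=INSERT INTO ... VALUES (?, ?, ?)
    //   sql.conditionFetch=SELECT ...
    //   sql.jobInputFilesMinTSAndCount=SELECT ...
    //   sql.updateJobConfigStatus=UPDATE ...
    //   sql.updateSubmittedJobsJSON=UPDATE ...
    //   sql.retrieveOpenSteps=SELECT ...
    //   sql.updateStepStatus=UPDATE ...
    //   edba.cluster.tag.key=<tag key that marks EDBA clusters>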
    static Properties props = null;
    static {
        try {
            props = new Properties();
            props.load(LambdaContainer.class.getResourceAsStream("/edba_lambda_config.properties"));
        } catch (Exception e) {
            // Fail fast: every handler in this class depends on this configuration.
            throw new ExceptionInInitializerError(e);
        }
    }
}