package ml.shifu.shifu.combo;
import ml.shifu.shifu.container.obj.RawSourceData;
import ml.shifu.shifu.pig.PigExecutor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
/**
 * Generates, and optionally submits, a Pig script that joins several column files on one shared
 * key column.
 *
 * <p>The generated script loads every input file into its own relation, projects the wanted
 * fields, co-groups all relations on the key column, keeps at most one row per relation per key,
 * drops keys that are missing from any relation, flattens the remaining bags, renames the fields
 * and stores the result.
 *
 * Created by zhanhu on 12/13/16.
 */
public class PigDataJoin {

    private static final Logger LOG = LoggerFactory.getLogger(PigDataJoin.class);

    /** Prefix of the generated relation aliases ({@code data0}, {@code data1}, ...). */
    private static final String DATA_PREFIX = "data";

    /**
     * Generate the join script and submit it through {@link PigExecutor} with the HDFS source type.
     *
     * @param uidColumnName
     *            - the column to join on
     * @param outputPath
     *            - the output path for the joined file
     * @param columnFileList
     *            - the files (with their selected columns) to join
     * @throws IOException
     *             if the script cannot be generated or the job submission fails
     */
    public void join(String uidColumnName, String outputPath, List<ColumnFile> columnFileList) throws IOException {
        String pigCode = genPigJoinCode(uidColumnName, outputPath, columnFileList);
        // Parameterized logging: the message is only assembled when debug is enabled.
        LOG.debug("\n{}", pigCode);
        // Run pig code to merge data
        PigExecutor.getExecutor().submitJob(RawSourceData.SourceType.HDFS, pigCode);
    }

    /**
     * Generate pig code for data merge.
     *
     * <p>The input list is not validated here; an empty list produces a script whose group and
     * filter statements have no operands.
     *
     * @param uidColumnName
     *            - the column to join
     * @param outputPath
     *            - the output path for joined file; it is removed ({@code rmf}) before storing
     * @param columnFileList
     *            - the files to join; the i-th entry is loaded into relation {@code data<i>}
     * @return the complete pig script, one statement per line
     * @throws IOException
     *             if writing the script text fails
     */
    public String genPigJoinCode(String uidColumnName, String outputPath, List<ColumnFile> columnFileList)
            throws IOException {
        // Collect the script as characters. Going through a byte stream would encode and decode
        // with the platform default charset, which can corrupt non-ASCII paths or column names.
        StringWriter script = new StringWriter();
        BufferedWriter writer = new BufferedWriter(script);
        try {
            List<String> relations = new ArrayList<String>();
            int i = 0;
            for(ColumnFile columnFile: columnFileList) {
                String relation = DATA_PREFIX + (i++);
                writeLine(writer, relation + " = load '" + columnFile.getFilePath()
                        + "' using PigStorage('|', '-schema');");
                if(columnFile.hasSelectedVar(uidColumnName)) {
                    // The field selector is used as is (presumably it already covers the key column).
                    writeLine(writer,
                            relation + " = foreach " + relation + " generate " + columnFile.genFieldSelector() + ";");
                } else {
                    // Project the key column explicitly so the relation can still be grouped on it.
                    writeLine(writer, relation + " = foreach " + relation + " generate " + uidColumnName + " as "
                            + uidColumnName + ", " + columnFile.genFieldSelector() + ";");
                }
                relations.add(relation);
            }

            writeLine(writer, "result = group " + genGroupByClauses(relations, uidColumnName) + ";");
            writeLine(writer, "result = foreach result " + genLimitClauses(relations) + ";");
            writeLine(writer, "result = filter result by " + genFilterSizeClauses(relations) + ";");
            writeLine(writer, "result = foreach result generate " + genFlattenClauses(relations) + ";");
            writeLine(writer, "result = foreach result generate " + genRenameClauses(columnFileList, relations) + ";");
            writeLine(writer, "rmf " + outputPath + ";");
            writeLine(writer, "store result into '" + outputPath + "' using PigStorage('|', '-schema');");

            // Push buffered text into the StringWriter before it is read below.
            writer.flush();
        } catch (IOException e) {
            LOG.error("Fail to generate pig code for data merge.", e);
            throw e;
        } finally {
            IOUtils.closeQuietly(writer);
        }

        return script.toString();
    }

    /**
     * Generate filter by size clauses, requiring exactly one row from every relation.
     *
     * @param relations
     *            - Relation list
     * @return - pig filter condition, clauses joined by " and "
     */
    private String genFilterSizeClauses(List<String> relations) {
        List<String> filterByClauses = new ArrayList<String>();
        for(String relation: relations) {
            filterByClauses.add("SIZE(" + relation + ") == 1");
        }
        return StringUtils.join(filterByClauses, " and ");
    }

    /**
     * Generate group by clauses.
     *
     * @param relations
     *            - Relation list
     * @param uidColumnName
     *            - join columnName
     * @return - pig group list
     */
    private String genGroupByClauses(List<String> relations, String uidColumnName) {
        List<String> groupByClauses = new ArrayList<String>();
        for(String relation: relations) {
            groupByClauses.add(relation + " by " + uidColumnName);
        }
        return StringUtils.join(groupByClauses, ",");
    }

    /**
     * Generate the nested foreach block that limits every relation to 1 row after group-by.
     *
     * @param relations
     *            - Relation list
     * @return pig nested block: one limit statement per relation followed by the generate statement
     */
    private String genLimitClauses(List<String> relations) {
        List<String> limitsClauses = new ArrayList<String>();
        for(String relation: relations) {
            limitsClauses.add(relation + " = limit " + relation + " 1");
        }
        limitsClauses.add("generate group," + StringUtils.join(relations, ","));
        return "{ " + StringUtils.join(limitsClauses, ";") + "; }";
    }

    /**
     * Generate flatten clauses.
     *
     * @param relations
     *            - Relation list
     * @return - pig flatten list
     */
    private String genFlattenClauses(List<String> relations) {
        List<String> flattenClauses = new ArrayList<String>();
        for(String relation: relations) {
            flattenClauses.add("FLATTEN(" + relation + ")");
        }
        return StringUtils.join(flattenClauses, ",");
    }

    /**
     * Generate fields rename clauses that strip the {@code relation::} prefix from output fields.
     *
     * @param columnFileList
     *            - the joined files; entry i supplies the output variable names for relation i
     * @param relations
     *            - Relation list, index-aligned with columnFileList
     * @return - pig rename(as) list
     */
    private String genRenameClauses(List<ColumnFile> columnFileList, List<String> relations) {
        List<String> renameClauses = new ArrayList<String>();
        for(int i = 0; i < columnFileList.size(); i++) {
            ColumnFile columnFile = columnFileList.get(i);
            String relation = relations.get(i);
            List<String> outputVars = columnFile.getOutputVarNames();
            for(String var: outputVars) {
                renameClauses.add(relation + "::" + var + " as " + var);
            }
        }
        return StringUtils.join(renameClauses, ",");
    }

    /**
     * Write line followed by the platform line separator.
     *
     * @param writer
     *            - destination writer
     * @param line
     *            - text to write, without line terminator
     * @throws IOException
     *             if the writer fails
     */
    private void writeLine(BufferedWriter writer, String line) throws IOException {
        writer.write(line);
        writer.newLine();
    }
}