/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.cascading.assembly;
import cascading.pipe.Checkpoint;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.Rename;
import cascading.pipe.assembly.Retain;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.MixedJoin;
import cascading.pipe.joiner.OuterJoin;
import cascading.tuple.Fields;
import com.google.common.primitives.Booleans;
import hydrograph.engine.cascading.assembly.base.BaseComponent;
import hydrograph.engine.cascading.assembly.infra.ComponentParameters;
import hydrograph.engine.cascading.assembly.utils.JoinHelper;
import hydrograph.engine.cascading.filters.BlockAllFilter;
import hydrograph.engine.cascading.filters.JoinGetUnmatchedRecordsFilter;
import hydrograph.engine.cascading.filters.JoinOutLinkFilter;
import hydrograph.engine.cascading.filters.JoinUnusedLinkFilter;
import hydrograph.engine.core.component.entity.JoinEntity;
import hydrograph.engine.core.component.entity.elements.JoinKeyFields;
import hydrograph.engine.core.component.entity.elements.OutSocket;
import hydrograph.engine.utilities.ComponentHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
/**
* Join Component for joining two or more files.
*
* @author ganesha
*
*/
public class JoinAssembly extends BaseComponent<JoinEntity> {

    private static final long serialVersionUID = 1L;
    private static final Logger LOG = LoggerFactory
            .getLogger(JoinAssembly.class);

    /** Prefix of the per-input marker field; the input index is appended. */
    private static final String RECORD_PRESENT_INDICATOR = "flag";

    // NOTE(review): none of the instance fields below has an initializer.
    // joinEntity is only assigned in initializeEntity(), which is presumably
    // invoked from the BaseComponent constructor - confirm before adding any
    // field initializer here, since it would run after super() and could
    // overwrite state set during construction.

    /** Input fields of every input, each name prefixed with "<inputIndex>.". */
    private Fields[] uniqInputFields;
    /** Key fields of every input, each name prefixed with "<inputIndex>.". */
    private Fields[] uniqKeyFields;
    /** Renamed input pipes handed to the CoGroup. */
    private Pipe[] inputLinks;
    private Joiner joiner;
    private JoinHelper joinHelper;
    private JoinEntity joinEntity;
    /** True when at least one out socket of type "unused" is configured. */
    private Boolean isUnusedSocketPresent;
    /** isRecordRequired() of the key-field entry matched to each in socket. */
    private boolean[] joinTypes;

    public JoinAssembly(JoinEntity joinEntity, ComponentParameters compParams) {
        super(joinEntity, compParams);
    }

    /**
     * Builds the join: prepares the renamed input pipes, creates the
     * {@link CoGroup}, and registers the "out" link and (when configured) the
     * "unused" links. When unused sockets are present every input is tagged
     * with a record-present flag field and the join result is wrapped in a
     * {@link Checkpoint}.
     *
     * @throws RuntimeException
     *             wrapping any exception raised while building the assembly;
     *             the original exception is kept as the cause
     */
    @Override
    protected void createAssembly() {
        try {
            if (LOG.isTraceEnabled()) {
                LOG.trace(joinEntity.toString());
            }
            List<OutSocket> outSockets = getSocketForType(
                    joinEntity.getOutSocketList(), "out");
            if (outSockets.isEmpty()) {
                // fail with a meaningful message instead of a bare
                // IndexOutOfBoundsException from get(0)
                throw new IllegalStateException("Join component '"
                        + joinEntity.getComponentId()
                        + "' has no out socket of type 'out'");
            }
            OutSocket outSocket = outSockets.get(0);
            List<OutSocket> unusedSocket = getSocketForType(
                    joinEntity.getOutSocketList(), "unused");
            isUnusedSocketPresent = unusedSocket.size() > 0;

            prepare();
            if (isUnusedSocketPresent) {
                inputLinks = addRecordPresentIndicatorField(inputLinks,
                        RECORD_PRESENT_INDICATOR, 1);
            }

            if (LOG.isTraceEnabled()) {
                LOG.trace("Creating join assembly for '"
                        + joinEntity.getComponentId() + "' for socket: '"
                        + outSocket.getSocketId() + "' of type: '"
                        + outSocket.getSocketType() + "'");
            }
            Pipe join = new CoGroup(joinEntity.getComponentId()
                    + outSocket.getSocketId(), inputLinks, uniqKeyFields,
                    getJoinOutputFields(outSocket), joiner);
            setHadoopProperties(join.getStepConfigDef());

            // add checkpoint if unused port in use
            Pipe joinResult;
            if (isUnusedSocketPresent) {
                joinResult = new Checkpoint(join);
            } else {
                joinResult = join;
            }

            setOutLink(joinResult, outSocket);
            if (isUnusedSocketPresent) {
                setUnusedLinks(joinResult, unusedSocket);
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
            // keep the original exception as the cause so the stack trace
            // is not lost for callers
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Prepares the joiner for the join and initializes the input pipes, input
     * fields and key fields. Requires {@code isUnusedSocketPresent} to be set
     * beforehand: an {@link OuterJoin} is used when unused sockets exist,
     * otherwise a {@link MixedJoin} built from the per-input record-required
     * flags.
     */
    private void prepare() {
        joinHelper = new JoinHelper(componentParameters);
        uniqInputFields = new Fields[componentParameters.getInputFieldsList()
                .size()];
        // NOTE(review): uniqKeyFields and joinTypes are sized by the key-field
        // count but indexed by in-socket index below - this assumes one
        // key-field entry per in socket; confirm.
        uniqKeyFields = new Fields[joinEntity.getAllKeyFieldSize()];
        inputLinks = new Pipe[componentParameters.getInputPipes().size()];
        joinTypes = new boolean[joinEntity.getAllKeyFieldSize()];

        // NOTE(review): keyFields is intentionally left declared outside the
        // loop to preserve existing behavior; if an in socket has no matching
        // key-field entry, the (already prefixed) value from the previous
        // iteration is reused - confirm whether that can happen.
        Fields keyFields = new Fields();
        for (int i = 0; i < componentParameters.getinSocketId().size(); i++) {
            Fields originalInputFields = componentParameters
                    .getInputFieldsList().get(i);

            for (JoinKeyFields joinKeyFields : joinEntity.getKeyFields()) {
                if (joinKeyFields.getInSocketId().equalsIgnoreCase(
                        componentParameters.getinSocketId().get(i))) {
                    keyFields = new Fields(joinKeyFields.getFields());
                    joinTypes[i] = joinKeyFields.isRecordRequired();
                }
            }

            // rename fields. prefix with file index
            Fields inputFields = prefixWithInputIndex(originalInputFields, i);
            uniqInputFields[i] = inputFields;

            // rename key fields. prefix with file index
            keyFields = prefixWithInputIndex(keyFields, i);
            uniqKeyFields[i] = keyFields;

            Pipe inputLink = componentParameters.getInputPipes().get(i);
            inputLink = new Rename(inputLink, originalInputFields, inputFields);
            inputLink = new Pipe("link_" + i, inputLink);
            // retain only mapped fields and key fields
            // to be done
            inputLinks[i] = inputLink;
        }

        if (isUnusedSocketPresent) {
            joiner = new OuterJoin();
        } else {
            joiner = new MixedJoin(joinTypes);
        }
    }

    /**
     * Returns a copy of {@code fields} in which every field name is prefixed
     * with {@code "<inputIndex>."}, so that equally named fields coming from
     * different inputs do not clash.
     *
     * @param fields
     *            the fields to rename
     * @param inputIndex
     *            index of the input the fields belong to
     * @return the renamed fields
     */
    private static Fields prefixWithInputIndex(Fields fields, int inputIndex) {
        Fields prefixed = fields;
        for (int pos : fields.getPos()) {
            String name = prefixed.get(pos).toString();
            prefixed = prefixed.rename(new Fields(name), new Fields(inputIndex
                    + "." + name));
        }
        return prefixed;
    }

    /**
     * Fetches the List of {@link OutSocket} of a specific socket type from the
     * given sockets. The type comparison is case-insensitive. For type
     * "unused" the result is ordered by in-socket sequence (matching each
     * socket's copyOfInSocketId), and sockets that do not refer to any in
     * socket are left out.
     *
     * @param outSocketList
     *            the sockets to search
     * @param socketType
     *            the socket type to look for
     * @return List of {@link OutSocket}, possibly empty
     */
    private List<OutSocket> getSocketForType(List<OutSocket> outSocketList,
            String socketType) {
        List<OutSocket> matching = new ArrayList<OutSocket>();
        for (OutSocket candidate : outSocketList) {
            if (candidate.getSocketType().equalsIgnoreCase(socketType)) {
                matching.add(candidate);
            }
        }
        if (!"unused".equalsIgnoreCase(socketType)) {
            return matching;
        }

        // unused sockets are returned in the order of the in sockets they copy
        List<OutSocket> orderedByInSocket = new ArrayList<OutSocket>();
        for (int j = 0; j < componentParameters.getinSocketId().size(); j++) {
            for (OutSocket unused : matching) {
                if (unused.getCopyOfInSocketId().equalsIgnoreCase(
                        componentParameters.getinSocketId().get(j))) {
                    orderedByInSocket.add(unused);
                }
            }
        }
        return orderedByInSocket;
    }

    /**
     * Adds a constant field to every input pipe to mark that a record is
     * present. The field added to the pipe at index {@code n} is named
     * {@code fieldName + n}.
     *
     * @param <T>
     *            type of the constant value
     * @param inputPipes
     *            the pipes to extend
     * @param fieldName
     *            prefix of the field name; the pipe index is appended
     * @param constantValue
     *            value stored in the added field
     * @return an array of {@link Pipe}, one per input pipe, in the same order
     */
    private <T> Pipe[] addRecordPresentIndicatorField(Pipe[] inputPipes,
            String fieldName, T constantValue) {
        Pipe[] pipes = new Pipe[inputPipes.length];
        for (int fileNum = 0; fileNum < inputPipes.length; fileNum++) {
            pipes[fileNum] = ComponentHelper.addConstantField(
                    inputPipes[fileNum], fieldName + fileNum, constantValue);
        }
        return pipes;
    }

    /**
     * Appends the (index-prefixed) fields of all inputs. When unused sockets
     * are present, the record-present flag field of each input is appended
     * right after that input's fields.
     *
     * @param outSocket
     *            not used by the current implementation
     * @return appended input Fields
     */
    private Fields getJoinOutputFields(OutSocket outSocket) {
        Fields joinOutputFields = new Fields();
        for (int i = 0; i < uniqInputFields.length; i++) {
            Fields fields = uniqInputFields[i];
            if (isUnusedSocketPresent) {
                fields = fields.append(new Fields(RECORD_PRESENT_INDICATOR
                        + i));
            }
            joinOutputFields = joinOutputFields.append(fields);
        }
        return joinOutputFields;
    }

    /**
     * Collects the record-present flag fields of every input whose key-field
     * entry has {@code isRecordRequired()} set. Inputs without key fields
     * (null entry in {@code uniqKeyFields}) are skipped.
     *
     * @return appended record-present flag Fields of the inputs whose records
     *         are required
     */
    private Fields getOuterJoinFilesFlagFields() {
        boolean[] recordRequired = getAllJoinTypes();
        Fields flagFields = new Fields();
        for (int i = 0; i < uniqKeyFields.length; i++) {
            if (uniqKeyFields[i] == null) {
                continue;
            }
            if (recordRequired[i]) {
                flagFields = flagFields.append(new Fields(
                        RECORD_PRESENT_INDICATOR + i));
            }
        }
        return flagFields;
    }

    /**
     * Gets the join type (record required or not) in in-socket sequence.
     *
     * @return array holding, for each in-socket index, the
     *         {@code isRecordRequired()} value of the matching key-field entry
     */
    private boolean[] getAllJoinTypes() {
        boolean[] joinTypes = new boolean[joinEntity.getKeyFields().size()];
        for (int i = 0; i < componentParameters.getinSocketId().size(); i++) {
            for (JoinKeyFields keyFields2 : joinEntity.getKeyFields()) {
                if (keyFields2.getInSocketId().equalsIgnoreCase(
                        componentParameters.getinSocketId().get(i))) {
                    joinTypes[i] = keyFields2.isRecordRequired();
                }
            }
        }
        return joinTypes;
    }

    /**
     * Applies the output schema to the join result and registers it as the
     * OUT port. When unused sockets are present (so an {@link OuterJoin} was
     * performed) and at least one input is marked record-required, the join
     * result is first passed through {@link JoinOutLinkFilter} on the flag
     * fields of the required inputs.
     *
     * @param joinResult
     *            the joined pipe
     * @param outSocket
     *            the out socket to register
     */
    private void setOutLink(Pipe joinResult, OutSocket outSocket) {
        // if no input is record-required (full outer join) or no unused
        // socket exists, the join result is used unfiltered
        boolean anyRecordRequired = Booleans.contains(getAllJoinTypes(), true);
        Pipe outSource = joinResult;
        if (anyRecordRequired && isUnusedSocketPresent) {
            // Apply filter for join result as full outer join is performed
            // for getting results for UNUSED Links out of joined file
            outSource = new Each(joinResult, getOuterJoinFilesFlagFields(),
                    new JoinOutLinkFilter());
        }
        setOutLink(outSocket.getSocketType(), outSocket.getSocketId(),
                joinEntity.getComponentId(),
                applyOutPutSchema(outSource, outSocket),
                joinHelper.getMapTargetFields(outSocket));
    }

    /**
     * Registers one UNUSED port per unused out socket, in in-socket order.
     * Each port is built from the join result, reduced to the fields of the
     * input it copies and renamed back to that input's original field names.
     * When at least one input is record-required the result is filtered with
     * {@link JoinGetUnmatchedRecordsFilter} and {@link JoinUnusedLinkFilter};
     * otherwise {@link BlockAllFilter} is applied.
     *
     * @param joinResult
     *            the joined pipe
     * @param unusedOutSocket
     *            the out sockets of type "unused"
     */
    private void setUnusedLinks(Pipe joinResult, List<OutSocket> unusedOutSocket) {
        boolean anyRecordRequired = Booleans.contains(getAllJoinTypes(), true);

        // Records will be available at UNUSED PORT only if FULL OUTER join is
        // not applied
        Pipe allUnMatched;
        if (anyRecordRequired) {
            // get unUsed records from join result
            allUnMatched = new Each(joinResult, getOuterJoinFilesFlagFields(),
                    new JoinGetUnmatchedRecordsFilter());
        } else {
            allUnMatched = joinResult;
        }

        for (int i = 0; i < componentParameters.getinSocketId().size(); i++) {
            for (OutSocket unusedSocket : unusedOutSocket) {
                if (!unusedSocket.getCopyOfInSocketId().equalsIgnoreCase(
                        componentParameters.getinSocketId().get(i))) {
                    continue;
                }
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Creating join assembly for '"
                            + joinEntity.getComponentId() + "' for socket: '"
                            + unusedSocket.getSocketId() + "' of type: '"
                            + unusedSocket.getSocketType() + "'");
                }
                // renaming pipes to avoid pipe name conflict with OUT
                // PORT at tail
                Pipe unMatched = new Pipe(ComponentHelper.getComponentName(
                        "join", joinEntity.getComponentId(),
                        unusedSocket.getSocketId()), allUnMatched);
                Fields presentFlag = new Fields(RECORD_PRESENT_INDICATOR + i);
                if (anyRecordRequired) {
                    unMatched = new Each(unMatched, presentFlag,
                            new JoinUnusedLinkFilter());
                } else {
                    unMatched = new Each(unMatched, presentFlag,
                            new BlockAllFilter());
                }
                // restore the original schema of the copied input
                Fields copyOfInSocketFields = componentParameters
                        .getCopyOfInSocket(unusedSocket.getCopyOfInSocketId());
                unMatched = new Retain(unMatched, uniqInputFields[i]);
                unMatched = new Rename(unMatched, uniqInputFields[i],
                        copyOfInSocketFields);
                // must register all assembly tails
                setOutLink("unused", unusedSocket.getSocketId(),
                        joinEntity.getComponentId(), unMatched,
                        copyOfInSocketFields);
            }
        }
    }

    /**
     * Applies the output schema to the join result: renames the mapped
     * (index-prefixed) source fields to their target names and retains only
     * the target fields.
     *
     * @param joinResult
     *            the joined pipe
     * @param outSocket
     *            the out socket whose mapping is applied
     * @return Pipe carrying the output schema
     */
    private Pipe applyOutPutSchema(Pipe joinResult, OutSocket outSocket) {
        Fields sourceFields = getAllSourceFieldsWithFileIndexPrefix(outSocket);
        Fields targetFields = joinHelper.getMapTargetFields(outSocket);
        // Add output file scheme to join result
        Pipe outPort = new Rename(joinResult, sourceFields, targetFields);
        return new Retain(outPort, targetFields);
    }

    /**
     * Collects the mapped source fields of all in sockets for the given out
     * socket, each name prefixed with the index of its in socket. In sockets
     * for which the helper returns no source fields are skipped.
     *
     * @param outSocket
     *            the out socket whose mapping is read
     * @return combined, index-prefixed source Fields
     */
    private Fields getAllSourceFieldsWithFileIndexPrefix(OutSocket outSocket) {
        Fields combinedSourceFields = new Fields();
        for (int i = 0; i < componentParameters.getinSocketId().size(); i++) {
            Fields sourceFields = joinHelper.getMapSourceFields(
                    componentParameters.getinSocketId().get(i), outSocket);
            if (sourceFields == null) {
                continue;
            }
            // rename fields. prefix with file index
            combinedSourceFields = combinedSourceFields
                    .append(prefixWithInputIndex(sourceFields, i));
        }
        return combinedSourceFields;
    }

    /**
     * Stores the entity this assembly is built from.
     *
     * @param assemblyEntityBase
     *            the join entity
     */
    @Override
    public void initializeEntity(JoinEntity assemblyEntityBase) {
        this.joinEntity = assemblyEntityBase;
    }
}