/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.cascading.scheme;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.scheme.util.DelimitedParser;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import hydrograph.engine.hadoop.inputformat.DelimitedAndFixedWidthInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import java.beans.ConstructorProperties;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Calendar;
@SuppressWarnings("rawtypes")
public class TextDelimitedAndFixedWidth
extends
Scheme<Configuration, RecordReader, OutputCollector, Object[], Object[]> {
public enum Compress {
DEFAULT, ENABLE, DISABLE
}
protected char filler;
protected boolean strict;
protected boolean safe;
protected int[] lengths;
protected Type[] types;
protected String quote;
public static final String DEFAULT_CHARSET = "UTF-8";
private static final long serialVersionUID = 1L;
public static final Fields DEFAULT_SOURCE_FIELDS = new Fields("offset",
"line");
public static final char DEFAULT_FILLER = ' ';
public static final boolean DEFAULT_STRICT = false;
public static final boolean DEFAULT_SAFE = false;
private static final Type[] DEFAULT_TYPES = null;
private static final String DEFAULT_QUOTE = "";
Compress sinkCompression = Compress.DISABLE;
String charsetName = DEFAULT_CHARSET;
String[] lengthsAndDelimiters, lengthsAndDelimitersType;
Type[] typesOfLengthsAndDelimiters;
Calendar calender;
Text text;
Charset charset;
String line;
DelimitedParser delimitedParser;
StringBuilder sb = new StringBuilder();
String recordToBeSpilled = "";
boolean hasaNewLineField = true;
boolean isLastFieldNewLine = true;
@ConstructorProperties({ "sourceFields" })
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
DEFAULT_TYPES, DEFAULT_STRICT, DEFAULT_SAFE, DEFAULT_FILLER,
DEFAULT_CHARSET, DEFAULT_QUOTE);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
types, DEFAULT_STRICT, DEFAULT_SAFE, DEFAULT_FILLER,
DEFAULT_CHARSET, DEFAULT_QUOTE);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types, boolean strict) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
types, strict, DEFAULT_SAFE, DEFAULT_FILLER, DEFAULT_CHARSET,
DEFAULT_QUOTE);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types, boolean strict, boolean safe) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
types, strict, safe, DEFAULT_FILLER, DEFAULT_CHARSET,
DEFAULT_QUOTE);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types, boolean strict, boolean safe, String charsetName) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
types, strict, safe, DEFAULT_FILLER, charsetName, DEFAULT_QUOTE);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types, boolean strict, boolean safe, String charsetName,
String quote) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
types, strict, safe, DEFAULT_FILLER, charsetName, quote);
}
public TextDelimitedAndFixedWidth(Fields sourceFields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
String quote) {
this(sourceFields, lengthsAndDelimiters, typesOfLengthsAndDelimiters,
DEFAULT_TYPES, DEFAULT_STRICT, DEFAULT_SAFE, DEFAULT_FILLER,
DEFAULT_CHARSET, quote);
}
public TextDelimitedAndFixedWidth(Fields fields,
String[] lengthsAndDelimiters, Type[] typesOfLengthsAndDelimiters,
Type[] types, boolean strict, boolean safe, char filler,
String charsetName, String quote) {
super(fields, fields);
setCharsetName(charsetName);
// SonarQube: Constructors and methods receiving arrays should clone
// objects and store the copy. This prevents that future changes from
// the user affect the internal functionality
this.lengthsAndDelimiters = lengthsAndDelimiters.clone();
this.typesOfLengthsAndDelimiters = typesOfLengthsAndDelimiters.clone();
this.hasaNewLineField = DelimitedAndFixedWidthHelper
.hasaNewLineField(lengthsAndDelimiters);
this.isLastFieldNewLine = DelimitedAndFixedWidthHelper
.isLastFieldNewLine(lengthsAndDelimiters);
this.types = types == null ? null : types.clone();
this.strict = strict;
this.safe = safe;
this.lengthsAndDelimitersType = Arrays.toString(
typesOfLengthsAndDelimiters).split(",");
this.filler = filler;
this.quote = quote == null ? "" : quote;
}
public Compress getSinkCompression() {
return sinkCompression;
}
protected void setCharsetName(String charsetName) {
if (charsetName != null)
this.charsetName = charsetName;
Charset.forName(this.charsetName);
}
public void setSinkCompression(Compress sinkCompression) {
if (sinkCompression != null)
this.sinkCompression = sinkCompression;
}
private boolean hasZippedFiles(Path[] paths) {
if (paths == null || paths.length == 0)
return false;
boolean isZipped = paths[0].getName().endsWith(".zip");
for (int i = 1; i < paths.length; i++) {
if (isZipped != paths[i].getName().endsWith(".zip"))
throw new IllegalStateException(
"cannot mix zipped and upzipped files");
}
return isZipped;
}
@Override
public void sourceCleanup(FlowProcess<? extends Configuration> flowProcess,
SourceCall<Object[], RecordReader> sourceCall) {
sourceCall.setContext(null);
}
@SuppressWarnings("unchecked")
@Override
public boolean source(FlowProcess<? extends Configuration> flowProcess,
SourceCall<Object[], RecordReader> sourceCall) throws IOException {
if (!sourceCall.getInput().next(sourceCall.getContext()[0],
sourceCall.getContext()[1]))
return false;
Tuple tuple = sourceCall.getIncomingEntry().getTuple();
tuple.clear();
tuple.addAll(new Tuple(DelimitedAndFixedWidthHelper.getFields(
getSourceFields(), makeEncodedString(sourceCall.getContext()),
lengthsAndDelimiters, lengthsAndDelimitersType, types, safe, quote)));
return true;
}
@Override
public void sourcePrepare(FlowProcess<? extends Configuration> flowProcess,
SourceCall<Object[], RecordReader> sourceCall) {
if (sourceCall.getContext() == null)
sourceCall.setContext(new Object[3]);
sourceCall.getContext()[0] = sourceCall.getInput().createKey();
sourceCall.getContext()[1] = sourceCall.getInput().createValue();
sourceCall.getContext()[2] = Charset.forName(charsetName);
}
protected String makeEncodedString(Object[] context) {
Text temporary = (Text) context[1];
return temporary.toString();
}
@Override
public void sourceConfInit(
FlowProcess<? extends Configuration> flowProcess,
Tap<Configuration, RecordReader, OutputCollector> tap,
Configuration conf) {
if (hasZippedFiles(FileInputFormat
.getInputPaths(asJobConfInstance(conf))))
throw new IllegalStateException("cannot read zip files: "
+ Arrays.toString(FileInputFormat
.getInputPaths(asJobConfInstance(conf))));
conf.setBoolean("mapred.mapper.new-api", false);
conf.setClass("mapred.input.format.class",
DelimitedAndFixedWidthInputFormat.class, InputFormat.class);
conf.set("charsetName", charsetName);
conf.set("quote", quote);
conf.set("lengthsAndDelimiters", DelimitedAndFixedWidthHelper
.arrayToString(lengthsAndDelimiters));
conf.setStrings("lengthsAndDelimitersType", lengthsAndDelimitersType);
}
public static JobConf asJobConfInstance(Configuration configuration) {
if (configuration instanceof JobConf)
return (JobConf) configuration;
return new JobConf(configuration);
}
@Override
public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
Tap<Configuration, RecordReader, OutputCollector> tap,
Configuration conf) {
if (tap.getFullIdentifier(conf).endsWith(".zip"))
throw new IllegalStateException("cannot write zip files: "
+ getOutputPath(conf));
conf.setBoolean("mapred.mapper.new-api", false);
if (getSinkCompression() == Compress.DISABLE)
conf.setBoolean("mapred.output.compress", false);
else if (getSinkCompression() == Compress.ENABLE)
conf.setBoolean("mapred.output.compress", true);
conf.setClass("mapred.output.key.class", Text.class, Object.class);
conf.setClass("mapred.output.value.class", Text.class, Object.class);
conf.setClass("mapred.output.format.class", TextOutputFormat.class,
OutputFormat.class);
}
public static Path getOutputPath(Configuration conf) {
String name = conf.get("mapred.output.dir");
return name == null ? null : new Path(name);
}
@SuppressWarnings("unchecked")
@Override
public void sink(FlowProcess<? extends Configuration> flowProcess,
SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
text = (Text) sinkCall.getContext()[0];
charset = (Charset) sinkCall.getContext()[1];
sb.append(DelimitedAndFixedWidthHelper.createLine(sinkCall
.getOutgoingEntry().getTuple(), lengthsAndDelimiters,
lengthsAndDelimitersType, strict, filler, types, quote));
if (hasaNewLineField) {
recordToBeSpilled = DelimitedAndFixedWidthHelper
.spillOneLineToOutput(sb, lengthsAndDelimiters);
sinkCall.getOutput().collect(null, new Text(recordToBeSpilled));
sb = new StringBuilder(sb.toString().replace(recordToBeSpilled, "")
.trim());
}
}
@SuppressWarnings("unchecked")
@Override
public void sinkCleanup(FlowProcess<? extends Configuration> flowProcess,
SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
sinkCall.getOutput().collect(null, new Text(sb.toString()));
sb.setLength(0);
}
@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
sinkCall.setContext(new Object[3]);
sinkCall.getContext()[0] = new Text();
sinkCall.getContext()[1] = Charset.forName(charsetName);
}
}