/*
* Copyright [2012-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.core;
import ml.shifu.shifu.container.obj.EvalConfig;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.util.CommonUtils;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.JexlException;
import org.apache.commons.jexl2.MapContext;
import org.apache.commons.lang.StringUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
 * Evaluates a JEXL filter expression against individual data records.
 * <p>
 * The filter expression is taken from a {@link ModelConfig} or an {@link EvalConfig}. Each record's fields are
 * bound to the configured header names in a reusable JEXL context and the compiled expression is evaluated
 * against that context.
 * <p>
 * The context instance is shared between calls and is cleared and refilled on every evaluation, so a single
 * {@code DataPurifier} should not be used from several threads at once without external synchronization.
 */
public class DataPurifier {

    private static final Logger log = LoggerFactory.getLogger(DataPurifier.class);

    /** Column names used as variable names in the filter expression; only set when a filter is configured. */
    private String[] headers;

    /** Delimiter used to split raw text records; only set when a filter is configured. */
    private String dataDelimiter;

    /** Compiled filter expression; {@code null} when no filter is configured or it failed to compile. */
    private Expression dataFilterExpr;

    /** JEXL context reused across evaluations to avoid allocating a new map per record. */
    private final ShifuMapContext jc = new ShifuMapContext();

    /**
     * Builds a purifier from the filter expression of the given model configuration.
     * <p>
     * When the filter expression is blank nothing is initialised and every record is accepted. When the
     * expression cannot be compiled the error is logged and the purifier also accepts every record.
     *
     * @param modelConfig
     *            the model configuration providing filter expression, headers and delimiter
     * @throws IOException
     *             if the headers cannot be loaded
     */
    public DataPurifier(ModelConfig modelConfig) throws IOException {
        String filterExpressions = modelConfig.getFilterExpressions();
        if(StringUtils.isNotBlank(filterExpressions)) {
            this.dataFilterExpr = compileFilter(filterExpressions);
            this.headers = CommonUtils.getFinalHeaders(modelConfig);
            this.dataDelimiter = modelConfig.getDataSetDelimiter();
        }
    }

    /**
     * Builds a purifier from the filter expression of the given evaluation configuration's data set.
     * <p>
     * When the filter expression is blank nothing is initialised and every record is accepted. When the
     * expression cannot be compiled the error is logged and the purifier also accepts every record.
     *
     * @param evalConfig
     *            the evaluation configuration providing filter expression, headers and delimiter
     * @throws IOException
     *             if the headers cannot be loaded
     */
    public DataPurifier(EvalConfig evalConfig) throws IOException {
        String filterExpressions = evalConfig.getDataSet().getFilterExpressions();
        if(StringUtils.isNotBlank(filterExpressions)) {
            this.dataFilterExpr = compileFilter(filterExpressions);
            this.headers = CommonUtils.getFinalHeaders(evalConfig);
            this.dataDelimiter = evalConfig.getDataSet().getDataDelimiter();
        }
    }

    /**
     * Evaluates the filter expression against a delimited text record.
     * <p>
     * NOTE(review): {@code true} is returned when no filter is configured, so callers presumably treat
     * {@code true} as "record passes the filter" - confirm against the call sites.
     *
     * @param record
     *            the raw record, split with the configured data delimiter
     * @return {@code true} if no filter expression is available; {@code false} if the record cannot be split
     *         into exactly as many fields as there are headers; otherwise the boolean result of the
     *         expression, or {@code false} if evaluation fails or yields a non-boolean value
     */
    public Boolean isFilterOut(String record) {
        if(dataFilterExpr == null) {
            return true;
        }

        String[] fields = CommonUtils.split(record, dataDelimiter);
        if(fields == null || fields.length != headers.length) {
            // illegal format data, just skip
            return false;
        }

        jc.clear();
        for(int i = 0; i < fields.length; i++) {
            // missing text fields are exposed to the expression as empty strings
            jc.set(headers[i], (fields[i] == null) ? "" : fields[i]);
        }

        return evaluateFilter();
    }

    /**
     * Evaluates the filter expression against a Pig tuple.
     * <p>
     * NOTE(review): {@code true} is returned when no filter is configured, so callers presumably treat
     * {@code true} as "record passes the filter" - confirm against the call sites.
     *
     * @param input
     *            the tuple whose fields are bound, in order, to the configured headers
     * @return {@code true} if no filter expression is available; {@code false} if the tuple is {@code null} or
     *         its size differs from the number of headers; otherwise the boolean result of the expression, or
     *         {@code false} if evaluation fails or yields a non-boolean value
     * @throws ExecException
     *             if a field cannot be read from the tuple
     */
    public Boolean isFilterOut(Tuple input) throws ExecException {
        if(dataFilterExpr == null) {
            return true;
        }

        if(input == null || input.size() != headers.length) {
            // illegal format data, just skip
            return false;
        }

        jc.clear();
        for(int i = 0; i < input.size(); i++) {
            Object field = input.get(i);
            // unlike the text variant, null tuple fields stay null in the context
            jc.set(headers[i], (field == null) ? null : field.toString());
        }

        return evaluateFilter();
    }

    /**
     * Compiles a JEXL filter expression.
     *
     * @param filterExpressions
     *            the expression source text
     * @return the compiled expression, or {@code null} if the text is not a valid JEXL expression
     */
    private static Expression compileFilter(String filterExpressions) {
        try {
            return new JexlEngine().createExpression(filterExpressions);
        } catch (JexlException e) {
            log.error("The expression {} is invalid, please use correct expression.", filterExpressions, e);
            return null;
        }
    }

    /**
     * Evaluates the compiled expression against the current content of the shared context.
     *
     * @return the expression result when it is a {@link Boolean}; {@link Boolean#FALSE} when evaluation throws
     *         or the result is {@code null} or of another type
     */
    private Boolean evaluateFilter() {
        Object retObj = null;
        try {
            retObj = dataFilterExpr.evaluate(jc);
        } catch (Throwable e) {
            // best effort: any evaluation failure is treated as a non-matching record
            log.debug("Error occurred when trying to evaluate {}", dataFilterExpr, e);
        }
        return (retObj instanceof Boolean) ? (Boolean) retObj : Boolean.FALSE;
    }

    /**
     * A {@link MapContext} that can be emptied, so that one context instance can be reused for many records.
     */
    public static class ShifuMapContext extends MapContext {

        public ShifuMapContext() {
            super();
        }

        /**
         * Removes all variables from the underlying map, if there is one.
         */
        public void clear() {
            if(super.map != null) {
                map.clear();
            }
        }
    }
}