/* * Copyright [2012-2014] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.shifu.udf; import ml.shifu.shifu.container.obj.EvalConfig; import ml.shifu.shifu.core.DataPurifier; import ml.shifu.shifu.util.Constants; import org.apache.pig.data.Tuple; import org.apache.pig.tools.pigstats.PigStatusReporter; import java.io.IOException; /** * PurifyDataUDF class purify the data for training and evaluation. * The setting for purify is in in @ModelConfig.dataSet.filterExpressions or */ public class PurifyDataUDF extends AbstractTrainerUDF<Boolean> { private DataPurifier dataPurifier; public PurifyDataUDF(String source, String pathModelConfig, String pathColumnConfig) throws IOException { super(source, pathModelConfig, pathColumnConfig); dataPurifier = new DataPurifier(modelConfig); } public PurifyDataUDF(String source, String pathModelConfig, String pathColumnConfig, String evalSetName) throws IOException { super(source, pathModelConfig, pathColumnConfig); EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalSetName); dataPurifier = new DataPurifier(evalConfig); } /* * (non-Javadoc) * * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple) */ @SuppressWarnings("deprecation") @Override public Boolean exec(Tuple input) throws IOException { // update model run time for stats if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT")) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").increment(1); } Boolean filterOut = dataPurifier.isFilterOut(input); if(filterOut != null && filterOut) { // update model run time for stats if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT")) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT") .increment(1); } } return filterOut; } }