/*
* Copyright [2012-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.actor.worker;
import akka.actor.ActorRef;
import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.EvalConfig;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.core.DataPurifier;
import ml.shifu.shifu.message.NormPartRawDataMessage;
import ml.shifu.shifu.message.RunModelDataMessage;
import ml.shifu.shifu.message.StatsPartRawDataMessage;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
/**
* DataFilterWorker class is to filter data by setting.
* It do filtering by the setting in @ModelConfig.dataSet.filterExpressions or
*/
public class DataFilterWorker extends AbstractWorkerActor {
public static final Logger log = LoggerFactory.getLogger(DataFilterWorker.class);
private DataPurifier dataPurifier;
public DataFilterWorker(
ModelConfig modelConfig,
List<ColumnConfig> columnConfigList,
ActorRef parentActorRef,
ActorRef nextActorRef) throws IOException {
super(modelConfig, columnConfigList, parentActorRef, nextActorRef);
dataPurifier = new DataPurifier(modelConfig);
}
public DataFilterWorker(
ModelConfig modelConfig,
List<ColumnConfig> columnConfigList,
ActorRef parentActorRef,
ActorRef nextActorRef,
EvalConfig evalConfig) throws IOException {
super(modelConfig, columnConfigList, parentActorRef, nextActorRef);
dataPurifier = new DataPurifier(evalConfig);
}
/* (non-Javadoc)
* @see ml.shifu.shifu.actor.worker.AbstractWorkerActor#handleMsg(java.lang.Object)
*/
@Override
public void handleMsg(Object message) throws Exception {
if (message instanceof StatsPartRawDataMessage) {
StatsPartRawDataMessage msg = (StatsPartRawDataMessage) message;
purifyData(msg.getRawDataList());
nextActorRef.tell(msg, getSelf());
} else if (message instanceof NormPartRawDataMessage) {
NormPartRawDataMessage msg = (NormPartRawDataMessage) message;
purifyData(msg.getRawDataList());
nextActorRef.tell(msg, getSelf());
} else if (message instanceof RunModelDataMessage) {
RunModelDataMessage msg = (RunModelDataMessage) message;
purifyData(msg.getEvalDataList());
nextActorRef.tell(msg, getSelf());
} else {
unhandled(message);
}
}
/**
* Filter the data - it uses @dataPurifier to filter data
*
* @param inputDataList - input data to filter
*/
private void purifyData(List<String> inputDataList) {
log.info("starting to filter data ... ");
CollectionUtils.filter(inputDataList, new Predicate() {
@Override
public boolean evaluate(Object object) {
String inputData = (String) object;
return dataPurifier.isFilterOut(inputData);
}
});
log.info("there are {} records after filter.", inputDataList.size());
}
}