/*
* Copyright [2012-2015] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.core.autotype;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.container.obj.RawSourceData.SourceType;
import ml.shifu.shifu.core.DataPurifier;
import ml.shifu.shifu.util.CommonUtils;
import ml.shifu.shifu.util.Constants;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
/**
* {@link AutoTypeDistinctCountMapper} is a mapper to get {@link HyperLogLogPlus} statistics per split. Such statistics
* will be merged in our reducer.
*/
public class AutoTypeDistinctCountMapper extends Mapper<LongWritable, Text, IntWritable, CountAndFrequentItemsWritable> {
private final static Logger LOG = LoggerFactory.getLogger(AutoTypeDistinctCountMapper.class);
/**
* Model Config read from HDFS
*/
private ModelConfig modelConfig;
/**
* To filter records by customized expressions
*/
private DataPurifier dataPurifier;
/**
* Output key cache to avoid new operation.
*/
private IntWritable outputKey;
/**
* Using approximate method to estimate real frequent items and store into this map
*/
private Map<Integer, CountAndFrequentItems> variableCountMap;
/**
* Tag column index
*/
private int tagColumnNum = -1;
/**
* Column Config list read from HDFS
*/
private List<ColumnConfig> columnConfigList;
// cache tags in set for search
private Set<String> tags;
/**
* Missing or invalid values
*/
private Set<String> missingOrInvalidValues;
private void loadConfigFiles(final Context context) {
try {
SourceType sourceType = SourceType.valueOf(context.getConfiguration().get(
Constants.SHIFU_MODELSET_SOURCE_TYPE, SourceType.HDFS.toString()));
this.modelConfig = CommonUtils.loadModelConfig(
context.getConfiguration().get(Constants.SHIFU_MODEL_CONFIG), sourceType);
this.columnConfigList = CommonUtils.loadColumnConfigList(
context.getConfiguration().get(Constants.SHIFU_COLUMN_CONFIG), sourceType);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
loadConfigFiles(context);
this.dataPurifier = new DataPurifier(this.modelConfig);
loadTagWeightNum();
this.variableCountMap = new HashMap<Integer, CountAndFrequentItems>();
this.outputKey = new IntWritable();
this.tags = new HashSet<String>(modelConfig.getFlattenTags());
this.missingOrInvalidValues = new HashSet<String>(this.modelConfig.getDataSet().getMissingOrInvalidValues());
}
/**
* Load tag weight index field.
*/
private void loadTagWeightNum() {
for(ColumnConfig config: this.columnConfigList) {
if(config.isTarget()) {
this.tagColumnNum = config.getColumnNum();
break;
}
}
if(this.tagColumnNum == -1) {
throw new RuntimeException("No valid target column.");
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String valueStr = value.toString();
// StringUtils.isBlank is not used here to avoid import new jar
if(valueStr == null || valueStr.length() == 0 || valueStr.trim().length() == 0) {
LOG.warn("Empty input.");
return;
}
context.getCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").increment(1L);
if(!this.dataPurifier.isFilterOut(valueStr)) {
context.getCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").increment(1L);
return;
}
String[] units = CommonUtils.split(valueStr, this.modelConfig.getDataSetDelimiter());
// tagColumnNum should be in units array, if not IndexOutofBoundException
String tag = CommonUtils.trimTag(units[this.tagColumnNum]);
if(!this.tags.contains(tag)) {
if(System.currentTimeMillis() % 50 == 0L) {
LOG.warn("Data with invalid tag is ignored in distinct count computing, invalid tag: {}.", tag);
}
context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
return;
}
int i = 0;
for(String unit: units) {
CountAndFrequentItems countAndFrequentItems = this.variableCountMap.get(i);
if(countAndFrequentItems == null) {
countAndFrequentItems = new CountAndFrequentItems();
this.variableCountMap.put(i, countAndFrequentItems);
}
countAndFrequentItems.offer(this.missingOrInvalidValues, unit);
i++;
}
}
/**
* Write column info to reducer for merging.
*/
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
for(Map.Entry<Integer, CountAndFrequentItems> entry: this.variableCountMap.entrySet()) {
this.outputKey.set(entry.getKey());
byte[] bytes = entry.getValue().hyper.getBytes();
Set<String> frequentItems = entry.getValue().frequentItems;
context.write(this.outputKey, new CountAndFrequentItemsWritable(entry.getValue().count,
entry.getValue().invalidCount, entry.getValue().validNumCount, bytes, frequentItems));
}
}
public static class CountAndFrequentItems {
private final HyperLogLogPlus hyper = new HyperLogLogPlus(8);;
private final Set<String> frequentItems = new HashSet<String>();
private long count;
private long invalidCount;
private long validNumCount;
public void offer(Set<String> missingorInvalidValues, String unit) {
count += 1;
if(unit == null || missingorInvalidValues.contains(unit.toLowerCase())) {
invalidCount += 1;
return;
}
hyper.offer(unit);
try {
Double.parseDouble(unit);
validNumCount += 1;
} catch (NumberFormatException e) {
// ignore as only do stats on validNumCount
}
if(frequentItems.size() <= CountAndFrequentItemsWritable.FREQUET_ITEM_MAX_SIZE
&& !frequentItems.contains(unit)) {
frequentItems.add(unit);
}
}
/**
* @return the hyper
*/
public HyperLogLogPlus getHyper() {
return hyper;
}
/**
* @return the frequentItems
*/
public Set<String> getFrequentItems() {
return frequentItems;
}
/**
* @return the count
*/
public long getCount() {
return count;
}
/**
* @return the invalidCount
*/
public long getInvalidCount() {
return invalidCount;
}
/**
* @return the validNumCount
*/
public long getValidNumCount() {
return validNumCount;
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return "CountAndFrequentItems [hyper=" + hyper + ", frequentItems=" + frequentItems + ", count=" + count
+ ", invalidCount=" + invalidCount + ", validNumCount=" + validNumCount + "]";
}
}
}