/*
 * Copyright [2013-2015] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.core.autotype;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.io.Writable;

/**
 * A writable that bundles the serialized bytes of a HyperLogLogPlus instance with a small set of frequent items and
 * per-column counters.
 *
 * <p>
 * {@link #frequetItems} is used to check 0-1 variables which are not set to be categorical variables. At most
 * {@link #FREQUET_ITEM_MAX_SIZE} of its elements are serialized by {@link #write(DataOutput)}.
 *
 * <p>
 * Round-trip notes: a {@code null} {@link #hyperBytes} is read back as an empty array, a {@code null}
 * {@link #frequetItems} is read back as an empty set, and {@code null} elements inside the set are dropped on read.
 *
 * @author Zhang David (pengzhang@paypal.com)
 */
public class CountAndFrequentItemsWritable implements Writable {

    /**
     * Maximum number of frequent items written to the wire by {@link #write(DataOutput)}.
     */
    public static final int FREQUET_ITEM_MAX_SIZE = 20;

    /**
     * Serializing form for HyperLogLogPlus instance.
     */
    private byte[] hyperBytes;

    /**
     * Frequent items for one column. Only the first {@link #FREQUET_ITEM_MAX_SIZE} items (in the set's iteration
     * order) are serialized. So far this is used to check 0-1 variables; such 0-1 variables cannot be set to
     * categorical variables.
     */
    private Set<String> frequetItems;

    /**
     * Total input count per each feature in current mapper
     */
    private long count;

    /**
     * Total invalid count set by missing or invalid values per each feature in current mapper
     */
    private long invalidCount;

    /**
     * Total valid number count per each feature in current mapper
     */
    private long validNumCount;

    /**
     * Creates an empty instance; fields are expected to be populated by setters or {@link #readFields(DataInput)}.
     */
    public CountAndFrequentItemsWritable() {
    }

    /**
     * Creates an instance with all counters left at zero.
     *
     * @param hyperBytes
     *            serialized HyperLogLogPlus bytes; stored by reference, may be {@code null}
     * @param frequetItems
     *            frequent items; stored by reference, may be {@code null}
     */
    public CountAndFrequentItemsWritable(byte[] hyperBytes, Set<String> frequetItems) {
        this.hyperBytes = hyperBytes;
        this.frequetItems = frequetItems;
    }

    /**
     * Creates a fully populated instance.
     *
     * @param count
     *            total input count
     * @param invalidCount
     *            count of missing or invalid values
     * @param validNumCount
     *            count of valid numeric values
     * @param hyperBytes
     *            serialized HyperLogLogPlus bytes; stored by reference, may be {@code null}
     * @param frequetItems
     *            frequent items; stored by reference, may be {@code null}
     */
    public CountAndFrequentItemsWritable(long count, long invalidCount, long validNumCount, byte[] hyperBytes,
            Set<String> frequetItems) {
        this.count = count;
        this.invalidCount = invalidCount;
        this.validNumCount = validNumCount;
        this.hyperBytes = hyperBytes;
        this.frequetItems = frequetItems;
    }

    /**
     * @return the count
     */
    public long getCount() {
        return count;
    }

    /**
     * @return the invalidCount
     */
    public long getInvalidCount() {
        return invalidCount;
    }

    /**
     * @return the validNumCount
     */
    public long getValidNumCount() {
        return validNumCount;
    }

    /**
     * @param count
     *            the count to set
     */
    public void setCount(long count) {
        this.count = count;
    }

    /**
     * @param invalidCount
     *            the invalidCount to set
     */
    public void setInvalidCount(long invalidCount) {
        this.invalidCount = invalidCount;
    }

    /**
     * @param validNumCount
     *            the validNumCount to set
     */
    public void setValidNumCount(long validNumCount) {
        this.validNumCount = validNumCount;
    }

    /**
     * Serializes this instance.
     *
     * <p>
     * Layout: three longs (count, invalidCount, validNumCount); an int length followed by that many raw
     * {@link #hyperBytes}; an int item count followed, per item, by a boolean presence flag and, when the flag is
     * {@code true}, the item as modified UTF-8.
     *
     * @param out
     *            the output to write to
     * @throws IOException
     *             if the underlying output fails
     * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.count);
        out.writeLong(this.invalidCount);
        out.writeLong(this.validNumCount);

        if(hyperBytes == null) {
            // null is encoded exactly like an empty array
            out.writeInt(0);
        } else {
            out.writeInt(hyperBytes.length);
            // bulk write; emits the same bytes as a per-element writeByte loop
            out.write(hyperBytes);
        }

        if(frequetItems == null) {
            // null is encoded exactly like an empty set
            out.writeInt(0);
        } else {
            // never serialize more than FREQUET_ITEM_MAX_SIZE items
            int setSize = Math.min(frequetItems.size(), FREQUET_ITEM_MAX_SIZE);
            out.writeInt(setSize);
            Iterator<String> iter = frequetItems.iterator();
            for(int i = 0; i < setSize; i++) {
                String unit = iter.next();
                if(unit == null) {
                    // presence flag only; readFields skips such entries
                    out.writeBoolean(false);
                } else {
                    out.writeBoolean(true);
                    out.writeUTF(unit);
                }
            }
        }
    }

    /**
     * Restores this instance from the layout produced by {@link #write(DataOutput)}. {@link #hyperBytes} and
     * {@link #frequetItems} are always replaced by freshly allocated, non-null objects.
     *
     * @param in
     *            the input to read from
     * @throws IOException
     *             if the underlying input fails, ends prematurely, or carries a negative length prefix
     * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.count = in.readLong();
        this.invalidCount = in.readLong();
        this.validNumCount = in.readLong();

        int len = in.readInt();
        if(len < 0) {
            // fail as a stream error rather than with NegativeArraySizeException
            throw new IOException("Invalid negative length for hyperBytes: " + len);
        }
        hyperBytes = new byte[len];
        // bulk read; a zero-length array reads nothing
        in.readFully(hyperBytes);

        len = in.readInt();
        if(len < 0) {
            // fail as a stream error rather than with IllegalArgumentException from HashSet
            throw new IOException("Invalid negative size for frequetItems: " + len);
        }
        // capacity == len with load factor 1 so the set does not rehash while being filled
        frequetItems = new HashSet<String>(len, 1f);
        for(int i = 0; i < len; i++) {
            // a false flag marks a null element on the writer side; it is skipped here
            if(in.readBoolean()) {
                frequetItems.add(in.readUTF());
            }
        }
    }

    /**
     * @return the hyperBytes
     */
    public byte[] getHyperBytes() {
        return hyperBytes;
    }

    /**
     * @param hyperBytes
     *            the hyperBytes to set
     */
    public void setHyperBytes(byte[] hyperBytes) {
        this.hyperBytes = hyperBytes;
    }

    /**
     * @return the frequetItems
     */
    public Set<String> getFrequetItems() {
        return frequetItems;
    }

    /**
     * @param frequetItems
     *            the frequetItems to set
     */
    public void setFrequetItems(Set<String> frequetItems) {
        this.frequetItems = frequetItems;
    }

}