package org.shanbo.feluca.data2; import java.io.StringReader; import java.util.Map.Entry; import java.util.HashMap; import java.util.Properties; import org.shanbo.feluca.data2.Vector; import org.shanbo.feluca.data2.Vector.VectorType; public abstract class DataStatistic { //basic public final static String NUM_VECTORS = "numVectors"; public final static String MAX_FEATURE_ID = "maxFeatureId"; public final static String TOTAL_FEATURES = "totalFeatures"; public final static String MAX_VECTORSIZE = "maxVectorSize"; //with weight public final static String SUM_WEIGHTS = "sumWeights"; public final static String LABEL_INFO = "labelInfo"; public final static String CLASSES = "classes"; //with id public final static String MAX_VECTOR_ID = "maxVectorId"; VectorType statVectorType; public DataStatistic counter; protected DataStatistic(DataStatistic counter){ if (counter != null){ this.counter = counter; } } protected abstract void doStat(Vector vector); protected abstract Properties getStatResult(); protected abstract void clear(); public final void stat(Vector vector){ if (counter != null){ counter.stat(vector); } doStat(vector); this.statVectorType = vector.getOutVectorType(); } public final void clearStat(){ if (counter != null){ counter.clearStat(); } clear(); } public static Properties parseStr(String prop){ try{ Properties p = new Properties(); p.load(new StringReader(prop)); return p; }catch (Exception e) { throw new RuntimeException(e); } } public static DataStatistic createVWstat(){ return new VIDStatistic(new WeightStatistic(new BasicStatistic())); } public static DataStatistic createLWstat(){ return new LabelStatistic(new WeightStatistic(new BasicStatistic())); } public String toString(){ Properties p = new Properties(); if (counter != null){ p.putAll( parseStr(counter.toString())); } p.putAll(getStatResult()); p.put("vectorType", statVectorType); //we don't use properties's toString() or store() StringBuilder builder = new StringBuilder(); for(Entry<Object, Object> entry : p.entrySet()){ builder.append(entry.getKey() + "=" + entry.getValue() + "\n"); } return builder.toString(); } public static class BasicStatistic extends DataStatistic{ int numVectors = 0; int totalFeatures = 0; int maxFeatureId = 0; int maxVectorSize = 1; public BasicStatistic() { super(null); clear(); } @Override protected void doStat(Vector vector) { numVectors += 1; totalFeatures += vector.getSize(); maxVectorSize = vector.getSize() > maxVectorSize ? vector.getSize(): maxVectorSize; for(int i = 0 ; i < vector.getSize(); i++){ maxFeatureId = vector.getFId(i) > maxFeatureId ?vector.getFId(i):maxFeatureId ; } } @Override protected Properties getStatResult() { Properties p = new Properties(); p.put(NUM_VECTORS, this.numVectors); p.put(MAX_FEATURE_ID, this.maxFeatureId); p.put(TOTAL_FEATURES, this.totalFeatures); p.put(MAX_VECTORSIZE, this.maxVectorSize); return p; } @Override protected void clear() { numVectors = 0; totalFeatures = 0; maxFeatureId = 0; maxVectorSize = 1; } } /** * just for test * @author lgn * */ public static class MinStatistic extends DataStatistic{ int minId = Integer.MAX_VALUE; public MinStatistic(DataStatistic counter) { super(counter); clear(); } @Override protected void doStat(Vector vector) { for(int i = 0 ; i < vector.getSize(); i++){ minId = vector.getFId(i) < minId ?vector.getFId(i):minId ; } } @Override protected Properties getStatResult() { Properties p = new Properties(); p.put("minFeatureId", this.minId); return p; } @Override protected void clear() { minId = Integer.MAX_VALUE; } } public static class LabelStatistic extends DataStatistic{ HashMap<Integer, int[]> labelInfoBag = new HashMap<Integer, int[]>(); int i = 0; public LabelStatistic(DataStatistic counter) { super(counter); } @Override protected void doStat(Vector vector) { int[] labelInfo = labelInfoBag.get(vector.getIntHeader()); if (labelInfo == null){ labelInfoBag.put(vector.getIntHeader(), new int[]{labelInfoBag.size(), 1}); }else{ labelInfo[1] += 1; } } @Override protected Properties getStatResult() { Properties p = new Properties(); p.put(CLASSES, this.labelInfoBag.size()); StringBuilder sb = new StringBuilder(); for(Entry<Integer, int[]> entry : labelInfoBag.entrySet()){ sb.append(String.format("%d:%d:%d ", entry.getKey(), entry.getValue()[0], entry.getValue()[1])); } p.put(LABEL_INFO, sb.toString()); return p; } @Override protected void clear() { labelInfoBag.clear(); } } public static class VIDStatistic extends DataStatistic{ int maxVId = Integer.MIN_VALUE; public VIDStatistic(DataStatistic counter) { super(counter); clear(); } @Override protected void doStat(Vector vector) { maxVId = maxVId > vector.getIntHeader() ? maxVId : vector.getIntHeader(); } @Override protected Properties getStatResult() { Properties p = new Properties(); p.put(MAX_VECTOR_ID, this.maxVId); return p; } @Override protected void clear() { maxVId = Integer.MIN_VALUE; } } public static class WeightStatistic extends DataStatistic{ double weightSum ; int i = 0; public WeightStatistic(DataStatistic counter) { super(counter); } @Override protected void doStat(Vector vector) { for(int i = 0 ; i < vector.getSize(); i++){ weightSum += vector.getWeight(i); } } @Override protected Properties getStatResult() { Properties p = new Properties(); p.put(SUM_WEIGHTS, this.weightSum); return p; } @Override protected void clear() { weightSum = 0.0; } } }