package edu.brown.markov;

import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.junit.Test;
import org.voltdb.VoltProcedure;
import org.voltdb.benchmark.tpcc.procedures.neworder;
import org.voltdb.catalog.Procedure;

import weka.classifiers.Classifier;
import weka.clusterers.AbstractClusterer;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.NumericToNominal;

import edu.brown.BaseTestCase;
import edu.brown.catalog.CatalogUtil;
import edu.brown.markov.FeatureClusterer.SplitType;
import edu.brown.markov.features.BasePartitionFeature;
import edu.brown.markov.features.FeatureUtil;
import edu.brown.markov.features.ParamArrayLengthFeature;
import edu.brown.markov.features.ParamHashPartitionFeature;
import edu.brown.statistics.Histogram;
import edu.brown.statistics.ObjectHistogram;
import edu.brown.utils.CollectionUtil;
import edu.brown.utils.ProjectType;
import edu.brown.utils.StringUtil;
import edu.brown.workload.TransactionTrace;
import edu.brown.workload.Workload;
import edu.brown.workload.filters.ProcedureLimitFilter;
import edu.brown.workload.filters.ProcedureNameFilter;
import edu.brown.hstore.conf.HStoreConf;

/**
 * NOTE: 2012-10-20
 * I am getting random JVM crashes with some of these test cases.
 * I think it's because of Weka, but I don't have time to look into it,
 * so I've commented out those tests for now.
 * @author pavlo
 */
public class TestFeatureClusterer extends BaseTestCase {

    private static final Class<? extends VoltProcedure> TARGET_PROCEDURE = neworder.class;
    private static final int WORKLOAD_XACT_LIMIT = 1000;
//    private static final int BASE_PARTITION = 1;
    private static final int NUM_PARTITIONS = 50;

    private static Procedure catalog_proc;
    private static Workload workload;
    private static Instances data;

    private FeatureClusterer fclusterer;

    @Override
    protected void setUp() throws Exception {
        super.setUp(ProjectType.TPCC);
        this.addPartitions(NUM_PARTITIONS);

        // Disable Markov path caching for these tests
        HStoreConf.singleton().site.markov_path_caching = false;

        if (workload == null) {
            catalog_proc = this.getProcedure(TARGET_PROCEDURE);

            File file = this.getWorkloadFile(ProjectType.TPCC);
            workload = new Workload(catalog);

            // Check out this beauty:
            // (1) Filter by procedure name
            // (2) Filter on partitions that start on our BASE_PARTITION
            // (3) Filter to only include multi-partition txns
            // (4) Another limit to stop after allowing ### txns
            // Where is your god now???
            edu.brown.workload.filters.Filter filter = new ProcedureNameFilter(false)
                    .include(TARGET_PROCEDURE.getSimpleName())
//                    .attach(new ProcParameterValueFilter().include(1, new Long(5))) // D_ID
//                    .attach(new ProcParameterArraySizeFilter(CatalogUtil.getArrayProcParameters(catalog_proc).get(0), 10, ExpressionType.COMPARE_EQUAL))
//                    .attach(new BasePartitionTxnFilter(p_estimator, BASE_PARTITION))
//                    .attach(new MultiPartitionTxnFilter(p_estimator))
                    .attach(new ProcedureLimitFilter(WORKLOAD_XACT_LIMIT));
            workload.load(file, catalog_db, filter);
            assert(workload.getTransactionCount() > 0);

            // Now extract the FeatureSet that we will use in our tests
            Map<Procedure, FeatureSet> fsets = new FeatureExtractor(catalogContext, p_estimator).calculate(workload);
            FeatureSet fset = fsets.get(catalog_proc);
            assertNotNull(fset);
            data = fset.export(catalog_proc.getName(), false);

            // Convert the numeric feature attributes into nominal values for Weka
            NumericToNominal weka_filter = new NumericToNominal();
            weka_filter.setInputFormat(data);
            data = Filter.useFilter(data, weka_filter);
        }
        assertNotNull(data);

        fclusterer = new FeatureClusterer(catalogContext, catalog_proc, workload, catalogContext.getAllPartitionIds());
    }

    /**
     * testSplitPercentages
     */
    @Test
    public void testSplitPercentages() {
        // The workload split percentages must sum to 1.0
        double total = 0.0d;
        for (SplitType stype : FeatureClusterer.SplitType.values()) {
            total += stype.getPercentage();
        } // FOR
        assertEquals(1.0d, total, 0.0001);
    }

//    /**
//     * testCalculateGlobalCost
//     */
//    @Test
//    public void testCalculateGlobalCost() throws Exception {
//        this.fclusterer.splitWorkload(data);
//        this.fclusterer.calculateGlobalCost();
//        int counters[] = this.fclusterer.getGlobalCounters();
//        assertNotNull(counters);
//        for (int i = 0; i < counters.length; i++) {
//            int val = counters[i];
//            assert(val >= 0) : String.format("Invalid Counter[%d] => %d", i, val);
//        } // FOR
//    }

//    /**
//     * testCalculate
//     */
//    @Test
//    public void testCalculate() throws Exception {
//        this.fclusterer.setNumRounds(1);
//        this.fclusterer.setAttributeTopK(0.50);
//        MarkovAttributeSet aset = this.fclusterer.calculate(data);
//        assertNotNull(aset);
//
//        System.err.println(aset);
//        System.err.println("COST: " + aset.getCost());
//    }

    /**
     * testCreateMarkovAttributeSetFilter
     */
    @Test
    public void testCreateMarkovAttributeSetFilter() throws Exception {
        // Test that we can create a filter from a MarkovAttributeSet
        MarkovAttributeSet aset = new MarkovAttributeSet(data, FeatureUtil.getFeatureKeyPrefix(ParamArrayLengthFeature.class));
        assertEquals(CatalogUtil.getArrayProcParameters(catalog_proc).size(), aset.size());

        Filter filter = aset.createFilter(data);
        Instances newData = Filter.useFilter(data, filter);
        for (int i = 0, cnt = newData.numInstances(); i < cnt; i++) {
            Instance processed = newData.instance(i);
            // System.err.println(processed);
            assertEquals(aset.size(), processed.numAttributes());
        } // FOR
        assertEquals(data.numInstances(), newData.numInstances());
        // System.err.println("MarkovAttributeSet: " + aset);
    }

    /**
     * testCreateClusterer
     */
    @Test
    public void testCreateClusterer() throws Exception {
        // Construct a simple MarkovAttributeSet that only contains the BasePartitionFeature
        MarkovAttributeSet base_aset = new MarkovAttributeSet(data, FeatureUtil.getFeatureKeyPrefix(BasePartitionFeature.class));
        assertFalse(base_aset.isEmpty());
        int base_partition_idx = CollectionUtil.first(base_aset).index();

        AbstractClusterer clusterer = this.fclusterer.createClusterer(base_aset, data);
        assertNotNull(clusterer);

        // Make sure that each Txn gets mapped to the same cluster as its base partition
        Map<Integer, Histogram<Integer>> p_c_xref = new HashMap<Integer, Histogram<Integer>>();
        for (int i = 0, cnt = data.numInstances(); i < cnt; i++) {
            Instance inst = data.instance(i);
            assertNotNull(inst);

            long txn_id = FeatureUtil.getTransactionId(inst);
            TransactionTrace txn_trace = workload.getTransaction(txn_id);
            assertNotNull(txn_trace);

            Integer base_partition = p_estimator.getBasePartition(txn_trace);
            assertNotNull(base_partition);
            assertEquals(base_partition.intValue(), (int)inst.value(base_partition_idx));

            int c = clusterer.clusterInstance(inst);
            Histogram<Integer> h = p_c_xref.get(base_partition);
            if (h == null) {
                h = new ObjectHistogram<Integer>();
                p_c_xref.put(base_partition, h);
            }
            h.put(c);
        } // FOR
        // System.err.println(StringUtil.formatMaps(p_c_xref));

//        Set<Integer> c_p_xref = new HashSet<Integer>();
//        for (Entry<Integer, Histogram> e : p_c_xref.entrySet()) {
//            Set<Integer> clusters = e.getValue().values();
//
//            // Make sure that each base partition is only mapped to one cluster
//            assertEquals(e.getKey().toString(), 1, clusters.size());
//
//            // Make sure that two different base partitions are not mapped to the same cluster
//            assertFalse(c_p_xref.contains(CollectionUtil.getFirst(clusters)));
//            c_p_xref.addAll(clusters);
//        } // FOR
    }

//    /**
//     * testCalculateAttributeSetCost
//     */
//    @Test
//    public void testCalculateAttributeSetCost() throws Exception {
//        Set<Attribute> attributes = FeatureClusterer.prefix2attributes(data,
//            FeatureUtil.getFeatureKeyPrefix(ParamArrayLengthFeature.class, this.getProcParameter(catalog_proc, 4)),
//            FeatureUtil.getFeatureKeyPrefix(ParamHashPartitionFeature.class, this.getProcParameter(catalog_proc, 1))
//        );
//
//        Instances instances[] = fclusterer.splitWorkload(data);
//        assertNotNull(instances);
//        MarkovAttributeSet aset = new MarkovAttributeSet(attributes);
//        assertNotNull(aset);
//        fclusterer.calculateAttributeSetCost(aset);
//        assert(aset.getCost() > 0);
//    }

//    /**
//     * testGenerateDecisionTree
//     */
//    @Test
//    public void testGenerateDecisionTree() throws Exception {
//        Set<Attribute> attributes = FeatureClusterer.prefix2attributes(data,
//            FeatureUtil.getFeatureKeyPrefix(ParamArrayLengthFeature.class, this.getProcParameter(catalog_proc, 4)),
//            FeatureUtil.getFeatureKeyPrefix(ParamHashPartitionFeature.class, this.getProcParameter(catalog_proc, 1))
//        );
//        MarkovAttributeSet aset = new MarkovAttributeSet(attributes);
//        assertNotNull(aset);
//
//        Histogram<String> key_h = new Histogram<String>();
//        int key_len = aset.size();
//        for (int i = 0, cnt = data.numInstances(); i < cnt; i++) {
//            Instance inst = data.instance(i);
//            Object key[] = new Object[key_len];
//            for (int ii = 0; ii < key_len; ii++) {
//                key[ii] = inst.value(aset.get(ii));
//            }
//            key_h.put(Arrays.toString(key));
//        } // FOR
//        System.err.println("Number of Elements: " + key_h.getValueCount());
//        System.err.println(key_h);
//        System.err.println(StringUtil.repeat("+", 100));
//
////        Instances instances[] = fclusterer.splitWorkload(data);
////        assertNotNull(instances);
//
//        AbstractClusterer clusterer = fclusterer.createClusterer(aset, data);
//        assertNotNull(clusterer);
//
//        Classifier classifier = fclusterer.generateDecisionTree(clusterer, aset, data);
//        assertNotNull(classifier);
//    }
}