/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.classify.multi;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import edu.cmu.minorthird.classify.Feature;
import edu.cmu.minorthird.classify.Instance;

/**
 * An inverted index, mapping features to the examples which contain those
 * features, and class labels to the examples carrying that label.
 *
 * <p>Query methods ({@link #size(Feature)}, {@link #size(String)},
 * {@link #getMultiExample(Feature,int)}, {@link #getMultiExample(String,int)}
 * and {@link #getNeighbors(Instance)}) never modify the index: asking about a
 * feature or label that was never indexed does not create an entry for it.
 *
 * @author Cameron Williams, Frank Lin
 */
public class MultiDatasetIndex implements Serializable{

  static final long serialVersionUID=20080131L;

  /** Examples grouped by each feature produced by their feature iterator. */
  private SortedMap<Feature,List<MultiExample>> indexByFeature;

  /** Examples grouped by the best class name of their multi-label. */
  private SortedMap<String,List<MultiExample>> indexByClass;

  /**
   * Total number of (example, feature) pairs indexed. Despite the name, this
   * is incremented once per feature occurrence, not by the feature's value.
   */
  private int sumFeatureValues;

  /** Number of examples added through {@link #addMultiExample(MultiExample)}. */
  private int exampleCount;

  /** Construct an empty index. */
  public MultiDatasetIndex(){
    indexByFeature=new TreeMap<Feature,List<MultiExample>>();
    indexByClass=new TreeMap<String,List<MultiExample>>();
    sumFeatureValues=0;
  }

  /**
   * Construct an index of a dataset.
   *
   * @param data dataset whose examples are all added to the new index
   */
  public MultiDatasetIndex(MultiDataset data){
    this();
    for(Iterator<MultiExample> i=data.multiIterator();i.hasNext();){
      addMultiExample(i.next());
    }
  }

  /**
   * Add a single example to the index.
   *
   * <p>The example is filed under the best class name of its multi-label and
   * under every feature returned by its feature iterator.
   *
   * @param e the example to index
   */
  public void addMultiExample(MultiExample e){
    classIndex(e.getMultiLabel().bestClassName().toString()).add(e);
    for(Iterator<Feature> j=e.featureIterator();j.hasNext();){
      featureIndex(j.next()).add(e);
      sumFeatureValues++;
    }
    exampleCount++;
  }

  /**
   * Iterate over all features indexed.
   *
   * @return an iterator over the key set of the feature index
   */
  public Iterator<Feature> featureIterator(){
    return indexByFeature.keySet().iterator();
  }

  /**
   * Number of indexed examples that contain feature f.
   *
   * @param f the feature to look up
   * @return the number of examples filed under f, or 0 if f was never indexed
   */
  public int size(Feature f){
    return examplesWithFeature(f).size();
  }

  /**
   * Number of indexed examples with the given class label.
   *
   * @param label the class label to look up
   * @return the number of examples filed under label, or 0 if the label was
   *         never indexed
   */
  public int size(String label){
    return examplesWithLabel(label).size();
  }

  /**
   * Get i-th example containing feature f.
   *
   * @param f the feature to look up
   * @param i zero-based position within the examples filed under f
   * @return the i-th example filed under f
   * @throws IndexOutOfBoundsException if i is out of range, which is always
   *         the case when f was never indexed
   */
  public MultiExample getMultiExample(Feature f,int i){
    return examplesWithFeature(f).get(i);
  }

  /**
   * Get i-th example with given class label.
   *
   * @param label the class label to look up
   * @param i zero-based position within the examples filed under label
   * @return the i-th example filed under label
   * @throws IndexOutOfBoundsException if i is out of range, which is always
   *         the case when label was never indexed
   */
  public MultiExample getMultiExample(String label,int i){
    return examplesWithLabel(label).get(i);
  }

  /**
   * Get all examples with a feature in common with the given instance.
   *
   * <p>Results are collected in a {@link HashSet}, so each example appears at
   * most once and the iteration order is unspecified.
   *
   * @param instance the instance whose features are looked up
   * @return an iterator over the distinct examples sharing at least one
   *         feature with the instance
   */
  public Iterator<MultiExample> getNeighbors(Instance instance){
    Set<MultiExample> neighbors=new HashSet<MultiExample>();
    for(Iterator<Feature> i=instance.featureIterator();i.hasNext();){
      // Non-inserting lookup: features unknown to the index contribute
      // nothing and must not be added to it.
      neighbors.addAll(examplesWithFeature(i.next()));
    }
    return neighbors.iterator();
  }

  //
  // statistics about the dataset
  //

  /**
   * Number of features indexed.
   *
   * @return the number of distinct keys in the feature index
   */
  public int numberOfFeatures(){
    return indexByFeature.size();
  }

  /**
   * Average number of features per indexed example.
   *
   * @return the total feature occurrences divided by the number of examples
   *         added; NaN when no example has been added (0 divided by 0.0)
   */
  public double averageFeaturesPerExample(){
    return sumFeatureValues/((double)exampleCount);
  }

  //
  // subroutines
  //

  /**
   * Get the mutable list of examples for a feature, creating and registering
   * an empty list if the feature is not yet in the index.
   *
   * <p>This inserts into the index, so it is meant for the indexing path; the
   * public query methods use a non-inserting lookup instead.
   *
   * @param feature the feature whose list is wanted
   * @return the list stored in the index for feature, never null
   */
  protected List<MultiExample> featureIndex(Feature feature){
    List<MultiExample> result=indexByFeature.get(feature);
    if(result==null){
      result=new ArrayList<MultiExample>();
      indexByFeature.put(feature,result);
    }
    return result;
  }

  /**
   * Get the mutable list of examples for a class label, creating and
   * registering an empty list if the label is not yet in the index.
   *
   * <p>This inserts into the index, so it is meant for the indexing path; the
   * public query methods use a non-inserting lookup instead.
   *
   * @param label the class label whose list is wanted
   * @return the list stored in the index for label, never null
   */
  protected List<MultiExample> classIndex(String label){
    List<MultiExample> result=indexByClass.get(label);
    if(result==null){
      result=new ArrayList<MultiExample>();
      indexByClass.put(label,result);
    }
    return result;
  }

  /** Read-only lookup by feature: an empty list when the feature is unknown. */
  private List<MultiExample> examplesWithFeature(Feature feature){
    List<MultiExample> found=indexByFeature.get(feature);
    return found!=null?found:Collections.<MultiExample>emptyList();
  }

  /** Read-only lookup by class label: an empty list when the label is unknown. */
  private List<MultiExample> examplesWithLabel(String label){
    List<MultiExample> found=indexByClass.get(label);
    return found!=null?found:Collections.<MultiExample>emptyList();
  }

  /**
   * Render the whole index: every feature followed by its examples, then
   * every class label followed by its examples, one entry per line.
   */
  @Override
  public String toString(){
    StringBuilder buf=new StringBuilder("[index");
    for(Map.Entry<Feature,List<MultiExample>> entry:indexByFeature.entrySet()){
      buf.append("\n").append(entry.getKey()).append(":");
      for(MultiExample example:entry.getValue()){
        buf.append("\n\t").append(example.toString());
      }
    }
    for(Map.Entry<String,List<MultiExample>> entry:indexByClass.entrySet()){
      buf.append("\n").append(entry.getKey()).append(":");
      for(MultiExample example:entry.getValue()){
        buf.append("\n\t").append(example.toString());
      }
    }
    buf.append("\nindex]");
    return buf.toString();
  }

  //
  // main
  //

  /** Prints the class name; no other behavior. */
  static public void main(String[] args){
    System.out.println("MultiDatasetIndex");
  }
}