/* Copyright 2003, Carnegie Mellon, All Rights Reserved */ package edu.cmu.minorthird.classify; import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; /** * An inverted index, mapping features to examples which contain the * features. * * @author William Cohen */ public class DatasetIndex implements Serializable{ static final long serialVersionUID=20080128L; private SortedMap<Feature,List<Example>> indexByFeature; private SortedMap<String,List<Example>> indexByClass; private int sumFeatureValues; private int exampleCount; public DatasetIndex(){ indexByFeature=new TreeMap<Feature,List<Example>>(); indexByClass=new TreeMap<String,List<Example>>(); sumFeatureValues=0; exampleCount=0; } /** Construct an index of a dataset. */ public DatasetIndex(Dataset data){ this(); for(Iterator<Example> i=data.iterator();i.hasNext();){ addExample(i.next()); } } /** Add a single example to the index. */ public void addExample(Example e){ classIndex(e.getLabel().bestClassName()).add(e); for(Iterator<Feature> j=e.featureIterator();j.hasNext();){ Feature f=j.next(); featureIndex(f).add(e); sumFeatureValues++; } exampleCount++; } /** Iterate over all features indexed. */ public Iterator<Feature> featureIterator(){ return indexByFeature.keySet().iterator(); } /** Number of examples containing non-zero values for feature f. */ public int size(Feature f){ return featureIndex(f).size(); } /** Number of examples with the given class label. */ public int size(String label){ return classIndex(label).size(); } /** Get i-th example containing feature f. */ public Example getExample(Feature f,int i){ return featureIndex(f).get(i); } /** Get i-th example with given class label. */ public Example getExample(String label,int i){ return classIndex(label).get(i); } /** Get all examples with a feature in common with the given instance. */ public Iterator<Example> getNeighbors(Instance instance){ Set<Example> set=new HashSet<Example>(); for(Iterator<Feature> i=instance.featureIterator();i.hasNext();){ Feature f=i.next(); for(Iterator<Example> j=featureIndex(f).iterator();j.hasNext();){ set.add(j.next()); } } return set.iterator(); } // statistics about the dataset /** Number of features indexed. */ public int numberOfFeatures(){ return indexByFeature.keySet().size(); } /** Average number of non-zero feature values in examples. */ public double averageFeaturesPerExample(){ return sumFeatureValues/((double)exampleCount); } // subroutines protected List<Example> featureIndex(Feature f){ List<Example> result=indexByFeature.get(f); if(result==null){ result=new ArrayList<Example>(); indexByFeature.put(f,result); } return result; } protected List<Example> classIndex(String label){ List<Example> result=indexByClass.get(label); if(result==null){ result=new ArrayList<Example>(); indexByClass.put(label,result); } return result; } @Override public String toString(){ StringBuilder buf=new StringBuilder("[index"); for(Iterator<Feature> i=featureIterator();i.hasNext();){ Feature f=i.next(); buf.append("\n"+f+":"); for(int j=0;j<size(f);j++){ buf.append("\n\t"+getExample(f,j).toString()); } } for(Iterator<String> i=indexByClass.keySet().iterator();i.hasNext();){ String label=i.next(); buf.append("\n"+label+":"); for(int j=0;j<size(label);j++){ buf.append("\n\t"+getExample(label,j).toString()); } } buf.append("\nindex]"); return buf.toString(); } // main static public void main(String[] args){ System.out.println(new DatasetIndex(SampleDatasets.sampleData("toy",false))); } }