// SegmentDataset.java example
//
// Explorer
// MinorThird-master
/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.classify.sequential;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;

import edu.cmu.minorthird.classify.BasicDataset;
import edu.cmu.minorthird.classify.Dataset;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.ExampleSchema;
import edu.cmu.minorthird.classify.FeatureFactory;
import edu.cmu.minorthird.classify.GUI;
import edu.cmu.minorthird.classify.Splitter;
import edu.cmu.minorthird.util.gui.Viewer;
import edu.cmu.minorthird.util.gui.ZoomedViewer;

/**
 * A SequenceDataset that additionally includes examples for 'sliding
 * windows' over the original data.  
 *
 * @author William Cohen
 */

public class SegmentDataset implements Dataset{

	/**
	 * Window size shared by every group in this dataset. Stays negative
	 * until the first group is added, at which point it is taken from
	 * that group.
	 */
	int maxWindowSize=-1;

	/** The segment groups, in insertion (or shuffled) order. */
	private final List<CandidateSegmentGroup> groupList=new ArrayList<CandidateSegmentGroup>();

	/** Union of the class names reported by every group added so far. */
	private final Set<String> classNameSet=new HashSet<String>();

	/** Running sum of <code>group.size()</code> over all added groups. */
	private int totalSize=0;

	/** Factory handed to CompactCandidateSegmentGroup when compressing groups. */
	private final FeatureFactory factory=new FeatureFactory();

	/** When true, added groups are stored as CompactCandidateSegmentGroups. */
	private boolean compressGroups=true;

	/** Create an empty dataset with group compression enabled. */
	public SegmentDataset(){
	}

	/**
	 * Choose whether groups added from now on are wrapped in a
	 * CompactCandidateSegmentGroup (true, the default) or stored as-is (false).
	 * Groups that were already added are not affected.
	 *
	 * @param flag true to compress subsequently added groups
	 */
	public void setDataCompression(boolean flag){
		compressGroups=flag;
	}

	/** @return the FeatureFactory used when compressing groups and examples. */
	@Override
	public FeatureFactory getFeatureFactory(){
		return factory;
	}

	/**
	 * @return the window size common to all groups, or a negative value
	 * if no group has been added yet.
	 */
	public int getMaxWindowSize(){
		return maxWindowSize;
	}

	/** @return the sum of the sizes of all groups added to this dataset. */
	@Override
	public int size(){
		return totalSize;
	}

	/** @return the number of segment groups held by this dataset. */
	public int getNumberOfSegmentGroups(){
		return groupList.size();
	}

	/**
	 * Add a new sequence of examples to the dataset.
	 *
	 * The first group added fixes the dataset's maximum window size; every
	 * later group must report the same value.
	 *
	 * @param group the group to add
	 * @throws IllegalArgumentException if the group's maximum window size
	 * differs from the one already established for this dataset
	 */
	public void addCandidateSegmentGroup(CandidateSegmentGroup group){
		int groupWindowSize=group.getMaxWindowSize();
		if(maxWindowSize<0){
			// first group: adopt its window size
			maxWindowSize=groupWindowSize;
		}else if(groupWindowSize!=maxWindowSize){
			throw new IllegalArgumentException("mismatched window sizes: "+
					maxWindowSize+", "+groupWindowSize);
		}
		if(compressGroups){
			groupList.add(new CompactCandidateSegmentGroup(factory,group));
		}else{
			groupList.add(group);
		}
		// bookkeeping is based on the group as passed in, not the compact wrapper
		classNameSet.addAll(group.classNameSet());
		totalSize+=group.size();
	}

	/**
	 * Build a schema from the class names seen so far.
	 *
	 * @return the shared {@link ExampleSchema#BINARY_EXAMPLE_SCHEMA} instance
	 * when the computed schema equals it, otherwise a newly built schema
	 */
	@Override
	public ExampleSchema getSchema(){
		String[] classNames=classNameSet.toArray(new String[classNameSet.size()]);
		ExampleSchema schema=new ExampleSchema(classNames);
		if(schema.equals(ExampleSchema.BINARY_EXAMPLE_SCHEMA)){
			return ExampleSchema.BINARY_EXAMPLE_SCHEMA;
		}
		return schema;
	}

	/**
	 * Add an example to the dataset. <br>
	 * <br>
	 * This method does not ask the feature factory to compress the example's
	 * instance; it is equivalent to <code>add(example,false)</code>.  If you
	 * want the instance to be compressed first, call
	 * {@link #add(Example, boolean)} with <code>true</code>.  Note that the
	 * resulting single-example group is still passed to
	 * {@link #addCandidateSegmentGroup(CandidateSegmentGroup)}, which wraps it
	 * in a CompactCandidateSegmentGroup unless data compression has been
	 * switched off with {@link #setDataCompression(boolean)}.
	 *
	 * @param example The Example that you want to add to the dataset.
	 */
	@Override
	public void add(Example example){
		add(example,false);
	}

	/**
	 * Add an Example to the dataset. <br>
	 * <br>
	 * The example is placed in a new MutableCandidateSegmentGroup constructed
	 * with arguments (1,1), as its only subsequence (0,1), and that group is
	 * then added with {@link #addCandidateSegmentGroup(CandidateSegmentGroup)}.
	 * The caller specifies whether the example's instance is first compressed
	 * by this dataset's feature factory.
	 *
	 * @param example The example to add to the dataset
	 * @param compress Boolean specifying whether or not to compress the example.
	 * @throws IllegalArgumentException if this dataset already holds groups
	 * whose maximum window size differs from that of the new group
	 */
	@Override
	public void add(Example example,boolean compress){
		MutableCandidateSegmentGroup g=new MutableCandidateSegmentGroup(1,1);
		if(compress){
			g.setSubsequence(0,1,factory.compress(example.asInstance()),
					example.getLabel());
		}else{
			g.setSubsequence(0,1,example.asInstance(),example.getLabel());
		}
		addCandidateSegmentGroup(g);
	}

	/**
	 * Iterate over all examples.
	 *
	 * For every group, every start position j in the sequence and every
	 * window length k from 1 to the group's maximum window size, the example
	 * for subsequence (j,j+k) is included when the group returns a non-null
	 * one.  The examples are collected into a new list first, so the returned
	 * iterator walks a snapshot and removing from it does not change the
	 * dataset.
	 */
	@Override
	public Iterator<Example> iterator(){
		List<Example> result=new ArrayList<Example>();
		for(CandidateSegmentGroup g:groupList){
			int length=g.getSequenceLength();
			int window=g.getMaxWindowSize();
			for(int start=0;start<length;start++){
				for(int k=1;k<=window;k++){
					Example e=g.getSubsequenceExample(start,start+k);
					if(e!=null){
						result.add(e);
					}
				}
			}
		}
		return result.iterator();
	}

	/**
	 * Iterate over the segment groups.
	 *
	 * The iterator is obtained directly from the internal list; removing
	 * through it would not update {@link #size()} or the schema.
	 */
	public Iterator<CandidateSegmentGroup> candidateSegmentGroupIterator(){
		return groupList.iterator();
	}

	/** @return the size on the first line, then one line per segment group. */
	@Override
	public String toString(){
		StringBuilder buf=new StringBuilder();
		buf.append("size = ").append(size()).append("\n");
		for(CandidateSegmentGroup g:groupList){
			buf.append(g).append("\n");
		}
		return buf.toString();
	}

	/** Randomly re-order the segment groups using the given random source. */
	@Override
	public void shuffle(Random r){
		Collections.shuffle(groupList,r);
	}

	/**
	 * Re-order the segment groups using a Random seeded with 0, so the
	 * resulting permutation is reproducible from run to run.
	 */
	@Override
	public void shuffle(){
		Collections.shuffle(groupList,new Random(0));
	}

	/**
	 * Make a shallow copy of the dataset.
	 *
	 * The copy is a new SegmentDataset with its own FeatureFactory and the
	 * same data-compression setting as this dataset; each of this dataset's
	 * groups is added to it in order.
	 */
	@Override
	public Dataset shallowCopy(){
		return copyOfGroups(groupList.iterator());
	}

	//
	// split
	//

	/**
	 * Splitting by individual example is not supported; use
	 * {@link #splitCandidateSegmentGroup(Splitter)} instead.
	 *
	 * @throws UnsupportedOperationException always
	 */
	@Override
	public Split split(final Splitter<Example> splitter){
		throw new UnsupportedOperationException(
				"SegmentDataset cannot be split by example; use splitCandidateSegmentGroup");
	}

	/**
	 * Split the dataset at the granularity of segment groups.
	 *
	 * The splitter is run once, immediately, over the current groups; the
	 * returned Split builds a new dataset from the splitter's train or test
	 * iterator each time one is requested.
	 *
	 * @param splitter the splitter used to partition the groups
	 * @return a Split backed by the given splitter
	 */
	public Split splitCandidateSegmentGroup(final Splitter<CandidateSegmentGroup> splitter){
		splitter.split(groupList.iterator());
		return new Split(){

			@Override
			public int getNumPartitions(){
				return splitter.getNumPartitions();
			}

			@Override
			public Dataset getTrain(int k){
				return invertIteration(splitter.getTrain(k));
			}

			@Override
			public Dataset getTest(int k){
				return invertIteration(splitter.getTest(k));
			}
		};
	}

	/**
	 * Collect the groups produced by an iterator into a new dataset that
	 * uses the same data-compression setting as this one.
	 *
	 * @param i iterator over the groups to include
	 * @return a new SegmentDataset holding those groups
	 */
	protected Dataset invertIteration(Iterator<CandidateSegmentGroup> i){
		return copyOfGroups(i);
	}

	/**
	 * Build a new dataset from the given groups, carrying over this
	 * dataset's compression flag so that a dataset with compression disabled
	 * does not produce copies that silently re-compress its groups.
	 */
	private SegmentDataset copyOfGroups(Iterator<CandidateSegmentGroup> groups){
		SegmentDataset copy=new SegmentDataset();
		copy.compressGroups=compressGroups;
		while(groups.hasNext()){
			copy.addCandidateSegmentGroup(groups.next());
		}
		return copy;
	}

	/** A GUI view of the dataset. */
	@Override
	public Viewer toGUI(){
		Viewer dbGui=new BasicDataset.SimpleDatasetViewer();
		dbGui.setContent(this);
		Viewer instGui=GUI.newSourcedExampleViewer();
		return new ZoomedViewer(dbGui,instGui);
	}

	/**
	 * Positive examples are not counted by this class.
	 *
	 * @return always -1
	 */
	public int getNumPosExamples(){
		return -1;
	}
}