/* Copyright 2003, Carnegie Mellon, All Rights Reserved */
package edu.cmu.minorthird.classify.sequential;
import java.awt.Component;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;
import javax.swing.JComponent;
import javax.swing.JList;
import javax.swing.JScrollPane;
import javax.swing.ListCellRenderer;
import edu.cmu.minorthird.classify.Dataset;
import edu.cmu.minorthird.classify.DatasetLoader;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.ExampleSchema;
import edu.cmu.minorthird.classify.FeatureFactory;
import edu.cmu.minorthird.classify.GUI;
import edu.cmu.minorthird.classify.SampleDatasets;
import edu.cmu.minorthird.classify.Splitter;
import edu.cmu.minorthird.util.Saveable;
import edu.cmu.minorthird.util.StringUtil;
import edu.cmu.minorthird.util.gui.ComponentViewer;
import edu.cmu.minorthird.util.gui.Viewer;
import edu.cmu.minorthird.util.gui.ViewerFrame;
import edu.cmu.minorthird.util.gui.Visible;
import edu.cmu.minorthird.util.gui.ZoomedViewer;
/**
* A dataset of sequences of examples.
*
* @author William Cohen
*/
public class SequenceDataset implements Dataset,SequenceConstants,Visible,
Saveable{
protected List<Example[]> sequenceList=new ArrayList<Example[]>();
protected int totalSize=0;
private int historyLength=1;
private String[] history=new String[historyLength];
protected Set<String> classNameSet=new HashSet<String>();
protected FeatureFactory factory=new FeatureFactory();
@Override
public FeatureFactory getFeatureFactory(){
return factory;
}
/** Set the current history length.
* Examples produced by the iterator() will
* contain the last k class labels as features.
*/
public void setHistorySize(int k){
historyLength=k;
history=new String[historyLength];
}
/** Return the current history length.
* Examples produced by the iterator() will
* contain the last k class labels as features.
*/
public int getHistorySize(){
return historyLength;
}
@Override
public ExampleSchema getSchema(){
ExampleSchema schema=
new ExampleSchema(classNameSet
.toArray(new String[classNameSet.size()]));
if(schema.equals(ExampleSchema.BINARY_EXAMPLE_SCHEMA))
return ExampleSchema.BINARY_EXAMPLE_SCHEMA;
else
return schema;
}
/** Add a new example to the dataset. <br>
* <br>
* This method compresses the example before adding it to the
* the dataset. To prevent this compresstion call {@link #add(Example, boolean)}.
*
* @param example The example to add to the dataset.
*/
@Override
public void add(Example example){
addSequence(new Example[]{example});
}
/** Add a new example to the dataset. <br>
* <br>
* This method allows the caller to specify if they want the examples to be
* compressed or not.
*
* @param example The example to add to the dataset.
* @param compress Boolean specifying whether or not to compress the example.
*/
@Override
public void add(Example example,boolean compress){
addSequence(new Example[]{example},compress);
}
/** Add a new sequence of examples to the dataset. <br>
* <br>
* This method compresses each example before adding it to the
* the dataset. To prevent this compresstion call {@link #addSequence(Example[], boolean)}.
*/
public void addSequence(Example[] sequence){
addSequence(sequence,true);
}
/** Add a new sequence of examples to the dataset <br>
* <br>
* This method allows the caller to specify if they want the examples to be
* compressed or not.
*
* @param sequence The sequence of examples to add to the dataset
* @param compress Boolean specifying whether or not to compress the examples.
*/
public void addSequence(Example[] sequence,boolean compress){
// If the user wants to compress the examples in the sequence then
// create a new array and fill it with the compressed examples.
// Then add the new array of examples to the dataset.
if(compress){
Example[] compressedSeq=new Example[sequence.length];
for(int i=0;i<sequence.length;i++){
compressedSeq[i]=factory.compress(sequence[i]);
classNameSet.addAll(sequence[i].getLabel().possibleLabels());
}
sequenceList.add(compressedSeq);
}
// If the caller doesn't want the examples compressed then just add
// the array of examples to the dataset.
else{
sequenceList.add(sequence);
}
totalSize+=sequence.length;
}
/** Iterate over all examples, extended so as to contain history information. */
@Override
public Iterator<Example> iterator(){
return new MyIterator();
}
/** Return the number of examples. */
@Override
public int size(){
return totalSize;
}
/** Return the number of sequences. */
public int numberOfSequences(){
return sequenceList.size();
}
/** Return an iterator over all sequences.
* Each item returned by this will be of type Example[]. */
public Iterator<Example[]> sequenceIterator(){
return sequenceList.iterator();
}
/** Randomly re-order the examples. */
@Override
public void shuffle(Random r){
Collections.shuffle(sequenceList,r);
}
/** Randomly re-order the examples. */
@Override
public void shuffle(){
shuffle(new Random(0));
}
/** Make a shallow copy of the dataset. Sequences are shared, but not the
* ordering of the Sequences. */
@Override
public Dataset shallowCopy(){
SequenceDataset copy=new SequenceDataset();
copy.setHistorySize(getHistorySize());
for(Iterator<Example[]> i=sequenceList.iterator();i.hasNext();){
copy.addSequence(i.next());
}
return copy;
}
//
// split
//
@Override
public Split split(final Splitter<Example> splitter){
throw new UnsupportedOperationException("Use splitSequence instead.");
}
public Split splitSequence(final Splitter<Example[]> splitter){
splitter.split(sequenceList.iterator());
return new Split(){
@Override
public int getNumPartitions(){
return splitter.getNumPartitions();
}
@Override
public Dataset getTrain(int k){
return invertIteration(splitter.getTrain(k));
}
@Override
public Dataset getTest(int k){
return invertIteration(splitter.getTest(k));
}
};
}
protected Dataset invertIteration(Iterator<Example[]> i){
SequenceDataset copy=new SequenceDataset();
copy.setHistorySize(getHistorySize());
while(i.hasNext()){
Example[] o=i.next();
copy.addSequence(o);
}
return copy;
}
//
// iterate over examples, having added extra history fields to them
//
private class MyIterator implements Iterator<Example>{
private Iterator<Example[]> i;
private Example[] buf;
private int j;
public MyIterator(){
i=sequenceList.iterator();
if(i.hasNext())
buf=i.next();
else
buf=new Example[]{};
j=0;
}
@Override
public boolean hasNext(){
return(j<buf.length||i.hasNext());
}
@Override
public Example next(){
if(j>=buf.length){
buf=i.next();
j=0;
}
// build history
InstanceFromSequence.fillHistory(history,buf,j);
//for (int k=0; k<historyLength; k++) {
// if (j-k-1>=0) history[k] = buf[j-k-1].getLabel().bestClassName();
// else history[k] = NULL_CLASS_NAME;
//}
Example e=buf[j++];
if(e==null)
throw new IllegalStateException("null example at pos "+j+" buf "+
StringUtil.toString(buf));
return new Example(new InstanceFromSequence(e,history),e.getLabel());
}
@Override
public void remove(){
throw new UnsupportedOperationException("can't remove");
}
}
@Override
public String toString(){
StringBuffer buf=new StringBuffer("[SeqData:\n");
for(Iterator<Example[]> i=sequenceList.iterator();i.hasNext();){
Example[] seq=i.next();
for(int j=0;j<seq.length;j++){
buf.append(" "+seq[j]);
}
buf.append("\n");
}
buf.append("]");
return buf.toString();
}
//
// Implement Saveable interface.
//
static private final String FORMAT_NAME="Minorthird Sequential Dataset";
@Override
public String[] getFormatNames(){
return new String[]{FORMAT_NAME};
}
@Override
public String getExtensionFor(String s){
return ".seqdata";
}
@Override
public void saveAs(File file,String format) throws IOException{
if(!format.equals(FORMAT_NAME))
throw new IllegalArgumentException("illegal format "+format);
DatasetLoader.saveSequence(this,file);
}
@Override
public Object restore(File file) throws IOException{
try{
return DatasetLoader.loadSequence(file);
}catch(NumberFormatException ex){
throw new IllegalStateException("error loading from "+file+": "+ex);
}
}
/** A GUI view of the dataset. */
@Override
public Viewer toGUI(){
Viewer dbGui=new MyDataViewer();
dbGui.setContent(this);
Viewer seqGui=GUI.newSourcedExampleViewer();
return new ZoomedViewer(dbGui,seqGui);
}
private static class MyDataViewer extends ComponentViewer{
static final long serialVersionUID=20080207L;
@Override
public JComponent componentFor(Object o){
SequenceDataset d=(SequenceDataset)o;
final Example[] arr=new Example[d.size()];
int k=0;
for(Iterator<Example> i=d.iterator();i.hasNext();){
arr[k++]=i.next();
}
JList jList=new JList(arr);
jList.setCellRenderer(new ListCellRenderer(){
@Override
public Component getListCellRendererComponent(JList el,Object v,
int index,boolean sel,boolean focus){
return GUI.conciseExampleRendererComponent(arr[index],100,sel);
}
});
monitorSelections(jList);
return new JScrollPane(jList);
}
}
public static void main(String[] args) throws IOException{
SequenceDataset d=SampleDatasets.makeToySequenceData();
System.out.println(d.toString());
new ViewerFrame("Sequence data",d.toGUI());
if(args.length>0)
DatasetLoader.saveSequence(d,new File(args[0]));
}
public int getNumPosExamples(){
return -1;
}
}