/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package edu.nd.nina.graph.load;

import java.rmi.dgc.VMID;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import edu.nd.nina.types.Alphabet;
import edu.nd.nina.types.AlphabetCarrying;
import edu.nd.nina.types.Instance;

/**
 * The abstract superclass of all Pipes, which transform one data type to
 * another. Pipes are most often used for feature extraction.
 * <p>
 * Although Pipe does not have any "abstract methods", in order to use a Pipe
 * subclass you must override either the {@link #pipe(Instance)} method or the
 * {@link #newIteratorFrom(Iterator)} method. The former is appropriate when
 * the pipe's processing of an Instance is strictly one-to-one: for every
 * Instance coming in, there is exactly one Instance coming out. The latter is
 * appropriate when the pipe's processing may result in more or fewer Instances
 * than arrive through its source iterator.
 * <p>
 * A pipe operates on an {@link Instance}, which is a carrier of data. A pipe
 * reads from and writes to fields in the Instance when it is requested to
 * process the instance. It is up to the pipe which fields in the Instance it
 * reads from and writes to, but usually a pipe will read its input from and
 * write its output to the "data" field of an instance.
 * <p>
 * A pipe doesn't have any direct notion of input or output - it merely modifies
 * instances that are handed to it. A set of helper classes, which implement the
 * interface {@code Iterator<Instance>}, iterate over commonly encountered input
 * data structures and feed the elements of these data structures to a pipe as
 * instances.
 * <p>
 * A pipe is frequently used in conjunction with an {@code InstanceList}. As
 * instances are added to the list, they are processed by the pipe associated
 * with the instance list and the processed Instance is kept in the list.
 * <p>
 * In one common usage, a {@code FileIterator} is given a list of directories to
 * operate over. The FileIterator walks through each directory, creating an
 * instance for each file and putting the data from the file in the data field
 * of the instance. The directory of the file is stored in the target field of
 * the instance. The FileIterator feeds instances to an InstanceList, which
 * processes the instances through its associated pipe and keeps the results.
 * <p>
 * Pipes can be hierarchically composed. In a typical usage, a SerialPipe is
 * created, which holds other pipes in an ordered list. Piping an instance
 * through a SerialPipe means piping the instance through each of the child
 * pipes in sequence.
 * <p>
 * A pipe holds two separate Alphabets: one for the symbols (feature names)
 * encountered in the data fields of the instances processed through the pipe,
 * and one for the symbols (e.g. class labels) encountered in the target fields.
 *
 * @author Andrew McCallum <a
 *         href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public abstract class Pipe implements AlphabetCarrying {

    /** Alphabet for symbols found in the data field; {@code null} until set. */
    Alphabet dataAlphabet = null;

    /** Alphabet for symbols found in the target field; {@code null} until set. */
    Alphabet targetAlphabet = null;

    // Not read or written anywhere in this class; kept for code elsewhere in
    // the package that may rely on them.
    boolean dataAlphabetResolved = false;
    boolean targetAlphabetResolved = false;

    /** Whether this pipe looks for and processes the target field. */
    boolean targetProcessing = true;

    // used in readResolve to distinguish persistent instances
    VMID instanceId = new VMID();

    /**
     * Construct a pipe with no data and target dictionaries.
     */
    public Pipe() {
        this(null, null);
    }

    /**
     * Construct a pipe with the given data and target dictionaries. Passing
     * {@code null} for either argument leaves the corresponding alphabet
     * unset; this constructor never creates an alphabet on its own.
     *
     * @param dataDict
     *            Alphabet that will be used as the data dictionary, or
     *            {@code null}.
     * @param targetDict
     *            Alphabet that will be used as the target dictionary, or
     *            {@code null}.
     */
    public Pipe(Alphabet dataDict, Alphabet targetDict) {
        this.dataAlphabet = dataDict;
        this.targetAlphabet = targetDict;
    }

    /**
     * Process a single Instance. This base implementation always throws;
     * subclasses whose processing is one-to-one override it.
     * <p>
     * TODO Really this should be 'protected', but isn't for historical reasons.
     *
     * @param inst
     *            the instance to process
     * @return the processed instance
     * @throws UnsupportedOperationException
     *             always, unless overridden by a subclass
     */
    public Instance pipe(Instance inst) {
        throw new UnsupportedOperationException(
                "Pipes of class "
                        + this.getClass().getName()
                        + " do not guarantee one-to-one mapping of Instances. Use 'newIteratorFrom' method instead.");
    }

    /**
     * Set whether input is taken from target field of instance during
     * processing. If argument is false, don't expect to find input material for
     * the target. By default, this is true.
     *
     * @param lookForAndProcessTarget
     *            the new value of the target-processing flag
     */
    public void setTargetProcessing(boolean lookForAndProcessTarget) {
        targetProcessing = lookForAndProcessTarget;
    }

    /**
     * Return true iff this pipe expects and processes information in the
     * <tt>target</tt> slot.
     *
     * @return the current value of the target-processing flag
     */
    public boolean isTargetProcessing() {
        return targetProcessing;
    }

    /**
     * Given an iterator over Instances, return a new iterator whose instances
     * have also been processed by this pipe. The default implementation wraps
     * {@code source} so that each call to {@code next()} passes the next source
     * instance through {@link #pipe(Instance)}.
     * <p>
     * NOTE(review): earlier documentation asked overriders to obey a
     * {@code skipIfFalse(Instance)} method; no such method is defined in this
     * class.
     *
     * @param source
     *            the iterator supplying unprocessed instances
     * @return an iterator yielding the processed instances
     */
    public Iterator<Instance> newIteratorFrom(Iterator<Instance> source) {
        return new SimplePipeInstanceIterator(source);
    }

    /**
     * If this Pipe produces objects that use an Alphabet, this method returns
     * that dictionary. Even if this particular Pipe doesn't use an Alphabet it
     * may return non-null if objects passing through it use a dictionary.
     * <p>
     * In this base class the method simply returns the stored data alphabet.
     * The original authors advise not calling it until the dictionary is
     * really needed, because it may set off a chain of events that "resolve"
     * the dictionaries of an entire pipeline, and generally this resolution
     * should not take place until the pipeline is completely in place and
     * pipe() is being called.
     * <p>
     * xxx Perhaps desire to wait until pipe() is being called is unrealistic
     * and unnecessary.
     *
     * @return the data alphabet, possibly {@code null}
     */
    public Alphabet getDataAlphabet() {
        return dataAlphabet;
    }

    /**
     * @return the target alphabet, possibly {@code null}
     */
    public Alphabet getTargetAlphabet() {
        return targetAlphabet;
    }

    /**
     * @return the same value as {@link #getDataAlphabet()}
     */
    public Alphabet getAlphabet() {
        return getDataAlphabet();
    }

    /**
     * @return a new two-element list holding the data alphabet followed by the
     *         target alphabet; either element may be {@code null}
     */
    public List<Alphabet> getAlphabets() {
        List<Alphabet> alphabets = new ArrayList<Alphabet>(2);
        alphabets.add(getDataAlphabet());
        alphabets.add(getTargetAlphabet());
        return alphabets;
    }

    /**
     * Check whether another alphabet carrier uses the same data and target
     * alphabets as this pipe. The comparison is null-safe: an unset alphabet
     * matches only another unset alphabet.
     *
     * @param object
     *            the carrier whose alphabets are compared with this pipe's
     * @return true iff {@code object} reports exactly two alphabets and they
     *         equal this pipe's data and target alphabets, in that order
     */
    public boolean alphabetsMatch(AlphabetCarrying object) {
        List<Alphabet> theirs = object.getAlphabets();
        return theirs.size() == 2
                && sameAlphabet(theirs.get(0), getDataAlphabet())
                && sameAlphabet(theirs.get(1), getTargetAlphabet());
    }

    /** Null-safe equality; {@code theirs.equals(ours)} when theirs is set. */
    private static boolean sameAlphabet(Alphabet theirs, Alphabet ours) {
        return theirs == null ? ours == null : theirs.equals(ours);
    }

    /**
     * Replace the data alphabet. Replacement is allowed only while the current
     * data alphabet is unset or empty.
     *
     * @param dDict
     *            the new data alphabet
     * @throws IllegalStateException
     *             if this pipe already has a non-empty data alphabet
     */
    public void setDataAlphabet(Alphabet dDict) {
        if (isDataAlphabetSet()) {
            throw new IllegalStateException(
                    "Can't set this Pipe's Data Alphabet; it already has one.");
        }
        dataAlphabet = dDict;
    }

    /**
     * @return true iff the data alphabet is non-null and has at least one
     *         entry
     */
    public boolean isDataAlphabetSet() {
        return dataAlphabet != null && dataAlphabet.size() > 0;
    }

    /**
     * Adopt {@code a} as the data alphabet if none is set; otherwise verify
     * that it equals the existing one.
     *
     * @param a
     *            the alphabet to adopt or check against
     * @throws IllegalStateException
     *             if a data alphabet is already set and does not equal
     *             {@code a}
     */
    public void setOrCheckDataAlphabet(Alphabet a) {
        if (dataAlphabet == null) {
            dataAlphabet = a;
        } else if (!dataAlphabet.equals(a)) {
            throw new IllegalStateException("Data alphabets do not match");
        }
    }

    /**
     * Set the target alphabet. Unlike {@link #setDataAlphabet(Alphabet)}, an
     * existing target alphabet blocks replacement even when it is empty.
     *
     * @param tDict
     *            the new target alphabet
     * @throws IllegalStateException
     *             if this pipe already has a target alphabet
     */
    public void setTargetAlphabet(Alphabet tDict) {
        if (targetAlphabet != null) {
            throw new IllegalStateException(
                    "Can't set this Pipe's Target Alphabet; it already has one.");
        }
        targetAlphabet = tDict;
    }

    /**
     * Adopt {@code a} as the target alphabet if none is set; otherwise verify
     * that it equals the existing one.
     *
     * @param a
     *            the alphabet to adopt or check against
     * @throws IllegalStateException
     *             if a target alphabet is already set and does not equal
     *             {@code a}
     */
    public void setOrCheckTargetAlphabet(Alphabet a) {
        if (targetAlphabet == null) {
            targetAlphabet = a;
        } else if (!targetAlphabet.equals(a)) {
            throw new IllegalStateException("Target alphabets do not match");
        }
    }

    /**
     * Adopt {@code a} as the data alphabet when this pipe has none; otherwise
     * do nothing.
     *
     * @param a
     *            the data alphabet being announced
     */
    protected void preceedingPipeDataAlphabetNotification(Alphabet a) {
        if (dataAlphabet == null) {
            dataAlphabet = a;
        }
    }

    /**
     * Adopt {@code a} as the target alphabet when this pipe has none;
     * otherwise do nothing.
     *
     * @param a
     *            the target alphabet being announced
     */
    protected void preceedingPipeTargetAlphabetNotification(Alphabet a) {
        if (targetAlphabet == null) {
            targetAlphabet = a;
        }
    }

    /**
     * For debugging.
     *
     * @return the VMID created when this pipe object was constructed
     */
    public VMID getInstanceId() {
        return instanceId;
    }

    /**
     * The InstanceIterator used to implement the one-to-one pipe() method
     * behavior: every instance drawn from the source iterator is handed to the
     * enclosing pipe's {@link Pipe#pipe(Instance)} before being returned.
     */
    private class SimplePipeInstanceIterator implements Iterator<Instance> {

        private final Iterator<Instance> source;

        public SimplePipeInstanceIterator(Iterator<Instance> source) {
            this.source = source;
        }

        @Override
        public boolean hasNext() {
            return source.hasNext();
        }

        @Override
        public Instance next() {
            Instance input = source.next();
            return pipe(input);
        }

        /**
         * Return the {@link Pipe} that processes {@link Instance}s going
         * through this iterator, i.e. the enclosing pipe.
         */
        public Pipe getPipe() {
            return Pipe.this;
        }

        /**
         * @return the iterator this one draws unprocessed instances from
         */
        public Iterator<Instance> getSourceIterator() {
            return source;
        }

        /**
         * Removal is not supported.
         *
         * @throws UnsupportedOperationException
         *             always, as the Iterator contract prescribes for
         *             iterators that cannot remove
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("Not supported.");
        }
    }
}