/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.extract;
import java.io.Serializable;
import java.util.Iterator;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.iterator.PipeInputIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.Instance;
// Analogous to base.classify.Classifier
// TODO: Possibly in the future, create Document and Corpus objects.
// (This would allow calling an extractor on multiple documents in a type-safe manner.)
/**
 * Common contract for components that perform information extraction.
 * <p>
 * In the typical case an extractor pulls database records (see {@link Record})
 * out of Strings, although nothing in this interface is tied to that use.
 */
public interface Extractor extends Serializable {

    /**
     * Runs extraction on a single raw object. The object is sent
     * through this extractor's pipe before extraction takes place.
     *
     * @param o the document to extract from (often a String)
     * @return the results of performing extraction
     */
    Extraction extract(Object o);

    /**
     * Runs extraction on a document that has already been tokenized.
     * The spans are passed through the extractor's pipe.
     *
     * @param toks a tokenized document
     * @return the results of performing extraction
     */
    Extraction extract(Tokenization toks);

    /**
     * Runs extraction on a set of raw documents. Every Instance produced
     * by {@code source} is passed through both the tokenization pipe and
     * the feature extraction pipe.
     *
     * @param source a source of raw documents
     * @return the results of performing extraction
     */
    Extraction extract(Iterator<Instance> source);

    /**
     * Returns the pipe that converts an Instance into a form usable by the
     * particular extraction algorithm; such pipes often perform feature
     * extraction. The pipe expects the Instance's data field to be a
     * Tokenization. The type of raw object expected by the pipe depends
     * on the particular subclass of extractor.
     *
     * @return a pipe
     */
    Pipe getFeaturePipe();

    /**
     * Returns the pipe this extractor uses to tokenize its input.
     * The type of Instance the pipe expects is specific to the individual
     * extractor. The pipe returns an Instance whose data is a Tokenization.
     *
     * @return a pipe
     */
    Pipe getTokenizationPipe();

    /**
     * Sets the pipe this extractor uses for tokenization. The pipe should
     * take a raw object and convert it into a Tokenization.
     * <p>
     * The {@code CharSequence2TokenSequence} pipe is an example of a pipe
     * that could be used here.
     *
     * @param pipe the tokenization pipe to use
     */
    void setTokenizationPipe(Pipe pipe);

    /**
     * Returns the alphabet of features used by the extractor. The alphabet
     * maps strings describing the features to indices.
     *
     * @return the input alphabet
     */
    Alphabet getInputAlphabet();

    /**
     * Returns the alphabet of labels used by the extractor. Labels include
     * entity types (such as PERSON) and slot names (such as EMPLOYEE-OF).
     *
     * @return the target alphabet
     */
    LabelAlphabet getTargetAlphabet();
}