package edu.cmu.minorthird.text.learn;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.cmu.minorthird.classify.Feature;
import edu.cmu.minorthird.classify.Instance;
import edu.cmu.minorthird.classify.MutableInstance;
import edu.cmu.minorthird.text.AnnotatorLoader;
import edu.cmu.minorthird.text.EmptyLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.StopWords;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.Token;
/**
* A Feature Extractor which converts a Span to an Instance.
*
* <p>
* Typical use of this would be something like the following: <code><pre>
* SpanFE fe=new SpanFE(){
*
* public void extractFeatures(TextLabels labels,Span span){
* from(span).tokens().emit();
* from(span).left().subSpan(-2,2).emit();
* from(span).right().subSpan(0,2).emit();
* from(span).right().contains("obj").emit();
* }
* };
*
* Instance inst=fe.extractInstance(labels,span);
* </pre></code> Generally, to use this class, one subclasses it and implements the
* extractFeatures method, using a chain of feature-extracting actions which
* starts with 'from' and ends with 'emit'.
* <p>
* The methods tokens(), subSpan(), and so on are defined in subclasses of
* SpanFE.Result, and are summarized here.
* <ul>
* <li> result.trace() - prints some stuff to stdout by calling
* SpanFE.trace(result). SpanFE.trace can be overridden for different behavior.
* <li> result.emit() - ends a feature extraction pipeline by calling
* SpanFE.emit(result), which can be overridden.
* <li> result.left() - if result contains a single span, find the left context
* of that span (a span containing all tokens before it).
* <li> result.right() - if result contains a single span, find the right
* context of that span (all tokens after it).
* <li> result.contains(String type) - if result contains a single span, find
* the set of all spans of given type contained by that span.
* <li> result.subSpan(int lo,int len) - if result contains a single span, find
* the appropriate subspan of that span.
* <li> result.tokens() - if result contains a single span, find the set of all
* tokens contained in that span (a 'bag of words'). Extends to a set of spans as
* well.
* <li> result.token(int i) - if result contains a single span, construct the
* set containing the i-th token only.
* <li> result.first(), result.last() - return the first/last element of a set
* of Spans.
* <li> result.eq() - for a set of tokens, construct a set of features of the
* form 'x y z eq v' where v is the value of the token and 'x y z' is the path
* of feature extraction steps needed to get to set of tokens.
* </ul>
*
*
* @author William Cohen
*/
abstract public class SpanFE implements SpanFeatureExtractor,MixupCompatible,
Serializable{
// for serialization
static private final long serialVersionUID=20080306L;
/**
* Store features as binary, whenever possible, even if occurrence counts are
* ignored.
*/
static public final int STORE_AS_BINARY=1;
/** Store features as numeric counts, whenever possible */
static public final int STORE_AS_COUNTS=2;
/**
* Store features as binary or counts, trying to reduce storage while
* maintaining information.
*/
static public final int STORE_COMPACTLY=3;
private int featureStoragePolicy=STORE_AS_COUNTS;
// buffers for intermediate results & inputs in feature extraction
transient protected MutableInstance instance;
transient private TextLabels textLabels=new EmptyLabels();
protected String requiredAnnotation=null;
protected String requiredAnnotationFileToLoad=null;
protected AnnotatorLoader annotatorLoader=null;
/**
* Create a feature extractor. No required annotation is configured and the
* feature storage policy starts out as STORE_AS_COUNTS.
*/
public SpanFE(){
}
//
// getters and setters
//
/**
 * Choose how emitted features are stored on the instance being built.
 *
 * @param p
 *          one of SpanFE.STORE_AS_BINARY, SpanFE.STORE_AS_COUNTS or
 *          SpanFE.STORE_COMPACTLY; the value is stored as given, without
 *          validation
 */
public void setFeatureStoragePolicy(int p){
  featureStoragePolicy=p;
}
/**
* Simultaneously specify an annotator to run before feature generation and a
* mixup file or class that generates it.
*
* @param requiredAnnotation
* annotation name, forwarded to setRequiredAnnotation(String)
* @param annotationProvider
* mixup file or class name, forwarded to setAnnotationProvider(String)
*/
public void setRequiredAnnotation(String requiredAnnotation,
String annotationProvider){
setRequiredAnnotation(requiredAnnotation);
setAnnotationProvider(annotationProvider);
}
//
// simpler getter-setter interface, e.g. for GUI configuration
//
/**
* Specify an annotator to run before feature generation.
*
* @param requiredAnnotation
* name of the annotation; stored as given (a null value is reported as ""
* by getRequiredAnnotation())
*/
@Override
public void setRequiredAnnotation(String requiredAnnotation){
this.requiredAnnotation=requiredAnnotation;
}
/**
 * @return the name of the annotation required before feature generation, or
 *         the empty string when none has been set
 */
@Override
public String getRequiredAnnotation(){
  if(requiredAnnotation==null){
    return "";
  }
  return requiredAnnotation;
}
/**
 * Specify the mixup file or java class that provides the required
 * annotation.
 *
 * @param classNameOrMixupFileName
 *          class name or mixup file name; stored as given
 */
public void setAnnotationProvider(String classNameOrMixupFileName){
  requiredAnnotationFileToLoad=classNameOrMixupFileName;
}
/**
 * @return the mixup file or class name that provides the required
 *         annotation, or the empty string when none has been set
 */
public String getAnnotationProvider(){
  if(requiredAnnotationFileToLoad==null){
    return "";
  }
  return requiredAnnotationFileToLoad;
}
/**
 * Set the loader that is handed to TextLabels.require when the required
 * annotation is requested.
 */
@Override
public void setAnnotatorLoader(AnnotatorLoader newLoader){
  annotatorLoader=newLoader;
}
//
// preprocessing for extraction
//
/**
 * Make sure the required annotation is present by asking the labels for it,
 * passing along the configured annotation name, provider and loader.
 *
 * @param labels
 *          the labels on which the annotation is required
 */
public void requireMyAnnotation(TextLabels labels){
  final String annotation=requiredAnnotation;
  final String provider=requiredAnnotationFileToLoad;
  labels.require(annotation,provider,annotatorLoader);
}
//
// extraction
//
// /** @deprecated Use extractInstance(TextLabels labels,Span s) */
// final public Instance extractInstance(Span span){
// instance=new MutableInstance(span,span.getDocumentGroupId());
// extractFeatures(span);
// return instance;
// }
/**
 * Extract an Instance from a span.
 * <p>
 * A fresh MutableInstance is created for the span, the given labels become
 * the labels consulted by the pipeline steps, and
 * extractFeatures(labels,span) is then run to populate the instance.
 *
 * @param labels
 *          labels used by steps such as contains() and prop()
 * @param span
 *          the span to convert
 * @return the instance held in the 'instance' field after extraction
 */
@Override
final public Instance extractInstance(TextLabels labels,Span span){
  this.instance=new MutableInstance(span,span.getDocumentGroupId());
  this.textLabels=labels;
  this.extractFeatures(labels,span);
  return this.instance;
}
/**
 * Starts a 'pipeline' of extraction steps whose resulting features are added
 * to the instance being built by this extractor.
 * <p>
 * As an example: <code>fe.from(s).tokens().eq().emit()</code> adds
 * bag-of-words type features.
 *
 * @param s
 *          the span the pipeline starts from
 * @return a SpanResult with an empty name path, bound to this extractor
 */
final public SpanResult from(Span s){
  String[] emptyPath=new String[0];
  return new SpanResult(emptyPath,this,s);
}
/**
 * Starts a 'pipeline' of extraction steps bound to the given buffer instead
 * of to a SpanFE subclass instance.
 *
 * <p>
 * This is intended to be used as an alternative to using the SpanFE class to
 * build a Span2Instance converter, eg
 *
 * <pre><code>
 * fe=new Span2Instance(){
 *
 *   public extractInstance(Span s){
 *     FeatureBuffer buf=new FeatureBuffer(s);
 *     SpanFE.from(s,buf).tokens().emit();
 *     SpanFE.from(s,buf).left().subSpan(-2,2).emit();
 *     SpanFE.from(s,buf).right().subSpan(0,2).emit();
 *     buf.getInstance();
 *   }
 * }
 * </code></pre>
 *
 * @param s
 *          the span the pipeline starts from
 * @param buffer
 *          the buffer that plays the role of the feature extractor for this
 *          pipeline
 * @return a SpanResult with an empty name path, bound to the buffer
 */
final static public SpanResult from(Span s,FeatureBuffer buffer){
  String[] emptyPath=new String[0];
  return new SpanResult(emptyPath,buffer,s);
}
/**
 * Called by some SpanFE.Result subclasses when a 'pipeline' of extraction
 * steps is ended with a StringBagResult.
 * <p>
 * One feature is added per string returned by the bag's iterator; the
 * feature name is the result's path extended with that string. Under
 * STORE_AS_BINARY the feature is always binary. Under STORE_COMPACTLY it is
 * binary only when the string's count is 1. In every other case the count is
 * stored as a numeric value.
 */
public void emit(StringBagResult result){
  Bag<String> values=result.asBag();
  Iterator<String> it=values.iterator();
  while(it.hasNext()){
    String value=it.next();
    Feature feature=new Feature(result.extend(value));
    if(featureStoragePolicy==STORE_AS_BINARY){
      instance.addBinary(feature);
      continue;
    }
    int count=values.getCount(value);
    boolean compactBinary=featureStoragePolicy==STORE_COMPACTLY&&count==1;
    if(compactBinary){
      instance.addBinary(feature);
    }else{
      instance.addNumeric(feature,count);
    }
  }
}
/**
 * Called by some SpanFE.Result subclass when a 'pipeline' of extraction steps
 * is ended with a TokenSetResult. The tokens are converted to their string
 * values with eq() and emitted as a StringBagResult.
 */
public void emit(TokenSetResult result){
  StringBagResult tokenValues=result.eq();
  emit(tokenValues);
}
/**
 * Called by some SpanFE.Result subclass when a 'pipeline' of extraction steps
 * is ended with a SpanSetResult. The spans are reduced to their tokens, which
 * are then emitted as a TokenSetResult.
 */
public void emit(SpanSetResult result){
  TokenSetResult allTokens=result.tokens();
  emit(allTokens);
}
/**
 * Called by some SpanFE.Result subclass when a 'pipeline' of extraction steps
 * is ended with a SpanResult. The span is reduced to its tokens, which are
 * then emitted as a TokenSetResult.
 */
public void emit(SpanResult result){
  TokenSetResult spanTokens=result.tokens();
  emit(spanTokens);
}
/**
 * Legacy single-argument entry point. It always fails: feature pipelines
 * belong in extractFeatures(TextLabels,Span), where each pipeline typically
 * starts with 'from(span)' and ends with 'emit()'.
 *
 * @throws IllegalStateException
 *           always
 */
public void extractFeatures(Span span){
  final String hint=
      "you probably meant to use extractFeatures(labels,span) instead";
  throw new IllegalStateException(hint);
}
/**
* Implement this with a specific set of SpanFE 'pipelines'. Each pipeline
* will typically start with 'from(span)' and end with 'emit()'.
* <p>
* extractInstance(labels,span) calls this after it has created the instance
* for the span and recorded the labels.
*
* @param labels
* the labels passed to extractInstance
* @param span
* the span whose features should be emitted
*/
abstract public void extractFeatures(TextLabels labels,Span span);
/**
 * Prints the result's name path (each step preceded by a space) followed by
 * " -> " and the result itself on stdout. Subclass this to change the tracing
 * behavior.
 */
public void trace(Result result){
  for(String step:result.getName()){
    System.out.print(" "+step);
  }
  System.out.println(" -> "+result);
}
//
// SpanFE.Result classes
//
/**
 * Encodes an intermediate result of the SpanFE process: the path of
 * extraction steps taken so far plus the extractor the pipeline is bound to.
 */
static abstract public class Result{

  /** Names of the extraction steps that led to this result. */
  protected String[] name;

  /** The extractor that receives trace() and emit() calls. */
  protected SpanFE fe;

  /**
   * @throws IllegalArgumentException
   *           if fe is null
   */
  public Result(String[] name,SpanFE fe){
    if(fe==null){
      throw new IllegalArgumentException("null fe");
    }
    this.name=name;
    this.fe=fe;
  }

  /** Returns a copy of this result's name path with one more step appended. */
  public String[] extend(String addition){
    return extend(name,addition);
  }

  /** Returns a new array holding the given path followed by the addition. */
  public String[] extend(String[] partial,String addition){
    int oldLength=partial.length;
    String[] extension=new String[oldLength+1];
    System.arraycopy(partial,0,extension,0,oldLength);
    extension[oldLength]=addition;
    return extension;
  }

  /** Hands this result to the extractor's trace method; used by trace(). */
  protected Result doTrace(){
    fe.trace(this);
    return this;
  }

  /** Returns the name path array itself (not a copy). */
  public String[] getName(){
    return name;
  }

  /** Terminates a feature extraction pipeline by actually emitting features. */
  abstract public void emit();
}
/**
 * An intermediate result of a SpanFE process where the object being operated
 * on is a Set of something.
 */
abstract static public class SetResult<T> extends Result{

  /** The elements this result currently holds. */
  protected SortedSet<T> set;

  /**
   * @throws IllegalArgumentException
   *           if fe or set is null
   */
  public SetResult(String[] name,SpanFE fe,SortedSet<T> set){
    super(name,fe);
    if(set==null){
      throw new IllegalArgumentException("null set");
    }
    this.set=set;
  }

  /**
   * Convert to a plain old set. The underlying set is returned, not a copy.
   */
  public Set<T> asSet(){
    return set;
  }

  /**
   * Filter the set using a user-defined filter: the returned set holds the
   * elements for which the filter's match method returns true.
   */
  protected SortedSet<T> applyFilter(Filter f){
    SortedSet<T> accepted=new TreeSet<T>();
    for(T element:set){
      if(!f.match(element)){
        continue;
      }
      accepted.add(element);
    }
    return accepted;
  }

  /**
   * Modify each element in the set using a user-defined function and collect
   * the outcomes in a new sorted set.
   */
  protected SortedSet<T> mapFunction(Function f){
    SortedSet<T> mapped=new TreeSet<T>();
    for(T element:set){
      mapped.add(f.apply(element));
    }
    return mapped;
  }
}
/**
 * An intermediate result of a SpanFE process where a single span is being
 * processed.
 */
static public class SpanResult extends Result{

  private Span s;

  public SpanResult(String[] name,SpanFE fe,Span s){
    super(name,fe);
    this.s=s;
  }

  /** Passes this result to the extractor's trace method and returns it. */
  public SpanResult trace(){
    return (SpanResult)doTrace();
  }

  @Override
  public void emit(){
    fe.emit(this);
  }

  @Override
  public String toString(){
    return "[SpanResult: "+s+"]";
  }

  /** Returns the span held by this result. */
  public Span getSpan(){
    return s;
  }

  /**
   * Move to the span consisting of all tokens in the same document that
   * precede the current span.
   */
  public SpanResult left(){
    Span document=s.documentSpan();
    Span lSpan=document.subSpan(0,s.documentSpanStartIndex());
    return new SpanResult(extend("left"),fe,lSpan);
  }

  /**
   * Move to the span consisting of all tokens in the same document that
   * follow the current span.
   */
  public SpanResult right(){
    Span document=s.documentSpan();
    // index of the first document token after the current span
    int afterSpan=s.documentSpanStartIndex()+s.size();
    Span rSpan=document.subSpan(afterSpan,document.size()-afterSpan);
    return new SpanResult(extend("right"),fe,rSpan);
  }

  /**
   * Move to the document containing this span.
   */
  public SpanResult doc(){
    return new SpanResult(extend("doc"),fe,s.documentSpan());
  }

  /**
   * Move to a set of all spans of the named type that are contained by the
   * current span. Candidates are the spans of that type which the
   * extractor's labels report for this span's document.
   */
  public SpanSetResult contains(String type){
    SortedSet<Span> contained=new TreeSet<Span>();
    Iterator<Span> candidates=
        fe.textLabels.instanceIterator(type,s.getDocumentId());
    while(candidates.hasNext()){
      Span other=candidates.next();
      if(s.contains(other)){
        contained.add(other);
      }
    }
    return new SpanSetResult(extend("contains_"+type),fe,contained);
  }

  /**
   * Move to the specified subspan of the current span. Out-of-range start
   * indices and overlong lengths are trimmed to a valid size. Negative
   * indices mean to extract a subspan from the end of the current span, e.g.,
   * subSpan(-2,2) means to extract a span containing the last two tokens.
   * <p>
   * An empty span is returned unchanged. The step name records the trimmed
   * start and length ("subspan_lo_len" for lo&gt;=0, "subspanNeg_lo_len"
   * otherwise). A negative len is not trimmed and is passed on as given.
   */
  public SpanResult subSpan(int lo,int len){
    int size=s.size();
    if(size==0){
      return this;
    }
    // lo is either >=0 or <0, so exactly one of the two cases applies; the
    // former trailing "illegal indices" branch could never be reached.
    String prefix;
    int start;
    if(lo>=0){
      prefix="subspan_";
      start=Math.min(lo,size-1);
    }else{
      prefix="subspanNeg_";
      start=Math.max(size+lo,0);
    }
    int length=Math.min(size-start,len);
    return new SpanResult(extend(prefix+start+"_"+length),fe,s.subSpan(start,
        length));
  }

  /** Move to the set of all tokens contained by this span. */
  public TokenSetResult tokens(){
    SortedSet<Token> all=new TreeSet<Token>();
    int count=s.size();
    for(int i=0;i<count;i++){
      all.add(s.getToken(i));
    }
    return new TokenSetResult(extend("tokens"),fe,all);
  }

  /**
   * Move to the specified token inside the span. A negative index means to
   * count from the end. An invalid index will result in an empty
   * TokenSetResult.
   */
  public TokenSetResult token(int index){
    boolean fromEnd=index<0;
    int position=fromEnd?s.size()+index:index;
    String step=fromEnd?"tokenNeg_"+(-index):"token_"+index;
    SortedSet<Token> single=new TreeSet<Token>();
    if(position>=0&&position<s.size()){
      single.add(s.getToken(position));
    }
    return new TokenSetResult(extend(step),fe,single);
  }

  /** Move to the string value of the span. */
  public StringBagResult eq(){
    Bag<String> value=new Bag<String>();
    value.add(s.asString());
    return new StringBagResult(extend("eq"),fe,value);
  }

  /**
   * Make length of the span a feature. Eg feature is #tokens=3 for a 3-token
   * span. The name path is not extended by this step.
   */
  public StringBagResult size(){
    Bag<String> lengthBag=new Bag<String>();
    lengthBag.add("#tokens",s.size());
    return new StringBagResult(name,fe,lengthBag);
  }

  /**
   * Make exact length of span a feature. Eg, feature is #tokens_3=1 for a
   * 3-token span, #tokens_2=1 for a two-token span. The name path is not
   * extended by this step.
   */
  public StringBagResult exactSize(){
    Bag<String> lengthBag=new Bag<String>();
    lengthBag.add("#tokens_"+s.size());
    return new StringBagResult(name,fe,lengthBag);
  }
}
/**
 * An intermediate result of a SpanFE process where the object being operated
 * on is a set of spans.
 */
static public class SpanSetResult extends SetResult<Span>{

  public SpanSetResult(String[] name,SpanFE fe,SortedSet<Span> set){
    super(name,fe,set);
  }

  /** Passes this result to the extractor's trace method and returns it. */
  public SpanSetResult trace(){
    return (SpanSetResult)doTrace();
  }

  @Override
  public void emit(){
    fe.emit(this);
  }

  @Override
  public String toString(){
    return "[SpanSetResult: "+set+"]";
  }

  /**
   * Move to the first span in the set (an empty set stays empty).
   */
  public SpanSetResult first(){
    SortedSet<Span> firstOnly=new TreeSet<Span>();
    if(!set.isEmpty()){
      firstOnly.add(set.first());
    }
    return new SpanSetResult(extend("first"),fe,firstOnly);
  }

  /**
   * Move to the last span in the set (an empty set stays empty).
   */
  public SpanSetResult last(){
    SortedSet<Span> lastOnly=new TreeSet<Span>();
    if(!set.isEmpty()){
      lastOnly.add(set.last());
    }
    return new SpanSetResult(extend("last"),fe,lastOnly);
  }

  /**
   * Find the set of all tokens contained by any span in the set.
   */
  public TokenSetResult tokens(){
    SortedSet<Token> accum=new TreeSet<Token>();
    for(Span member:set){
      SpanResult single=new SpanResult(name,fe,member);
      accum.addAll(single.tokens().asSet());
    }
    return new TokenSetResult(extend("tokens"),fe,accum);
  }

  /** Move to a bag of the string values of all spans in the set. */
  public StringBagResult eq(){
    Bag<String> values=new Bag<String>();
    for(Span member:set){
      values.add(member.asString());
    }
    return new StringBagResult(extend("eq"),fe,values);
  }

  /** Keep only the spans accepted by the filter. */
  public SpanSetResult filter(Filter f){
    String step="filter_"+f.getName();
    return new SpanSetResult(extend(step),fe,applyFilter(f));
  }

  /** Replace each span in the set by the outcome of applying the function. */
  public SpanSetResult map(Function f){
    String step="map_"+f.getName();
    return new SpanSetResult(extend(step),fe,mapFunction(f));
  }
}
/**
 * An intermediate result of a SpanFE process where the object being operated
 * on is a set of tokens.
 */
static public class TokenSetResult extends SetResult<Token>{

  public TokenSetResult(String[] name,SpanFE fe,SortedSet<Token> set){
    super(name,fe,set);
  }

  /** Passes this result to the extractor's trace method and returns it. */
  public TokenSetResult trace(){
    return (TokenSetResult)doTrace();
  }

  @Override
  public void emit(){
    fe.emit(this);
  }

  @Override
  public String toString(){
    return "[TokenSetResult: "+set+"]";
  }

  /** Find all values of the tokens in this set. */
  public StringBagResult eq(){
    Bag<String> values=new Bag<String>();
    for(Token token:set){
      values.add(token.getValue());
    }
    return new StringBagResult(extend("eq"),fe,values);
  }

  /**
   * Find the value of some given property for each token, as reported by the
   * extractor's labels. Tokens without a value for the property contribute
   * nothing.
   */
  public StringBagResult prop(String property){
    Bag<String> values=new Bag<String>();
    for(Token token:set){
      String value=fe.textLabels.getProperty(token,property);
      if(value==null){
        continue;
      }
      values.add(value);
    }
    return new StringBagResult(extend(property),fe,values);
  }

  /** Keep only the tokens that have the property set to a non-null value. */
  public TokenSetResult hasProp(String property){
    SortedSet<Token> kept=new TreeSet<Token>();
    for(Token token:set){
      if(fe.textLabels.getProperty(token,property)!=null){
        kept.add(token);
      }
    }
    return new TokenSetResult(extend("hasProp_"+property),fe,kept);
  }

  /**
   * Keep only the tokens that have the property set to some particular
   * value. A targetValue of 'null' keeps the tokens for which the property
   * value is null; the step name then uses "NULL" in place of the value.
   */
  public TokenSetResult hasProp(String property,String targetValue){
    SortedSet<Token> kept=new TreeSet<Token>();
    for(Token token:set){
      String value=fe.textLabels.getProperty(token,property);
      boolean matches;
      if(targetValue==null){
        matches=value==null;
      }else{
        matches=targetValue.equals(value);
      }
      if(matches){
        kept.add(token);
      }
    }
    String targetValueTag=targetValue;
    if(targetValueTag==null){
      targetValueTag="NULL";
    }
    return new TokenSetResult(extend("hasProp_"+property+"_"+targetValueTag),
        fe,kept);
  }
}
/**
 * An intermediate result of a SpanFE process where the object being operated
 * on is a bag of strings, i.e. strings with occurrence counts.
 */
static public class StringBagResult extends SetResult<String>{

  private Bag<String> bag;

  public StringBagResult(String[] name,SpanFE fe,Bag<String> bag){
    super(name,fe,bag.asSet());
    this.bag=bag;
  }

  @Override
  public void emit(){
    fe.emit(this);
  }

  /** Passes this result to the extractor's trace method and returns it. */
  public StringBagResult trace(){
    return (StringBagResult)doTrace();
  }

  @Override
  public String toString(){
    return "[StringBagResult: "+bag+"]";
  }

  /** Returns the underlying bag (not a copy). */
  public Bag<String> asBag(){
    return bag;
  }

  /** Lower-case every string, carrying its count over to the result. */
  public StringBagResult lc(){
    Bag<String> lcBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      lcBag.add(str.toLowerCase(),bag.getCount(str));
    }
    return new StringBagResult(extend("lc"),fe,lcBag);
  }

  /** Replace every string by the given constant, carrying counts over. */
  public StringBagResult toConst(String replacement){
    Bag<String> trBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      trBag.add(replacement,bag.getCount(i.next()));
    }
    return new StringBagResult(extend("toConst"),fe,trBag);
  }

  /**
   * Apply String.replaceAll(regex,replacement) to every string, carrying
   * counts over.
   */
  public StringBagResult tr(String regex,String replacement){
    Bag<String> trBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      trBag.add(str.replaceAll(regex,replacement),bag.getCount(str));
    }
    return new StringBagResult(extend("tr/"+regex+"/"+replacement),fe,trBag);
  }

  /**
   * Map every upper-case ASCII letter to 'A', every lower-case ASCII letter
   * to 'a' and every ASCII digit to '0'; other characters are kept.
   */
  public StringBagResult charTypes(){
    Bag<String> trBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      String charTypes=
          str.replaceAll("[A-Z]","A").replaceAll("[a-z]","a").replaceAll(
              "[0-9]","0");
      trBag.add(charTypes,bag.getCount(str));
    }
    return new StringBagResult(extend("charTypes"),fe,trBag);
  }

  /**
   * Like charTypes(), but each run of upper-case letters becomes "X+", each
   * run of lower-case letters "x+" and each run of digits "9+".
   */
  public StringBagResult charTypePattern(){
    Bag<String> trBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      String pattern=
          str.replaceAll("[A-Z]+","X+").replaceAll("[a-z]+","x+").replaceAll(
              "[0-9]+","9+");
      trBag.add(pattern,bag.getCount(str));
    }
    return new StringBagResult(extend("charTypePattern"),fe,trBag);
  }

  /**
   * Removes punctuation and numbers: only strings that contain no non-word
   * character and no digit are kept.
   */
  public StringBagResult punk(){
    // compiled once for the whole bag rather than once per string
    Pattern nonWordOrDigit=Pattern.compile("[\\W\\d]+");
    Bag<String> punkBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      int n=bag.getCount(str);
      Matcher m=nonWordOrDigit.matcher(str);
      if(!m.find()){
        punkBag.add(str,n);
      }
    }
    return new StringBagResult(extend("punk"),fe,punkBag);
  }

  /**
   * Use or remove the words listed in StopWords.LONG.
   *
   * @param action
   *          "use" keeps only strings that are in the list, "remove" keeps
   *          only strings that are not (both compared ignoring case)
   * @throws IllegalArgumentException
   *           if the bag is not empty and action is neither of the two
   */
  public StringBagResult stopwords(String action){
    String[] wordArray=StopWords.LONG; // change with SHORT
    Bag<String> swBag=new Bag<String>();
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      int n=bag.getCount(str);
      boolean use=action.equalsIgnoreCase("use");
      if(!use&&!action.equalsIgnoreCase("remove")){
        throw new IllegalArgumentException("Error: action is missing!");
      }
      int matches=0;
      for(String stopword:wordArray){
        if(stopword.equals(str)){
          matches++;
        }
      }
      if(use){
        // one add per matching list entry, as before
        for(int k=0;k<matches;k++){
          swBag.add(str,n);
        }
      }else if(matches==0){
        swBag.add(str,n);
      }
    }
    return new StringBagResult(extend("stopwords-"+action),fe,swBag);
  }

  /**
   * Use ONLY words in Dictionary File. Each line of the file is trimmed and
   * compared with the strings in the bag; a string is added (with its count)
   * once for every line equal to it.
   * <p>
   * The file is read once, when the first string is processed, so an empty
   * bag never touches the file.
   *
   * @param filename
   *          path of the dictionary file, one word per line
   * @throws IOException
   *           if the file cannot be opened or read
   */
  public StringBagResult usewords(String filename) throws IOException{
    Bag<String> uwBag=new Bag<String>();
    List<String> dictionary=null;
    for(Iterator<String> i=bag.iterator();i.hasNext();){
      String str=i.next();
      int n=bag.getCount(str);
      if(dictionary==null){
        dictionary=readTrimmedLines(filename);
      }
      for(String entry:dictionary){
        // Check whether str is in Dictionary File
        if(entry.equals(str)){
          uwBag.add(str,n);
        }
      }
    }
    return new StringBagResult(extend("usewords"),fe,uwBag);
  }

  /** Reads all lines of the file, trimmed, and always closes the reader. */
  private static List<String> readTrimmedLines(String filename)
      throws IOException{
    BufferedReader in=new BufferedReader(new FileReader(new File(filename)));
    try{
      List<String> lines=new ArrayList<String>();
      String line;
      while((line=in.readLine())!=null){
        lines.add(line.trim());
      }
      return lines;
    }finally{
      in.close();
    }
  }
}
/**
* An abstract class that can be used to filter SpanSetResults (see
* SpanSetResult.filter).
*/
static public abstract class Filter{
/**
* A short name, used to help construct feature names associated with this
* filter; SpanSetResult.filter appends it to "filter_" to name the step.
*/
abstract public String getName();
/**
* Should return true for all items that will be accepted by the filter.
* Items for which this returns false are dropped from the filtered set.
*/
abstract public boolean match(Object o);
}
/**
* An abstract class that can be used to change the elements of a SpanSet (see
* SpanSetResult.map).
*/
static public abstract class Function{
/**
* A short name, used to help construct feature names associated with this
* function; SpanSetResult.map appends it to "map_" to name the step.
*/
abstract public String getName();
/**
* Should return the modified object. The returned objects are collected in a
* new sorted set.
*/
abstract public <T>T apply(T o);
}
}