package edu.cmu.minorthird.text.mixup;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import edu.cmu.minorthird.text.BasicTextBase;
import edu.cmu.minorthird.text.BasicTextLabels;
import edu.cmu.minorthird.text.BoneheadStemmer;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.Token;
import edu.cmu.minorthird.util.ProgressCounter;
/** A simple pattern-matching and information extraction language.
<pre>
EXAMPLE:
... in('begin') @number? [ any{2,5} in('end') ] ... && [!in('begin')*] && [!in('end')*]
BNF:
simplePrim -> [!] simplePrim1
simplePrim1 -> id | a(DICT) | ai(DICT) | eq(CONST) | eqi(CONST) | re(REGEX)
| any | ... | PROPERTY:VALUE | PROPERTY:a(foo) )
prim -> < simplePrim [,simplePrim]* > | simplePrim
repeatedPrim -> [L] prim [R] repeat | @type | @type?
repeat -> {int,int} | {,int} | {int,} | {int} | ? | * | +
pattern -> | repeatedPrim pattern
basicExpr -> pattern [ pattern ] pattern
basicExpr -> (expr)
expr -> basicExpr "||" expr
expr -> basicExpr "&&" expr
SEMANTICS:
basicExpr is pattern match - like a regex, but returns all matches, not just the longest one
token-level tests:
eq('foo') check token is exactly foo
'foo' is short for eq('foo')
re('regex') checks if token matches the regex
eqi('foo') check lowercase version of token is foo
'foo' or eq('foo') checks a token is equal to 'foo'
a(bar) checks a token is in dictionary 'bar'
ai(bar) checks that the token is in dictionary 'bar', ignoring case
color:red checks that the token has property 'color' set to 'red'
color:a(primaryColor) checks that the token's property 'color' is in the dictionary 'primaryColor'
!test is negation of test
<test1, test2, ... test3> conjoins token-level tests
any is true for any token
token-sequences:
test? is 0 or 1 tokens matching test
test+ is 1+ tokens matching test
test* is 0+ tokens matching test
test{3,7} is between 3 and 7 tokens matching test
... is equal to any*
<code>@foo</code> matches a span of type foo
<code>@foo?</code> matches a span of type foo or the empty sequence
L means sequence can't be extended to left and still match
R means sequence can't be extended to right and still match
expr || expr is union
expr && expr is piping: generate with expr1, filter with expr2
</pre>
The name's an acronym for My Information eXtraction and Understanding Package.
*
* @author William Cohen
*/
public class Mixup implements Serializable{
static private final long serialVersionUID=20080303L;
/** Without constraints, the maximum number of times a mixup
* expression can extract something from a document of length N is
* O(N*N). The maxNumberOfMatches... variables below constrain
* this behavior, for efficiency. The variable below is a threshold
* after which these constraints kick in.
*/
public static int minMatchesToApplyConstraints=5000;
/** Without constraints, the maximum number of times a mixup
* expression can extract something from a document of length N is
* O(N*N), since any token can be the begin or end of an extracted
* span. The maxNumberOfMatchesPerToken value limits this to
* maxNumberOfMatchesPerToken*N.
*/
public static int maxNumberOfMatchesPerToken=5;
/** Without constrains, the maximum number of times a mixup
* expression can extract something from a document of length N is
* O(N*N), since any token can be the begin or end of an extracted
* span. This limits the number of matches to a fixed number.
*/
public static int maxNumberOfMatches=134217728; //2^27
private static final boolean DEBUG=false;
// tokenize: words, single-quoted strings, "&&", "||", "..." or single non-word chars
public static final Pattern tokenizerPattern=Pattern.compile("\\s*((\\n)|(\\w+)|(\\/\\/)|('(\\\\'|[^\\'])*')|\\&\\&|\\|\\||\\.\\.\\.|\\\\\\;|\\W)\\s*");
//Pattern.compile("\\s*(\\w+|'([^']|\\\\')*'|\\&\\&|\\|\\||\\.\\.\\.|\\W)\\s*");
// legal functions
private static Set<String> legalFunctions;
static{
legalFunctions=new HashSet<String>();
String[] tmp=new String[]{"re","eq","eqi","a","ai","any","prop","propDict"};
for(int i=0;i<tmp.length;i++)
legalFunctions.add(tmp[i]);
}
private final static int RE=0;
private final static int EQ=1;
private final static int EQI=2;
private final static int A=3;
private final static int AI=4;
private final static int ANY=5;
private final static int PROP=6;
private final static int PROPDICT=7;
private final static int ELIPSE=9;
private Expr expr;
/** Create a new mixup query. */
public Mixup(String pattern) throws ParseException{
MixupTokenizer tok=new MixupTokenizer(pattern);
if(tok.advance())
expr=new MixupParser(tok).parseExpr();
}
public Mixup(MixupTokenizer tok) throws ParseException{
expr=new MixupParser(tok).parseExpr();
}
/** Extract subspans from each generated span using the mixup expression.
*/
public Iterator<Span> extract(TextLabels labels,Iterator<Span> spanLooper){
return expr.match(labels,spanLooper);
}
public String toString(){
return expr.toString();
}
public static class MixupTokenizer{
public String input;
public Matcher matcher;
private String token;
public String nextToken;
private int cursor;
public int nextCursor=0;
public MixupTokenizer(String input){
this.input=input;
this.matcher=tokenizerPattern.matcher(input);
}
public boolean advance(){
if(matcher.find()){
cursor=matcher.start(1);
token=matcher.group(1);
if((token.equals(";"))){
token=null;
return false;
}
return true;
}else{
token=null;
return false;
}
}
// advance to next token, and check that it's what's expected
public String advance(Set<String> set) throws Mixup.ParseException{
if(!matcher.find()){
token=null;
cursor=input.length();
return null;
}
cursor=matcher.start(1);
token=matcher.group(1);
if((token.equals(";"))){
token=null;
return null;
}
if(set!=null&&!set.contains(token)){
System.out.println("Token at Error: "+token);
parseError("statement error: expected one of: "+setContents(set)+
" in "+token);
}
return token;
}
private void parseError(String msg) throws ParseException{
throw new ParseException(msg+": "+input.substring(0,cursor)+"^^^"+
input.substring(cursor,input.length()));
}
/** convert a set to a string listing the elements */
private String setContents(Set<String> set){
StringBuffer buf=new StringBuffer("");
for(Iterator<String> i=set.iterator();i.hasNext();){
if(buf.length()>0)
buf.append(" ");
buf.append("'"+i.next().toString()+"'");
}
return buf.toString();
}
}
//
// recursive descent parser for the BNF above
//
private static class MixupParser{
private MixupTokenizer tok;
public MixupParser(MixupTokenizer tok){
this.tok=tok;
}
private Expr parseExpr() throws ParseException{
// Expr expr1=null;
Expr expr2=null;
String op=null;
BasicExpr basic=parseBasicExpr();
if("&&".equals(tok.token)||"||".equals(tok.token)){
op=tok.token;
tok.advance();
expr2=parseExpr();
}
return new Expr(basic,expr2,op);
}
private BasicExpr parseBasicExpr() throws ParseException{
List<RepeatedPrim> list=new ArrayList<RepeatedPrim>();
int left=-1,right=-1;
if("(".equals(tok.token)){
tok.advance();
Expr expr=parseExpr();
if(!")".equals(tok.token))
tok.parseError("expected close paren");
tok.advance(); // past ')'
return new BasicExpr(expr);
}else{
while(tok.token!=null&&!"||".equals(tok.token)&&
!"&&".equals(tok.token)&&!")".equals(tok.token)){
if("[".equals(tok.token)){
left=list.size();
tok.advance();
}else if("]".equals(tok.token)){
right=list.size();
tok.advance();
}else{
list.add(parseRepeatedPrim());
}
}
if(left<0)
tok.parseError("no left bracket");
if(right<0)
tok.parseError("no right bracket");
return new BasicExpr((RepeatedPrim[])list.toArray(new RepeatedPrim[list
.size()]),left,right);
}
}
private RepeatedPrim parseRepeatedPrim() throws ParseException{
RepeatedPrim buf=new RepeatedPrim();
if("@".equals(tok.token)){
tok.advance();
buf.type=tok.token;
tok.advance();
buf.maxCount=1;
if("?".equals(tok.token)){
buf.minCount=0;
tok.advance();
}else{
buf.minCount=1;
}
return buf;
}else{
if("L".equals(tok.token)){
buf.leftMost=true;
tok.advance();
}
parsePrim(buf);
parseRepeat(buf);
if("R".equals(tok.token)){
buf.rightMost=true;
tok.advance();
}
buf.expandShortcuts();
if(!buf.checkFunction())
tok.parseError("syntax error");
return buf;
}
}
private void parsePrim(RepeatedPrim buf) throws ParseException{
if("<".equals(tok.token)){
tok.advance();
parseSimplePrim(buf);
while(",".equals(tok.token)){
tok.advance();
parseSimplePrim(buf);
}
if(">".equals(tok.token))
tok.advance();
else
tok.parseError("expected '>'");
}else{
parseSimplePrim(buf);
}
}
private void parseSimplePrim(RepeatedPrim buf) throws ParseException{
Prim prim=new Prim();
if("!".equals(tok.token)){
prim.negated=true;
tok.advance();
}
prim.funcString=tok.token;
// int funcLength=tok.token.length();
// char firstLetter=tok.token.charAt(0);
if("a".equals(tok.token))
prim.function=A;
else if("eq".equals(tok.token))
prim.function=EQ;
else if("ai".equals(tok.token))
prim.function=AI;
else if("re".equals(tok.token))
prim.function=RE;
else if("any".equals(tok.token))
prim.function=ANY;
else if("eqi".equals(tok.token))
prim.function=EQI;
else if("...".equals(tok.token))
prim.function=ELIPSE;
else if("prop".equals(tok.token))
prim.function=PROP;
else if("propDict".equals(tok.token))
prim.function=PROPDICT;
tok.advance();
if("(".equals(tok.token)){
tok.advance(); // to argument
prim.argument=tok.token;
tok.advance(); // to ')'
if(!")".equals(tok.token))
tok.parseError("expected close paren");
tok.advance(); // past prim
}else if(":".equals(tok.token)){
prim.property=prim.funcString;
prim.function=PROP;
prim.funcString="prop";
tok.advance(); // to property value
if("a".equals(tok.token)){
tok.advance(); // to '('
if(!"(".equals(tok.token)){
prim.value="a";
tok.advance(); // past value
}else{
tok.advance(); // to dictionary name
prim.function=PROPDICT;
prim.funcString="propDict";
prim.value=tok.token;
tok.advance();
if(!")".equals(tok.token))
tok.parseError("expected close paren");
tok.advance(); // past close paren
}
}else{
prim.value=tok.token;
tok.advance(); // past value
}
}
prim.expandShortcuts();
buf.primList.add(prim);
}
private void parseRepeat(RepeatedPrim buf) throws ParseException{
String min=null,max=null;
if("{".equals(tok.token)){
tok.advance();
if(!",".equals(tok.token)){
min=tok.token;
tok.advance(); // to ","
}else{
min="0";
}
if("}".equals(tok.token)){
max=min;
tok.advance();
}else{
if(!",".equals(tok.token))
tok.parseError("expected \",\"");
tok.advance();
if(!"}".equals(tok.token)){
max=tok.token;
tok.advance(); // to "}"
}else{
max="-1";
}
if(!"}".equals(tok.token))
tok.parseError("expected \"}\"");
tok.advance();
}
}else if("+".equals(tok.token)){
min="1";
max="-1";
tok.advance();
}else if("*".equals(tok.token)){
min="0";
max="-1";
tok.advance();
}else if("?".equals(tok.token)){
min="0";
max="1";
tok.advance();
}else{
min=max="1";
}
try{
buf.minCount=Integer.parseInt(min);
buf.maxCount=Integer.parseInt(max);
}catch(NumberFormatException e){
tok.parseError("expected an integer: min = '"+min+"' max='"+max+"'");
}
}
}
/** Signals an error in parsing a mixup document. */
public static class ParseException extends Exception{
static final long serialVersionUID=20080303L;
public ParseException(String s){
super(s);
}
}
//
// encodes a pattern that matches a single TextToken
//
private static class Prim implements Serializable{
static private final long serialVersionUID=20080303L;
public boolean negated=false;
public int function=-1;
public String funcString="";
public String argument="";
public String property="",value="";
private Pattern pattern=null;
/** See if the predicate for this pattern succeeds for this TextToken. */
public boolean matchesPrim(TextLabels labels,Token token){
boolean status=matchesUnnegatedPrim(labels,token);
return negated==!status;
}
private boolean matchesUnnegatedPrim(TextLabels labels,Token token){
if(function==A)
return labels.inDict(token,argument); //a
if(function==EQ)
return token.getValue().equals(argument); //eq
else if(function==AI){ //ai
final String lc=token.getValue().toLowerCase();
Token lcToken=new Token(){
public String toString(){
return "[lcToken "+lc+"]";
}
public String getValue(){
return lc;
}
// public int getIndex(){
// return 0;
// }
};
return labels.inDict(lcToken,argument);
}else if(function==RE){ //re
return pattern.matcher(token.getValue()).find();
}else if(function==ANY)
return true; //any
else if(function==EQI)
return token.getValue().equalsIgnoreCase(argument); //eqi
else if(function==PROP){ //prop
return value.equals(labels.getProperty(token,property));
}else if(function==PROPDICT){ //propDict
final String propVal=labels.getProperty(token,property);
if(propVal==null)
return false;
Token propValToken=new Token(){
public String toString(){
return "[token:"+propVal+"]";
}
public String getValue(){
return propVal;
}
// public int getIndex(){
// return 0;
// }
};
//System.out.println("testing "+propValToken+" for membership in dict "+value);
return labels.inDict(propValToken,value);
}else{
throw new IllegalStateException("illegal function '"+funcString+"'");
}
}
/** Expand some syntactic sugar-like abbreviations. */
public void expandShortcuts(){
// expand the 'const' abbreviation to eq('const')
if(funcString.startsWith("'")&&funcString.endsWith("'")){
argument=funcString;
function=EQ;
funcString="eq";
}
// unquote a quoted argument
if(argument.startsWith("'")&&argument.endsWith("'")){
argument=argument.substring(1,argument.length()-1);
argument=argument.replaceAll("\\\\'","'");
}
// precompile a regex
if(RE==function)
pattern=Pattern.compile(argument);
// check for correctness
}
/** is this a legal function? */
public boolean checkFunction(){
return legalFunctions.contains(funcString);
}
public String toString(){
StringBuffer buf=new StringBuffer("");
if(negated)
buf.append("!");
if(PROP!=function){
buf.append(funcString);
if(argument!=null)
buf.append("("+argument+")");
}else{
buf.append(property+":"+value);
}
return buf.toString();
}
}
// encodes a pattern matching a series of Token's
private static class RepeatedPrim implements Serializable{
static private final long serialVersionUID=20080303L;
public boolean leftMost=false;
public boolean rightMost=false;
public List<Prim> primList=new ArrayList<Prim>();
public boolean[] whereIMatch;
public Span whatIIndexed=null;
public int minCount;
public int maxCount; // -1 indicates infinity
String type=null; // non-null for @type and @type?
/** Expand some syntactic sugar-like abbreviations. */
public void expandShortcuts(){
// expand the 'const' abbreviation to eq('const')
if(primList.size()==1){
Prim prim=primList.get(0);
if(ELIPSE==prim.function){
prim.function=ANY;
prim.funcString="any";
minCount=0;
maxCount=-1;
return;
}
}
}
public boolean checkFunction(){
for(Iterator<Prim> i=primList.iterator();i.hasNext();){
Prim prim=i.next();
if("...".equals(prim.funcString)&&primList.size()!=1)
return false;
if(!prim.checkFunction())
return false;
}
return true;
}
public String toString(){
if(type!=null){
if(minCount==0)
return "@"+type+"?";
else
return "@"+type;
}else{
StringBuffer buf=new StringBuffer("");
if(leftMost)
buf.append("L ");
if(primList.size()==1)
buf.append((Prim)primList.get(0));
else if(primList.size()==0)
throw new IllegalStateException("empty prim list");
else{
buf.append("<"+primList.get(0).toString());
for(int i=1;i<primList.size();i++){
buf.append(", "+primList.get(i).toString());
}
buf.append(">");
}
buf.append("{"+minCount+","+maxCount+"}");
if(rightMost)
buf.append("R");
return buf.toString();
}
}
/** Indexes where tokens match in the PrimList */
public void index(Span s,TextLabels labels){
whatIIndexed=s;
whereIMatch=new boolean[s.size()];
for(int i=0;i<s.size();i++){
whereIMatch[i]=matchesPrimList(labels,s.getToken(i));
}
}
/** See if this pattern matches span.subSpan(lo,len). */
public boolean matchesSubspan(TextLabels labels,Span span,int lo,int len){
if(type!=null){
if(minCount==1){
return labels.hasType(span.subSpan(lo,len),type);
}else{
return len==0||labels.hasType(span.subSpan(lo,len),type);
}
}else{
// check and see if this span has been indexed or not
//String span1 = span.asString();
//String span2 = "";
//if(whatIIndexed != null) span2 = whatIIndexed.asString();
//if(!span1.trim().equals(span2.trim())) index(span, labels);
if(whatIIndexed==null||!whatIIndexed.equals(span))
index(span,labels);
if(len>maxCount&&maxCount>=0)
return false;
if(len<minCount)
return false;
int spanSize=span.size();
for(int i=lo;i<lo+len;i++){
if(i>=spanSize)
return false;
//if (!matchesPrimList(labels,span.getToken(i))) return false;
if(!whereIMatch[i])
return false;
}
if(leftMost&&(len<maxCount||maxCount<0)){
if(lo>0&&
/*matchesPrimList(labels,span.getToken(lo-1))*/whereIMatch[lo-1])
return false;
}
if(rightMost&&(len<maxCount||maxCount<0)){
if(lo+len<spanSize&&
/*matchesPrimList(labels,span.getToken(lo+len))*/whereIMatch[lo+
len])
return false;
}
return true;
}
}
private boolean matchesPrimList(TextLabels labels,Token token){
for(Iterator<Prim> i=primList.iterator();i.hasNext();){
Prim prim=i.next();
if(!prim.matchesPrim(labels,token))
return false;
}
return true;
}
}
//
// encodes a basicExpr in the BNF above
//
private static class BasicExpr implements Serializable{
static private final long serialVersionUID=20080303L;
public final Expr expr;
public final RepeatedPrim[] repPrim;
public final int leftBracket,rightBracket;
private static Logger log=Logger.getLogger(BasicExpr.class);
public BasicExpr(Expr expr){
this.expr=expr;
this.repPrim=null;
this.leftBracket=this.rightBracket=-1;
}
public BasicExpr(RepeatedPrim[] repPrim,int leftBracket,int rightBracket){
this.expr=null;
this.repPrim=repPrim;
this.leftBracket=leftBracket;
this.rightBracket=rightBracket;
}
public String toString(){
if(expr!=null){
return "("+expr.toString()+")";
}else{
StringBuffer buf=new StringBuffer();
for(int i=0;i<repPrim.length;i++){
if(i==leftBracket)
buf.append("[");
buf.append(" "+repPrim[i].toString());
if(i+1==rightBracket)
buf.append("]");
}
return buf.toString();
}
}
public Iterator<Span> match(TextLabels labels,Iterator<Span> spanLooper){
if(expr!=null){
return expr.match(labels,spanLooper);
}else{
ProgressCounter pc=
new ProgressCounter("mixup","span");
Set<Span> accum=new TreeSet<Span>();
while(spanLooper.hasNext()){
pc.progress();
Span span=spanLooper.next();
// match(labels,accum,span,new int[repPrim.length],new int[repPrim.length],1,0,0);
fastMatch(labels,span,accum);
}
pc.finished();
return accum.iterator();
}
}
// most time taken here
private void fastMatch(TextLabels labels,Span span,Set<Span> accum){
// log.debug("span size: " + span.size() + " - " + span.asString());
// there are at most span.length^2 matches of every repeated primitive
log.debug("matching span id/size="+span.getDocumentId()+"/"+span.size());
log.debug("before alloc: max/free="+Runtime.getRuntime().maxMemory()+"/"+
Runtime.getRuntime().freeMemory());
// We may overflow the int datatype if there are too many tokens in the span, in which case we should use
// the largest available int as it is highly unlikely that there will *actually* be anywhere near that
// many matches to store.
//int maxRepeatedPrimMatches = span.size() * (span.size()+1);
int maxRepeatedPrimMatches;
if(span.size()>(Integer.MAX_VALUE/(span.size()+1)))
maxRepeatedPrimMatches=Integer.MAX_VALUE; // overflow
else
maxRepeatedPrimMatches=span.size()*(span.size()+1);
// Now apply any constraints that may further limit the number of possible matches
if(maxRepeatedPrimMatches>minMatchesToApplyConstraints){
if(maxNumberOfMatchesPerToken>0){
// If the span is large enough (ie has more than Integer.MAX_VALUE/maxNumberOfMatchesPerToken tokens) then we will
// overflow int again here so check for that and only attempt to adjust for the constraint if it will
// succeed. Otherwise stick with the maximum int value.
if(span.size()>(Integer.MAX_VALUE/maxNumberOfMatchesPerToken))
maxRepeatedPrimMatches=
Math.min(maxNumberOfMatchesPerToken*span.size(),
maxRepeatedPrimMatches);
}
// Now we can arbitrarily set a limit to the number of matches so if this is the case, then we should
// use that limit if it is larger than the number of possible matches we computed.
if((maxNumberOfMatches>0)&&(maxNumberOfMatches<maxRepeatedPrimMatches)){
maxRepeatedPrimMatches=maxNumberOfMatches;
}
}
int[] loIndexBuffer=new int[maxRepeatedPrimMatches];
int[] lengthBuffer=new int[maxRepeatedPrimMatches];
log.debug("alloc hi-lo: max/free="+Runtime.getRuntime().maxMemory()+"/"+
Runtime.getRuntime().freeMemory());
// store possible places that repPrim[i] can match
int[][] possibleLos=new int[repPrim.length][];
int[][] possibleLens=new int[repPrim.length][];
// also record min/max length
int[] minLen=new int[repPrim.length];
int[] maxLen=new int[repPrim.length];
boolean[] isAny=new boolean[repPrim.length];
log.debug("after alloc: max/free="+Runtime.getRuntime().maxMemory()+"/"+
Runtime.getRuntime().freeMemory());
for(int i=0;i<repPrim.length;i++){
// work out possible lengths for repPrim[i]
RepeatedPrim rp=repPrim[i];
minLen[i]=rp.minCount;
maxLen[i]=span.size();
if(rp.maxCount>=0&&rp.maxCount<maxLen[i])
maxLen[i]=rp.maxCount;
// see if repPrim[i] is "any"
if(rp.primList.size()==1){
Prim prim=(Prim)rp.primList.get(0);
isAny[i]=
(ANY==prim.function&&!prim.negated&&!rp.leftMost&&!rp.rightMost);
}
if(!isAny[i]){
// find all places this matches
int numMatches=0;
if(rp.type!=null){
// look up matches from the labels for a spantype repPrim, eg @foo
for(Iterator<Span> el=
labels.instanceIterator(rp.type,span.getDocumentId());el
.hasNext();){
if(numMatches>=maxRepeatedPrimMatches){
overflowWarning(numMatches,maxRepeatedPrimMatches,span,i);
return;
}
Span s=el.next();
if(span.contains(s)){
if(numMatches>=maxRepeatedPrimMatches){
overflowWarning(numMatches,maxRepeatedPrimMatches,span,i);
return;
}
loIndexBuffer[numMatches]=
s.documentSpanStartIndex()-span.documentSpanStartIndex();
lengthBuffer[numMatches]=s.size();
numMatches++;
}
}
}
if(rp.type==null||(rp.type!=null&&rp.minCount==0)){
// something besides @foo or @foo?
// check all possible subspans
for(int j=0;j<=span.size();j++){
int topLen=Math.min(maxLen[i],span.size()-j);
for(int k=minLen[i];k<=topLen;k++){
if(numMatches>=maxRepeatedPrimMatches){
overflowWarning(numMatches,maxRepeatedPrimMatches,span,i);
return;
}
//84% time taken in matchesSubspan
if(rp.matchesSubspan(labels,span,j,k)){
loIndexBuffer[numMatches]=j;
lengthBuffer[numMatches]=k;
numMatches++;
}
}
}
}
// save matches from buffer into loIndices, lengths
possibleLos[i]=new int[numMatches];
possibleLens[i]=new int[numMatches];
for(int m=0;m<numMatches;m++){
possibleLos[i][m]=loIndexBuffer[m];
possibleLens[i][m]=lengthBuffer[m];
}
}
}
//
// now find a good series of loIndex/length pairs
//
int[] lows=new int[repPrim.length];
int[] highs=new int[repPrim.length];
fastMatch(labels,accum,span,lows,highs,1,0,0,possibleLos,possibleLens,
isAny,minLen,maxLen);
}
private void overflowWarning(int numMatches,int maxRepeatedPrimMatches,
Span span,int i){
log.warn("mixup warning at pattern #"+(i+1)+" "+repPrim[i]+") on "+span);
log.warn("not enough room to store all matches: adjust Mixup.maxNumberOfMatches(PerToken)");
log.warn("size="+span.size()+" numMatches="+numMatches+" max="+
maxRepeatedPrimMatches+" minConstraint="+
minMatchesToApplyConstraints);
}
private void fastMatch(TextLabels labels, // passed along to subroutines
Set<Span> accum, // accumulate matches
Span span, // span being matched
int[] lows, // lows[i] is lo index of match to repPrim[i]
int[] highs, // highs[i] is high index of match to repPrim[i]
int tab, // for debugging
int spanCursor, // index into the span being matched
int patternCursor, // index into the repPrim's being matched
int[][] possibleLos, // loIndices[i] is all places repPrim[i] might match
int[][] possibleLens, // lengths[i] is parallel-to-loIndices array of lengths
boolean[] isAny, // true if repPrim[i] is "any"
int[] minLen, // min lengths of subseq matching an isAny==true repPrim[i]
int[] maxLen) // max lengths of subseq matching an isAny==true repPrim[i]
{
if(patternCursor==repPrim.length){
if(spanCursor==span.size()){
// a complete, successful match
if(DEBUG)
showMatch(tab,"complete",span,lows,highs,patternCursor);
int lo=lows[leftBracket];
int hi=highs[rightBracket-1];
accum.add(span.subSpan(lo,hi-lo));
}else{
// a deadend
if(DEBUG)
showMatch(tab,"failed",span,lows,highs,patternCursor);
}
}else{
// continue a partial match
if(isAny[patternCursor]){
if(patternCursor+1<repPrim.length&&!isAny[patternCursor+1]){
// trick to handle something like '...' followed by a specific pattern
for(int i=0;i<possibleLos[patternCursor+1].length;i++){
int nextSpanCursor=possibleLos[patternCursor+1][i];
int len=nextSpanCursor-spanCursor;
if(len>=minLen[patternCursor]&&len<=maxLen[patternCursor]){
lows[patternCursor]=spanCursor;
highs[patternCursor]=spanCursor+len;
if(DEBUG)
showMatch(tab,"partial",span,lows,highs,patternCursor+1);
fastMatch(labels,accum,span,lows,highs,tab+1,spanCursor+len,
patternCursor+1,possibleLos,possibleLens,isAny,minLen,
maxLen);
}
}
}else{
int topLen=Math.min(maxLen[patternCursor],span.size()-spanCursor);
for(int len=minLen[patternCursor];len<=topLen;len++){
lows[patternCursor]=spanCursor;
highs[patternCursor]=spanCursor+len;
if(DEBUG)
showMatch(tab,"partial",span,lows,highs,patternCursor+1);
fastMatch(labels,accum,span,lows,highs,tab+1,spanCursor+len,
patternCursor+1,possibleLos,possibleLens,isAny,minLen,maxLen);
}
}
}else{
int topLen=span.size()-spanCursor;
for(int i=0;i<possibleLos[patternCursor].length;i++){
if(possibleLos[patternCursor][i]==spanCursor&&
possibleLens[patternCursor][i]<=topLen){
int len=possibleLens[patternCursor][i];
lows[patternCursor]=spanCursor;
highs[patternCursor]=spanCursor+len;
if(DEBUG)
showMatch(tab,"partial",span,lows,highs,patternCursor+1);
fastMatch(labels,accum,span,lows,highs,tab+1,spanCursor+len,
patternCursor+1,possibleLos,possibleLens,isAny,minLen,maxLen);
}
}
}
}
}
//
// obsolete slower match routine, kept around as a reference implementation for debugging
//
// private void match(TextLabels env,Set accum,Span span,int[] lows,
// int[] highs,int tab,int spanCursor,int patternCursor){
// if(patternCursor==repPrim.length){
// if(spanCursor==span.size()){
// // a complete, successful match
// if(DEBUG)
// showMatch(tab,"complete",span,lows,highs,patternCursor);
// int lo=lows[leftBracket];
// int hi=highs[rightBracket-1];
// accum.add(span.subSpan(lo,hi-lo));
// }else{
// // a deadend
// if(DEBUG)
// showMatch(tab,"failed",span,lows,highs,patternCursor);
// }
// }else{
// // continue a partial match
// RepeatedPrim nextPattern=repPrim[patternCursor];
// int maxLen=span.size()-spanCursor;
// if(nextPattern.maxCount>=0&&nextPattern.maxCount<maxLen)
// maxLen=nextPattern.maxCount;
// for(int len=nextPattern.minCount;len<=maxLen;len++){
// // 84% time taken in matchesSubspan
// boolean lenOk=nextPattern.matchesSubspan(env,span,spanCursor,len);
// if(lenOk){
// lows[patternCursor]=spanCursor;
// highs[patternCursor]=spanCursor+len;
// if(DEBUG)
// showMatch(tab,"partial",span,lows,highs,patternCursor+1);
// match(env,accum,span,lows,highs,tab+1,spanCursor+len,
// patternCursor+1);
// }
// }
// }
// }
// for debugging
private void showMatch(int tab,String msg,Span span,int[] lows,int[] highs,
int patternCursor){
for(int i=0;i<tab;i++){
System.out.print("| ");
}
System.out.print(msg+":");
for(int i=0;i<patternCursor;i++){
System.out.print(" "+repPrim[i].toString()+"["+lows[i]+":"+highs[i]+
"]<");
for(int j=lows[i];j<highs[i];j++){
if(j>lows[i])
System.out.print(" ");
System.out.print(span.getToken(j).getValue());
}
System.out.print(">");
}
System.out.println();
}
}
//
// encodes an expression in the BNF above
//
private static class Expr implements Serializable{
static private final long serialVersionUID=20080303L;
private BasicExpr expr1;
private Expr expr2;
private String op;
public Expr(BasicExpr expr1,Expr expr2,String op){
this.expr1=expr1;
this.expr2=expr2;
this.op=op;
}
public Iterator<Span> match(TextLabels labels,Iterator<Span> spanIt){
if(expr2==null){
return expr1.match(labels,spanIt);
}else if("&&".equals(op)){
return expr2.match(labels,expr1.match(labels,spanIt));
}else{
if(!"||".equals(op))
throw new IllegalStateException("illegal operator '"+op+"'");
// copy the input looper
SortedSet<Span> save=new TreeSet<Span>();
while(spanIt.hasNext())
save.add(spanIt.next());
// union the outputs of expr1 and expr2
Iterator<Span> a=expr1.match(labels,save.iterator());
Iterator<Span> b=expr2.match(labels,save.iterator());
SortedSet<Span> union=new TreeSet<Span>();
while(a.hasNext())
union.add(a.next());
while(b.hasNext())
union.add(b.next());
return union.iterator();
}
}
public String toString(){
StringBuffer buf=new StringBuffer();
buf.append(expr1.toString());
if(expr2!=null)
buf.append(" "+op+" "+expr2.toString());
return buf.toString();
}
}
//
// interactive test routine
//
public static void main(String[] args){
try{
Mixup mixup=new Mixup(args[0]);
System.out.println("normalized expression = "+mixup);
BasicTextBase b=new BasicTextBase();
MonotonicTextLabels labels=new BasicTextLabels(b);
for(int i=1;i<args.length;i++){
b.loadDocument("arg_"+i,args[i]);
}
new BoneheadStemmer().stem(b,labels);
//System.out.println("labels="+labels);
//labels.addWord("the", "det");
//labels.addWord("thi", "det");
for(Iterator<Span> i=mixup.extract(labels,b.documentSpanIterator());i
.hasNext();){
System.out.println(i.next());
}
}catch(Exception e){
e.printStackTrace();
}
}
}