ConcatRegexFeatures.java example

Explorer
MinorThird-master
package iitb.Model;
import iitb.CRF.DataSequence;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * ConcatRegexFeatures generates features by matching the token with the character patterns.
 * Character patterns are regular expressions for checking whether the token is capitalized word, 
 * a number, small case word, whether the token contains any special characters and like.
 * It uses regular expression to match a sequence of character pattern and generates features 
 * accordingly.
 * <P> 
 * The feature generated here is whether a sequence of tokens has a particular sequence of the given pattern or not.
 * For example, if a pattern is to match a capital word, then for a two-token context window, the features 
 * generated are whether the two-token (bigram) sequence has any of the following patterns or not: 
 * 	(1) Capital, Capital 
 *	(2) Capital, Non-Capital 
 *	(3) Non-capital, Capital.
 *
 * You can use any window around the current token (segment) for creating regular expression based features.
 * Also, you can define your own patterns, by writing down the regular expression in a file, 
 * whose format is specified below.
 * </p>
 * <p> 
 * An object of this class should be wrapped in a {@link FeatureTypesEachLabel} as follows:
 * <pre>
 *	 new FeatureTypesEachLabel(model, new ConcatRegexFeatures(model,relSegmentStart, relSegmentEnd, maxMemory, patternFile));
 * </pre>
 * </p>
 * A token in a token sequence has a index relative to the current token index, which is described below:
 * <pre>
 	x0 x1 x2 x3 x4 x5 x6 x7 .... xn
	-4 -3 -2 -1 0  0  0  1 2 ...  
 * </pre>
 * <p>
 * In the above example, the current segment spans positions 4 to 6, with pos = 6 and prevPos = 3 in the 
 * startScanFeaturesAt() call of FeatureGenerator.
 * You can refer to any token relative to the current position by using the index shown below the token sequence.
 * Thus, you can create pattern-concatenation features for any token sequence in the neighbourhood of the current token, 
 * using relSegmentStart and relSegmentEnd.
 * For example, to create a pattern for the two tokens to the left of the current token, pass the following parameters 
 * to the constructor of the class:
 * </p>
 * <pre>
 *  	new FeatureTypesEachLabel(model, new ConcatRegexFeatures(model,-2, -1, maxMemory, patternFile));
 * </pre>
 * 
 * @author 	Imran Mansuri
 * @since 1.2
 * @version 1.3
 */
 
public class ConcatRegexFeatures extends FeatureTypes {

	/**
	 * Explicit serialization version identifier for this class.
	 * NOTE(review): presumably Serializable is inherited via FeatureTypes - confirm.
	 */
	private static final long serialVersionUID = -4246100603296345601L;

	/**
	 * Built-in pattern table, used unless a pattern file is passed to the constructor.
	 * Each row is a pair: element 0 is the feature name and element 1 is the regular
	 * expression that a whole token must match (the class uses {@code Matcher.matches()}).
	 * <p>
	 * The position of a row in this table is part of the generated feature id, so new
	 * patterns should be appended rather than inserted, and existing rows should not be
	 * reordered.
	 * </p>
	 */
	String patternString[][] = {
		// Capitalisation of alphabetic tokens.
		{"singleCapLetterWithDot",	"[A-Z]\\."},
		{"singleCapLetter",			"[A-Z]"},
		{"isInitCapital",			"[A-Z][a-z]+"},
		{"isAllCapital",			"[A-Z]+"},
		{"isAllSmallCase",			"[a-z]+"},

		// Single punctuation characters.
		{"singleDot",				"[.]"},
		{"singleComma",				"[,]"},
		// A lone apostrophe; this must not be the same expression as singleComma.
		{"singleQuote",				"[']"},
		{"isSpecialCharacter",		"[#;:\\-/<>'\"()&]"},

		// Optional patterns, disabled by default:
		//{"isWord",                  "[a-zA-Z][a-zA-Z]+"     },
		//{"isAlpha",           		"[a-zA-Z]+"             },
		//{"isAlphaNumeric",      	"[a-zA-Z0-9]+"          },

		// Digits and numbers.
		{"singleDigit",				"\\s*\\d\\s*"},
		{"twoDigits",				"\\s*\\d{2}\\s*"},
		{"threeDigits",				"\\s*\\d{3}\\s*"},
		{"fourDigits",				"\\s*\\(*\\d{4}\\)*\\s*"},
		{"isDigits",				"\\d+"},
		{"containsDigit",			".*\\d+.*"},
		{"isNumberRange",			"\\d+\\s*([-]{1,2}\\s*\\d+)?"},

		// Tokens with trailing punctuation.
		{"endsWithDot",				"\\p{Alnum}+\\."},
		{"endsWithComma",			"\\w+[,]"},
		{"endsWithPunctuation",		"\\w+[;:,.?!]"},
		{"singlePunctuation",		"\\p{Punct}"},
		{"singleAmp",				"[&]"},

		// Composite tokens.
		{"isDashSeparatedWords",	"(\\w[-])+\\w"},
		{"isDashSeparatedSeq",		"((\\p{Alpha}+|\\p{Digit}+)[-])+(\\p{Alpha}+|\\p{Digit}+)"},
		// NOTE(review): the host part "(\\w+\\.)\\w+" allows exactly one dot, so
		// "http://example.com" matches but "http://www.example.com" does not; the port
		// must be 2 or 4 digits. Confirm whether this restriction is intended.
		{"isURL",					"\\p{Alpha}+://(\\w+\\.)\\w+(:(\\d{2}|\\d{4}))?(/\\w+)*(/|(/\\w+\\.\\w+))?"},
		{"isEmailId",				"\\w+@(\\w+\\.)+\\w+"},
		{"containsDashes",			".*--.*"},
		{"containsSpecialCharacters",	".*[#;:\\-/<>'\"()&].*"},
	};

	Pattern p[];
	transient protected DataSequence data;
	protected int index, idbase, curId, window;
	protected int relSegmentStart, relSegmentEnd;
	protected int maxMemory;
	protected int left, right;
    /**
     * Computes the window size used for the feature-id base and the feature names.
     * When both boundaries lie strictly on the same side of the current segment
     * (both negative or both positive) the window is the plain inclusive span;
     * otherwise the span is extended by maxMemory.
     *
     * @param relSegmentStart relative position of the left boundary
     * @param relSegmentEnd   relative position of the right boundary
     * @return the window size for the given boundaries
     */
    private int getWindowSize(int relSegmentStart, int relSegmentEnd) {
        final int span = relSegmentEnd - relSegmentStart;
        final boolean sameSide =
                relSegmentStart != 0 && sign(relSegmentStart) == sign(relSegmentEnd);
        return sameSide ? span + 1 : span + maxMemory;
    }

	/**
	 * Constructs an object of ConcatRegexFeatures to be used to generate features for the token 
	 * sequence as specified.
	 * You can specify the sequence of tokens on which the patterns are applied using relSegmentStart 
	 * and relSegmentEnd, which denote the segment boundaries.
	 * The maxMemory denotes the maximum segment size; for a normal CRF the value of maxMemory is 1.
	 * There are certain default patterns defined in the class. You can specify your own patterns in a file and pass
	 * the name of the file to this constructor. The file should begin with an integer value for the number of patterns in the 
	 * file. This should be followed by one pattern definition on each line. The first word is the name of the pattern
	 * and the second word is the regular expression for the pattern.
	 *
	 * @param fgen			the feature generator passed on to the superclass
	 * @param relSegmentStart	index of the relative position for left boundary
	 * @param relSegmentEnd		index of the relative position for right boundary
	 * @param maxMemory		maximum size of a segment
	 * @param patternFile		file which contains the pattern definitions; when null or empty the
	 *				built-in patterns are used
	 * @throws IllegalArgumentException if a pattern file is given but its patterns cannot be loaded
	 */
	public ConcatRegexFeatures(FeatureGenImpl fgen, int relSegmentStart, int relSegmentEnd, int maxMemory, String patternFile){
		super(fgen);

		assert(relSegmentEnd >= relSegmentStart);
		this.relSegmentStart = relSegmentStart;
		this.relSegmentEnd = relSegmentEnd;
		this.maxMemory = maxMemory;

		window = getWindowSize(relSegmentStart, relSegmentEnd);
		// NOTE(review): next() builds ids as curId + idbase * index, and curId can be as
		// large as 2^window - 1. With idbase = 2^(window-1) the id ranges of adjacent
		// patterns overlap when window > 1 (window 2: pattern 0 with curId 3 and pattern 1
		// with curId 1 both yield id 3). Left unchanged here because a different base
		// changes the id layout; confirm whether 2^window was intended.
		idbase = (int) Math.pow(2, window-1);

		if ((patternFile != null) && (patternFile.length() > 0)) {
			String[][] loaded = getPatterns(patternFile);
			// getPatterns() reports an unreadable file by returning null. Assertions are
			// disabled by default, so fail explicitly instead of hitting a bare
			// NullPointerException on the next line.
			if (loaded == null) {
				throw new IllegalArgumentException("Could not load patterns from file : " + patternFile);
			}
			patternString = loaded;
		}

		// Compile every expression once; p stays index-aligned with patternString.
		p = new Pattern[patternString.length];
		for (int i = 0; i < patternString.length; i++) {
			p[i] = Pattern.compile(patternString[i][1]);
		}
		cache=true;
	}
	
    /**
	 * Constructs an object of ConcatRegexFeatures that uses the built-in patterns.
	 * Delegates to the five-argument constructor with a null pattern file.
	 *
	 * @param m		the feature generator passed on to the main constructor
	 * @param relSegmentStart	index of the relative position for left boundary
	 * @param relSegmentEnd		index of the relative position for right boundary
	 * @param maxMemory		maximum size of a segment
	 */
	public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd, int maxMemory){	  
        this(m,relSegmentStart,relSegmentEnd,maxMemory,null);
	}

	/**
	 * Constructs an object of ConcatRegexFeatures that uses the built-in patterns and a
	 * maximum segment size (maxMemory) of 1.
	 *
	 * @param m			the feature generator passed on to the main constructor
	 * @param relSegmentStart	index of the relative position for left boundary
	 * @param relSegmentEnd		index of the relative position for right boundary
	 */
	public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd){
		this(m, relSegmentStart, relSegmentEnd, 1);
	}

    /**
     * Constructs an object of ConcatRegexFeatures with both relative boundaries set to 0,
     * a maximum segment size of 1 and the built-in patterns.
     *
     * @param m	the feature generator passed on to the main constructor
     */
    public ConcatRegexFeatures(FeatureGenImpl m){
        this(m, 0,0,1);
    }
    /**
     * Constructs an object of ConcatRegexFeatures with both relative boundaries set to 0
     * and a maximum segment size of 1, passing the given pattern file on to the main constructor.
     *
     * @param m		the feature generator passed on to the main constructor
     * @param patternFile	file which contains the pattern definitions
     */
    public ConcatRegexFeatures(FeatureGenImpl m, String patternFile){
        this(m, 0,0,1,patternFile);
    }
	/**
	 * Constructs an object of ConcatRegexFeatures with a maximum segment size (maxMemory)
	 * of 1, passing the given pattern file on to the main constructor.
	 *
	 * @param m			the feature generator passed on to the main constructor
	 * @param relSegmentStart	index of the relative position for left boundary
	 * @param relSegmentEnd		index of the relative position for right boundary
	 * @param patternFile		file which contains the pattern definitions
	 */
	public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd, String patternFile){
		this(m, relSegmentStart, relSegmentEnd, 1, patternFile);
	}
	/**
	 * Returns the sign of a relative boundary.
	 *
	 * @param boundary	relative boundary position
	 * @return -1 for a negative value, 0 for zero and 1 for a positive value
	 */
	private int sign(int boundary){
		return (boundary > 0) ? 1 : ((boundary < 0) ? -1 : 0);
	}

	/**
	 * Reads patterns to be matched from the file.
	 * The format of the file is as follows:
	 * The first line of the file is the number of patterns, followed by a list of patterns one per line.
	 * Each pattern line holds two whitespace-separated words: the pattern's name and the
	 * regular expression itself. Anything after the second word on a line is ignored.
	 * <p>
	 * The reader is always closed before this method returns. A file that cannot be opened or
	 * read, or that ends before the announced number of patterns has been read, is reported on
	 * standard error and yields null.
	 * </p>
	 *
	 * @param patternFile		name of the pattern file
	 * @return an array with one {name, regular expression} pair per pattern, or null if the
	 *         file could not be read completely
	 * @throws NumberFormatException if the first line is not an integer
	 * @throws java.util.NoSuchElementException if a pattern line has fewer than two words
	 */
	public static String[][] getPatterns(String patternFile){
		BufferedReader in = null;
		try {
			in = new BufferedReader(new FileReader(patternFile));

			String countLine = in.readLine();
			if (countLine == null) {
				throw new IOException("Pattern file is empty : " + patternFile);
			}
			// Tolerate stray whitespace around the count; parseInt itself does not.
			int len = Integer.parseInt(countLine.trim());
			String[][] patterns = new String[len][2];

			for (int k = 0; k < len; k++) {
				String line = in.readLine();
				if (line == null) {
					// readLine() returns null at end of stream: the file is shorter than announced.
					throw new IOException("Pattern file declares " + len + " patterns but contains only "
							+ k + " : " + patternFile);
				}
				StringTokenizer strTokenizer = new StringTokenizer(line);
				patterns[k][0] = strTokenizer.nextToken();
				patterns[k][1] = strTokenizer.nextToken();
			}
			return patterns;
		} catch (IOException ioe) {
			System.err.println("Could not read pattern file : " + patternFile);
			ioe.printStackTrace();
			return null;
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// The result is already determined; a failure while closing changes nothing.
				}
			}
		}
	}

	/**
	 * Initiates scanning of features in a sequence at the specified position.
	 * The relative boundaries are converted to absolute token positions: a non-positive
	 * start is measured from the first token of the segment (prevPos + 1) and a positive
	 * start from pos; a negative end is measured from the first token of the segment and
	 * a non-negative end from pos. When either resulting position falls outside the
	 * sequence, no feature is produced for this scan.
	 *
	 * @param data		the sequence to scan
	 * @param prevPos	the previous label position
	 * @param pos		current token position
	 * @return always true
	 */
	public boolean startScanFeaturesAt(DataSequence data, int prevPos, int pos){
		assert(patternString != null);
		this.data = data;
		index = 0;

		final int segmentStart = prevPos + 1;
		left = (relSegmentStart > 0) ? pos + relSegmentStart : segmentStart + relSegmentStart;
		right = (relSegmentEnd >= 0) ? pos + relSegmentEnd : segmentStart + relSegmentEnd;

		// Window not fully inside the sequence: mark the scan as exhausted.
		if (left < 0 || left >= data.length() || right < 0 || right >= data.length()) {
			index = patternString.length;
		}

		// Position on the first pattern that matches at least one token of the window.
		advance();
		return true;
	}
	
	/**
	 * Returns true if there are any more feature(s) for the current scan,
	 * i.e. while the pattern index has not run past the end of the pattern table.
	 *
	 * @return true when index still refers to an entry of patternString
	 */
	public boolean hasNext() {
		return index < patternString.length;
	}

	/**
	 * Generates the next feature for the current scan and moves on to the following
	 * matching pattern.
	 * In feature collection mode the feature is also given a name: "R_" followed by the
	 * pattern name and, for windows wider than one token, the window size and the match
	 * bitmask in binary.
	 *
	 * @param f	the feature object that receives the generated feature
	 */
	public void next(FeatureImpl f) {
		if (featureCollectMode()) {
			// Feature collection mode: report a readable name alongside the id.
			String label = "R_" + patternString[index][0];
			if (window > 1 && curId > 0) {
				label = label + "_" + window + "_" + Integer.toBinaryString(curId);
			}
			f.strId.name = label;
		}

		f.ystart = -1;
		f.val = 1;
		// The id combines the match bitmask with the pattern index scaled by idbase.
		f.strId.id = curId + idbase * index;
		index++;
		advance();
	}

	/**
	 * Moves index forward to the next pattern that matches at least one token in
	 * [left, right] and leaves the match bitmask in curId (bit k corresponds to token
	 * left + k). If no remaining pattern matches, index ends at patternString.length.
	 */
	private void advance(){
		curId = 0;
		while (index < patternString.length) {
			int weight = 1;
			for (int k = left; k <= right; k++, weight *= 2) {
				if (p[index].matcher(data.x(k).toString()).matches()) {
					curId += weight;
				}
			}
			if (curId > 0) {
				return;
			}
			index++;
		}
	}

	/**
	 * Returns the bound on the feature ids of this feature type, computed as
	 * idbase multiplied by the number of patterns.
	 *
	 * @return idbase * patternString.length
	 */
	public int maxFeatureId(){
	    return idbase * patternString.length; //(maximum base i.e. most significant bits + maximum offset)
	}
	/**
	 * Returns the id stored in the given feature, unchanged.
	 * NOTE(review): presumably called by a label-wise wrapper such as
	 * FeatureTypesEachLabel - confirm against the caller.
	 *
	 * @param f	the feature whose id is requested
	 * @return f.strId.id
	 */
	int offsetLabelIndependentId(FeatureImpl f) {
	    return f.strId.id;
    }
};