RegexTokenizer.java example

Explorer
MinorThird-master
/* Copyright 2007, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

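/** Tokenizer that splits text into tokens using a regular expression.
 * Each successive match of the regex yields one token, taken from the
 * first capture group of the match. The default regex is looked up under
 * the key "edu.cmu.minorthird.tokenRegex", first in a token.properties
 * file on the classpath and then in the system properties; if neither
 * defines it, a built-in pattern is used.
 */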

public class RegexTokenizer implements Tokenizer{

	private static final Logger log=Logger.getLogger(RegexTokenizer.class);

	/** Property key under which the token-splitting regex is looked up. */
	public static final String TOKEN_REGEX_PROP="edu.cmu.minorthird.tokenRegex";

	/**
	 * Regex used when {@link #TOKEN_REGEX_PROP} is not configured anywhere.
	 * Capture group 1 is the token: a run of digits, a run of ASCII letters,
	 * or a single non-word character; surrounding whitespace is consumed by
	 * the match but is not part of the group.
	 */
	public static final String TOKEN_REGEX_DEFAULT_VALUE=
			"\\s*([0-9]+|[a-zA-Z]+|\\W)\\s*";

	/**
	 * Regex that newly constructed tokenizers start with. Resolved once, when
	 * the class is initialized: the value from token.properties wins, then the
	 * system property, then {@link #TOKEN_REGEX_DEFAULT_VALUE}.
	 */
	public static String standardTokenRegexPattern;
	static{
		Properties props=new Properties();
		InputStream in=null;
		try{
			in=FancyLoader.class.getClassLoader().getResourceAsStream(
					"token.properties");
			if(in!=null){
				props.load(in);
				log.debug("loaded properties from stream "+in);
			}else{
				log.info("no token.properties found on classpath");
			}
		}catch(Exception ex){
			// Best effort: a missing or unreadable file just means defaults apply.
			log.debug("can't open token.properties:"+ex);
		}finally{
			// Properties.load leaves the stream open, so release it here.
			if(in!=null){
				try{
					in.close();
				}catch(Exception ex){
					log.debug("can't close token.properties stream:"+ex);
				}
			}
		}
		standardTokenRegexPattern=
				props.getProperty(TOKEN_REGEX_PROP,System.getProperty(TOKEN_REGEX_PROP,
						TOKEN_REGEX_DEFAULT_VALUE));
		log.info("tokenization regex: "+standardTokenRegexPattern);
	}

	/**
	 * Regex used by this tokenizer instance. Capture group 1 of each match is
	 * taken as the token, so the pattern must define at least one group.
	 */
	public String regexPattern=standardTokenRegexPattern;

	/** Creates a tokenizer that uses {@link #standardTokenRegexPattern}. */
	public RegexTokenizer(){
	}

	/**
	 * Creates a tokenizer that uses the given regex.
	 *
	 * @param pattern regex whose capture group 1 delimits each token
	 */
	public RegexTokenizer(String pattern){
		this.regexPattern=pattern;
	}

	/**
	 * Tokenize a string.
	 *
	 * @param string the text to split
	 * @return the text of capture group 1 for every match, in match order;
	 *         matches in which group 1 did not participate are skipped
	 */
	@Override
	public String[] splitIntoTokens(String string){
		List<String> list=new ArrayList<String>();
		Matcher matcher=matcherFor(string);
		while(matcher.find()){
			String token=matcher.group(1);
			// group(1) is null when the match succeeded through an alternative
			// that bypasses group 1; such a match carries no token.
			if(token!=null){
				list.add(token);
			}
		}
		return list.toArray(new String[list.size()]);
	}

	/**
	 * Tokenize a document.
	 *
	 * @param document the document whose text (as returned by getText()) is split
	 * @return one TextToken per match, constructed from the document, the start
	 *         offset of capture group 1, and the group's length (end minus
	 *         start); matches in which group 1 did not participate are skipped
	 */
	@Override
	public TextToken[] splitIntoTokens(Document document){
		List<TextToken> tokenList=new ArrayList<TextToken>();
		String string=document.getText();
		Matcher matcher=matcherFor(string);

		while(matcher.find()){
			int start=matcher.start(1);
			// start(1) is -1 when group 1 did not participate in the match;
			// building a token from that offset would be invalid.
			if(start>=0){
				tokenList.add(new TextToken(document,start,matcher.end(1)-start));
			}
		}
		return tokenList.toArray(new TextToken[tokenList.size()]);
	}

	/**
	 * Builds a matcher for the current {@link #regexPattern} over the text.
	 * The pattern is compiled on every call because regexPattern is a public,
	 * mutable field and may have been reassigned since the previous call.
	 */
	private Matcher matcherFor(String text){
		return Pattern.compile(regexPattern).matcher(text);
	}
}