/* Copyright 2007, Carnegie Mellon, All Rights Reserved */
package edu.cmu.minorthird.text;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/** Maintains information about what's in a set of documents.
* Specifically, this contains a set of character sequences (TextToken's)
* from some sort of set of containing documents - typically found by
* tokenization.
*/
public class RegexTokenizer implements Tokenizer{
private static Logger log=Logger.getLogger(RegexTokenizer.class);
/** How to split tokens up */
public static final String TOKEN_REGEX_PROP="edu.cmu.minorthird.tokenRegex";
public static final String TOKEN_REGEX_DEFAULT_VALUE=
"\\s*([0-9]+|[a-zA-Z]+|\\W)\\s*";
public static String standardTokenRegexPattern;
static{
Properties props=new Properties();
try{
InputStream in=
FancyLoader.class.getClassLoader().getResourceAsStream(
"token.properties");
if(in!=null){
props.load(in);
log.debug("loaded properties from stream "+in);
}else{
log.info("no token.properties found on classpath");
}
}catch(Exception ex){
log.debug("can't open token.properties:"+ex);
}
standardTokenRegexPattern=
props.getProperty(TOKEN_REGEX_PROP,System.getProperty(TOKEN_REGEX_PROP,
TOKEN_REGEX_DEFAULT_VALUE));
log.info("tokenization regex: "+standardTokenRegexPattern);
}
public String regexPattern=standardTokenRegexPattern;
public RegexTokenizer(){
}
public RegexTokenizer(String pattern){
this.regexPattern=pattern;
}
/** Tokenize a string. */
@Override
public String[] splitIntoTokens(String string){
List<String> list=new ArrayList<String>();
Pattern pattern=Pattern.compile(regexPattern);
Matcher matcher=pattern.matcher(string);
while(matcher.find()){
list.add(matcher.group(1));
}
return list.toArray(new String[list.size()]);
}
/** Tokenize a document. */
@Override
public TextToken[] splitIntoTokens(Document document){
List<TextToken> tokenList=new ArrayList<TextToken>();
TextToken[] tokenArray;
String string=document.getText();
Pattern pattern=Pattern.compile(regexPattern);
Matcher matcher=pattern.matcher(string);
while(matcher.find()){
tokenList.add(new TextToken(document,matcher.start(1),matcher.end(1)-
matcher.start(1)));
}
tokenArray=tokenList.toArray(new TextToken[0]);
return tokenArray;
}
}