SplitTokenizer.java example

Explorer
MinorThird-master
/* Copyright 2007, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text;

import java.util.ArrayList;
import java.util.List;

public class SplitTokenizer implements Tokenizer{

	private String splitString;

	public SplitTokenizer(String s){
		this.splitString=s;
	}

	public String getSplitString(){
		return splitString;
	}

	@Override
	public String[] splitIntoTokens(String string){
		return string.split(splitString);
	}

	/** Tokenize a document */
	@Override
	public TextToken[] splitIntoTokens(Document document){
		List<TextToken> tokenList=new ArrayList<TextToken>();
		TextToken[] tokenArray;
		String documentText=document.getText();
		int currPos=0;

		// Split the document text by the specified split string.
		String[] tokenValues=documentText.split(splitString);

		// Create the tokens.
		for(int i=0;i<tokenValues.length;i++){
			// Skip upto the first char in the next token
			currPos=documentText.indexOf(tokenValues[i],currPos);
			// Create the token
			tokenList.add(new TextToken(document,currPos,tokenValues[i].length()));
			// Skip past the text in the token.
			currPos=currPos+tokenValues[i].length();
		}
		tokenArray=tokenList.toArray(new TextToken[0]);

		return tokenArray;
	}
}