/* Copyright 2007, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Tokenizer that cuts text apart at every match of a delimiter.
 *
 * <p>The delimiter is handed to {@link String#split(String)} and
 * {@link Pattern#compile(String)}, so it is interpreted as a regular
 * expression, not as a literal string.
 */
public class SplitTokenizer implements Tokenizer {

  /** Delimiter regular expression supplied at construction time. */
  private String splitString;

  /**
   * Creates a tokenizer that splits on the given delimiter.
   *
   * @param s delimiter, interpreted as a regular expression
   */
  public SplitTokenizer(String s) {
    this.splitString = s;
  }

  /**
   * Returns the delimiter this tokenizer was constructed with.
   *
   * @return the delimiter regular expression
   */
  public String getSplitString() {
    return splitString;
  }

  /**
   * Splits a plain string around matches of the delimiter.
   *
   * @param string text to split
   * @return the pieces exactly as produced by {@link String#split(String)}
   */
  @Override
  public String[] splitIntoTokens(String string) {
    return string.split(splitString);
  }

  /**
   * Tokenizes a document, producing one {@link TextToken} per piece of text
   * lying between delimiter matches.
   *
   * <p>Offsets are taken directly from the delimiter matches. Searching for
   * each token's text with {@code indexOf} is unreliable: the search can land
   * inside the preceding delimiter whenever the delimiter contains the token's
   * text (for example delimiter {@code "ab"} on {@code "xabb"}), and empty
   * tokens would be reported at the end of the previous token.
   *
   * @param document document whose text (from {@code getText()}) is tokenized
   * @return tokens in document order; empty pieces are kept, as in the String
   *     overload, and appear as zero-length tokens
   */
  @Override
  public TextToken[] splitIntoTokens(Document document) {
    String documentText = document.getText();
    List<int[]> spans = findTokenSpans(documentText);

    List<TextToken> tokenList = new ArrayList<TextToken>(spans.size());
    for (int[] span : spans) {
      // TextToken takes (document, start offset, length).
      tokenList.add(new TextToken(document, span[0], span[1] - span[0]));
    }
    return tokenList.toArray(new TextToken[0]);
  }

  /**
   * Computes the {start, end} offsets (end exclusive) of every piece that
   * {@link String#split(String)} would return for {@code text}, following
   * the Java 8+ splitting rules.
   */
  private List<int[]> findTokenSpans(String text) {
    List<int[]> spans = new ArrayList<int[]>();
    Matcher delimiter = Pattern.compile(splitString).matcher(text);

    int tokenStart = 0;
    while (delimiter.find()) {
      // A zero-width match at the very beginning yields no leading empty piece.
      if (tokenStart == 0 && delimiter.start() == 0 && delimiter.end() == 0) {
        continue;
      }
      spans.add(new int[] {tokenStart, delimiter.start()});
      tokenStart = delimiter.end();
    }

    // No delimiter consumed anything: the whole text is the single piece
    // (even when the text is empty, matching "".split(...) returning [""]).
    if (tokenStart == 0) {
      spans.add(new int[] {0, text.length()});
      return spans;
    }

    // Remainder after the last delimiter.
    spans.add(new int[] {tokenStart, text.length()});

    // split() discards trailing empty pieces; do the same so both overloads
    // report the same number of tokens.
    int size = spans.size();
    while (size > 0 && spans.get(size - 1)[0] == spans.get(size - 1)[1]) {
      size--;
    }
    return new ArrayList<int[]>(spans.subList(0, size));
  }
}