//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.search.analysis;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import com.ibm.icu.text.BreakIterator;

/**
 * A text tokenizer that uses ICU4J to segment text into words.
 * <p>
 * The whole content of the reader is loaded into memory by the constructor;
 * {@link #next()} then walks the word boundaries reported by an ICU4J
 * {@link BreakIterator}. Each returned token is lower-cased and has a trailing
 * <code>'s</code> removed. Segments made up only of white space are skipped.
 *
 * @author Kelvin Low
 * @since 1.0
 */
public final class TextTokenizer extends Tokenizer {

	/** Size, in characters, of the chunks read from the reader. */
	private final static int BUFFER_SIZE = 4096;

	/** Suffix stripped from the end of a token. */
	private final static String POSSESSIVE_SUFFIX = "'s"; //$NON-NLS-1$

	/** The complete text read from the reader; unset if reading failed. */
	private String text;

	/** Word boundary iterator over {@link #text}; <code>null</code> if reading failed. */
	private BreakIterator iterator;

	/**
	 * Creates a new instance.
	 * <p>
	 * The reader is drained immediately. If reading fails with an
	 * {@link IOException} the tokenizer is left without an iterator and
	 * {@link #next()} returns <code>null</code> on the first call.
	 *
	 * @param reader
	 *            the text source
	 */
	public TextTokenizer(Reader reader) {
		super(reader);

		StringBuffer textBuffer = new StringBuffer(BUFFER_SIZE);
		char[] buffer = new char[BUFFER_SIZE];
		int charsRead;
		try {
			while ((charsRead = reader.read(buffer, 0, BUFFER_SIZE)) > 0) {
				textBuffer.append(buffer, 0, charsRead);
			}
			text = textBuffer.toString();
			iterator = BreakIterator.getWordInstance();
			iterator.setText(text);
		} catch (IOException e) {
			// Deliberate best effort: a constructor matching the existing
			// signature cannot throw a checked exception, so an unreadable
			// source is treated as producing no tokens.
			iterator = null;
		}
	}

	/**
	 * Returns the next word token, or <code>null</code> when the text is
	 * exhausted (or could not be read).
	 * <p>
	 * The token text is the lower-cased segment with a trailing
	 * <code>'s</code> removed. The token's start and end offsets are the
	 * positions, within the source text, of the characters the term was
	 * taken from.
	 *
	 * @return the next token, or <code>null</code> if there are no more
	 * @see org.apache.lucene.analysis.TokenStream#next()
	 */
	public final Token next() throws IOException {
		if (iterator == null) {
			return null;
		}

		while (true) {
			int start = iterator.current();
			int end = iterator.next();
			if (end == BreakIterator.DONE) {
				return null;
			}

			String segment = text.substring(start, end);
			if (isBlank(segment)) {
				// Tabs, line breaks and runs of spaces are separators, not terms.
				continue;
			}

			String tokenText = segment.toLowerCase();
			int endOffset = end;
			if (tokenText.endsWith(POSSESSIVE_SUFFIX)) {
				tokenText = tokenText.substring(0, tokenText.length()
						- POSSESSIVE_SUFFIX.length());
				// The suffix is two source characters ("'s" or "'S").
				endOffset = end - POSSESSIVE_SUFFIX.length();
			}
			if (tokenText.length() == 0) {
				// The segment consisted of the suffix only; nothing to index.
				continue;
			}

			return new Token(tokenText, start, endOffset);
		}
	}

	/**
	 * Checks whether a string is empty or consists solely of white space or
	 * space characters.
	 *
	 * @param value
	 *            the string to check
	 * @return <code>true</code> if the string has no non-space character
	 */
	private static boolean isBlank(String value) {
		for (int i = 0; i < value.length(); i++) {
			char ch = value.charAt(i);
			if (!Character.isWhitespace(ch) && !Character.isSpaceChar(ch)) {
				return false;
			}
		}
		return true;
	}
}