/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing.tokenizers;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
/**
 * Splits terms into words at every character that is not allowed inside a word.
 * <p>
 * Characters kept as part of a word are ASCII letters, ASCII digits, the
 * characters U+00E0 through U+00FF, underscore, apostrophe, period and hyphen.
 * Every other character acts as a separator.
 */
public class WordSplitTokenizer extends TermSplitTokenizer {

    /**
     * Matches a single separator character, i.e. anything outside the set of
     * characters that may appear inside a word. Compiled once because
     * {@link #splitTerm(String)} is invoked for every term.
     * <p>
     * NOTE(review): the range U+00E0-U+00FF also contains the division sign
     * (U+00F7) and leaves out the upper-case accented letters U+00C0-U+00DF;
     * confirm whether that is intended before changing it, since it affects
     * what ends up in the index.
     */
    private static final Pattern SEPARATOR =
            Pattern.compile("[^A-Za-z0-9\u00E0-\u00FF_'.-]");

    /**
     * Creates a tokenizer on top of the given stream.
     *
     * @param in the token stream handed to the {@link TermSplitTokenizer} constructor
     */
    public WordSplitTokenizer(TokenStream in) {
        super(in);
    }

    /**
     * Splits the term around each separator character.
     * <p>
     * The result follows {@link Pattern#split(CharSequence)} semantics: adjacent
     * or leading separators can yield empty strings, trailing empty strings are
     * dropped, and a term without separators is returned as a single element.
     *
     * @param term the term to split; must not be {@code null}
     * @return the pieces of {@code term} found between separator characters
     */
    @Override
    public String[] splitTerm(String term) {
        return SEPARATOR.split(term);
    }
}