/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing.tokenizers;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
/**
 * Token filter that breaks identifier-like terms into their component words.
 * <p>
 * A term is split at camel-case boundaries and at underscores, e.g.
 * {@code fooBar} yields {@code foo}, {@code Bar} and {@code CONSTANT_NAME}
 * yields {@code CONSTANT}, {@code NAME}. Only the ASCII letters {@code A-Z}
 * and {@code a-z} are recognized when looking for case changes. Dots are not
 * treated as separators by this class.
 */
public class CamelCaseTokenizer extends TermSplitTokenizer {

    /** Upper-case letter immediately followed by a lower-case one: the start of a capitalized word. */
    private static final Pattern WORD_START = Pattern.compile("([A-Z][a-z])");

    /** Lower-case letter immediately followed by an upper-case one, e.g. the {@code eH} in {@code parseHTML}. */
    private static final Pattern LOWER_TO_UPPER = Pattern.compile("([a-z])([A-Z])");

    /** Separator on which the marked-up term is finally cut. */
    private static final Pattern UNDERSCORE = Pattern.compile("[_]");

    /**
     * Creates a tokenizer on top of the given stream.
     *
     * @param in the token stream handed to the superclass constructor
     */
    public CamelCaseTokenizer(TokenStream in) {
        super(in);
    }

    /**
     * Splits a term into words at camel-case boundaries and underscores.
     * <p>
     * The implementation first inserts an underscore at every detected word
     * boundary and then splits on underscores, so underscores already present
     * in the term (as in {@code CONSTANT_NAMES}) act as separators too.
     * <p>
     * The result may contain empty strings: a term that begins with a
     * capitalized word such as {@code FooBar} produces a leading empty entry
     * ({@code "", "Foo", "Bar"}), and adjacent underscores produce empty
     * entries between them. Trailing empty entries are dropped by the split.
     *
     * @param term the term to split
     * @return the parts of the term, in order of appearance
     */
    @Override
    public String[] splitTerm(String term) {
        // Mark the start of every capitalized word ("Ab") with an underscore.
        // Not ideal (it also prefixes a word at the very start of the term), but short.
        String marked = WORD_START.matcher(term).replaceAll("_$1");

        // Mark lower->upper transitions that the first pass cannot see,
        // e.g. "parseHTML" -> "parse_HTML".
        marked = LOWER_TO_UPPER.matcher(marked).replaceAll("$1_$2");

        // Cutting on underscores also splits names that already contain them.
        return UNDERSCORE.split(marked);
    }

    /**
     * Always returns {@code true}.
     * <p>
     * NOTE(review): presumably this tells the base class to emit the unsplit
     * term in addition to its parts; confirm against
     * {@code TermSplitTokenizer}.
     *
     * @return {@code true}
     */
    @Override
    protected boolean returnOriginalTerm() {
        return true;
    }
}