/* * Copyright (c) 2009 Andrejs Jermakovics. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Andrejs Jermakovics - initial implementation */ package it.unibz.instasearch.indexing; import it.unibz.instasearch.indexing.tokenizers.CamelCaseTokenizer; import it.unibz.instasearch.indexing.tokenizers.DotSplitTokenizer; import it.unibz.instasearch.indexing.tokenizers.WordSplitTokenizer; import it.unibz.instasearch.indexing.tokenizers.standard.StandardTokenizer; import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordTokenizer; import org.apache.lucene.analysis.LengthFilter; import org.apache.lucene.analysis.TokenStream; /** * Analyzer for user entered search queries */ public class QueryAnalyzer extends Analyzer { private static final int MAX_WORD_LENGTH = 128; private int minWordLength; public QueryAnalyzer(int minWordLength) { super(); this.minWordLength = minWordLength; } @Override public TokenStream tokenStream(String fieldName, Reader reader) { if( Field.CONTENTS.toString().equals(fieldName) ) { TokenStream result = new StandardTokenizer(reader); // splits at ". ", "-" result = new WordSplitTokenizer(result); // non-alphanumerics result = new DotSplitTokenizer(result); // com.package.names result = new CamelCaseTokenizer(result); // CamelCaseIdentifiers //result = new LowerCaseFilter(result); result = new LengthFilter(result, minWordLength, MAX_WORD_LENGTH); return result; } else { // PROJECT, EXT fields return new KeywordTokenizer(reader); // return whole stream contents as token } } }