package com.ringtone.server;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;

/**
 * Tokenization helpers shared by the search indexing and query paths.
 *
 * <p>Strips HTML tags, then runs the text through Lucene's
 * {@link SnowballAnalyzer} (English stemming) with a fixed English
 * stop-word list, collecting the distinct stemmed terms.
 */
public class SearchJanitorUtils {

    private static final Logger log =
            Logger.getLogger(SearchJanitorUtils.class.getName());

    /**
     * English stop words, copied from Lucene 2.9.1's StopAnalyzer.
     * NOTE(review): public mutable array — callers could modify it in place;
     * kept as-is for backward compatibility.
     */
    public final static String[] stopWords = new String[]{
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with"
    };

    /**
     * Extracts distinct stemmed tokens from raw (possibly HTML) text, for use
     * when building the search index or parsing a query.
     *
     * <p>Uses English stemming (Snowball via Lucene) plus the {@link #stopWords}
     * list. HTML tags are removed (replaced by a space) before analysis.
     *
     * @param index_raw                     raw text, may contain HTML tags;
     *                                      {@code null} yields an empty set
     * @param maximumNumberOfTokensToReturn stop collecting once this many
     *                                      distinct tokens have been gathered
     * @return the set of distinct stemmed tokens (never {@code null});
     *         empty on analyzer I/O failure
     */
    public static Set<String> getTokensForIndexingOrQuery(
            String index_raw, int maximumNumberOfTokensToReturn) {

        Set<String> returnSet = new HashSet<String>();
        if (index_raw == null) {
            // Robustness: previously a null input caused an NPE in replaceAll.
            return returnSet;
        }

        // Replace tags with a space (not "") so adjacent words stay separated.
        String indexCleanedOfHTMLTags = index_raw.replaceAll("\\<.*?>", " ");

        try {
            Analyzer analyzer = new SnowballAnalyzer(
                    org.apache.lucene.util.Version.LUCENE_CURRENT,
                    "English",
                    stopWords);
            TokenStream tokenStream = analyzer.tokenStream(
                    "content",
                    new StringReader(indexCleanedOfHTMLTags));

            // Bug fix: the original allocated `new Token()` only to overwrite
            // it immediately with tokenStream.next(); a bare declaration is enough.
            Token token;
            while (((token = tokenStream.next()) != null)
                    && (returnSet.size() < maximumNumberOfTokensToReturn)) {
                returnSet.add(token.term());
            }
        } catch (IOException e) {
            // Bug fix: log.severe(e.getMessage()) dropped the stack trace and
            // could log "null"; preserve the full exception instead.
            log.log(Level.SEVERE, "Failed to tokenize input for index/query", e);
        }
        return returnSet;
    }
}