/*
 * DrakkarKeel - An Enterprise Collaborative Search Platform
 *
 * The contents of this file are subject under the terms described in the
 * DRAKKARKEEL_LICENSE file included in this distribution; you may not use this
 * file except in compliance with the License.
 *
 * 2013-2014 DrakkarKeel Platform.
 */
package drakkar.mast.retrieval.analysis;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Analyzer for indexing and searching the XML documents of the Wikipedia
 * collection with Lucene, using a stop-word filter.
 *
 * <p>The analysis chain is a {@link LowerCaseTokenizer} followed by a
 * {@link StopFilter} built from this analyzer's stop-word set. The set starts
 * empty; populate it with {@link #loadDefaultStopWords()},
 * {@link #loadStopWords(String)} or {@link #setStopWords(Set)}.
 */
public class DefaultAnalyzer extends Analyzer {

    private static final long serialVersionUID = 1879843534577L;

    /** Language used to pick the default stop-word file. */
    protected Language lang;

    /** Stop words handed to the {@link StopFilter}; never {@code null}. */
    protected Set<String> stopWords;

    /**
     * Creates an analyzer for {@link Language#EN} with an empty stop-word set.
     */
    public DefaultAnalyzer() {
        this(Language.EN);
    }

    /**
     * Creates an analyzer for the given language with an empty stop-word set.
     *
     * @param lang language used to locate the default stop-word file
     */
    public DefaultAnalyzer(Language lang) {
        this.lang = lang;
        this.stopWords = new HashSet<String>();
    }

    /**
     * Builds a fresh analysis chain: lower-casing tokenizer followed by the
     * stop-word filter. Stemming (PorterStemFilter) is currently not applied.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new LowerCaseTokenizer(reader);
        stream = new StopFilter(true, stream, stopWords, true);
        return stream;
    }

    /**
     * Returns the analysis chain saved by a previous call, reset onto
     * {@code reader}; on the first call the chain is created and saved through
     * {@link #setPreviousTokenStream(Object)}.
     *
     * <p>NOTE(review): the saved {@link StopFilter} is constructed from the stop
     * words present when the chain is first created. If the Lucene version in
     * use copies the set inside the filter (confirm against its StopFilter
     * source), stop words loaded afterwards will not be seen by the saved
     * chain. Load stop words before the analyzer is first used.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the reusable filtered token stream
     * @throws IOException if resetting the saved tokenizer fails
     */
    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream();
        if (streams == null) {
            streams = new SavedStreams();
            streams.source = new LowerCaseTokenizer(reader);
            streams.result = new StopFilter(true, streams.source, stopWords, true);
            setPreviousTokenStream(streams);
        } else {
            streams.source.reset(reader);
        }
        return streams.result;
    }

    /**
     * Holder for a saved analysis chain: the {@link LowerCaseTokenizer} source
     * and the {@link StopFilter} wrapped around it.
     */
    protected class SavedStreams {

        Tokenizer source;
        TokenStream result;
    }

    /**
     * Loads the stop words for the current language from
     * {@code conf/<lang>_stop_words.txt}, where {@code <lang>} is the string
     * form of {@link #getLanguage()}. The words are added to the current set.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public void loadDefaultStopWords() throws IOException {
        load("conf/" + lang + "_stop_words.txt");
    }

    /**
     * Switches the analyzer to {@code lang} and loads that language's default
     * stop-word file. The words are added to the current set.
     *
     * @param lang the language to set and load stop words for
     * @throws IOException if the file cannot be opened or read
     */
    public void loadDefaultStopWords(Language lang) throws IOException {
        setLanguage(lang);
        loadDefaultStopWords();
    }

    /**
     * Loads stop words from the given file and adds them to the current set.
     *
     * @param filePath path of the stop-word file
     * @throws IOException if the file cannot be opened or read
     */
    public void loadStopWords(String filePath) throws IOException {
        load(filePath);
    }

    /**
     * Reads a stop-word file and adds its entries to {@link #stopWords}. Each
     * line is split on commas; entries are trimmed of surrounding whitespace
     * and empty entries are ignored, because a stop word carrying whitespace
     * can never match a token produced by the letter-only tokenizer.
     *
     * <p>A failure while closing the file is logged and not propagated.
     *
     * @param filePath path of the stop-word file
     * @throws IOException if the file cannot be opened or read
     */
    protected void load(String filePath) throws IOException {
        BufferedReader bf = null;
        try {
            bf = new BufferedReader(new FileReader(filePath));
            String cad = bf.readLine();
            while (cad != null) {
                StringTokenizer token = new StringTokenizer(cad, ",");
                while (token.hasMoreTokens()) {
                    String word = token.nextToken().trim();
                    if (word.length() > 0) {
                        stopWords.add(word);
                    }
                }
                cad = bf.readLine();
            }
        } finally {
            // bf is still null when opening the file failed; guarding here keeps
            // the original IOException from being replaced by a NullPointerException.
            if (bf != null) {
                try {
                    bf.close();
                } catch (IOException ex) {
                    Logger.getLogger(DefaultAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
    }

    /**
     * @return the Language
     */
    public Language getLanguage() {
        return lang;
    }

    /**
     * @param lang the language to set
     */
    public void setLanguage(Language lang) {
        this.lang = lang;
    }

    /**
     * Returns a read-only view of the stop words. The view is backed by the
     * analyzer's own set, so later loads are visible through it.
     *
     * @return the stop words
     */
    public Set<String> getStopWords() {
        return Collections.unmodifiableSet(stopWords);
    }

    /**
     * Replaces the current stop words with the given ones.
     *
     * <p>The argument is copied before the internal set is cleared, so passing
     * the view returned by {@link #getStopWords()} keeps its contents, and a
     * {@code null} argument fails without modifying the current set.
     *
     * @param stopWords the stop words to set
     * @throws NullPointerException if {@code stopWords} is {@code null}
     */
    public void setStopWords(Set<String> stopWords) {
        Set<String> snapshot = new HashSet<String>(stopWords);
        this.stopWords.clear();
        this.stopWords.addAll(snapshot);
    }

    /**
     * Misspelled legacy name kept for backward compatibility; behaves exactly
     * like {@link #setStopWords(Set)}, which should be preferred.
     *
     * @param stopWords the stop words to set
     * @throws NullPointerException if {@code stopWords} is {@code null}
     */
    public void setStopords(Set<String> stopWords) {
        setStopWords(stopWords);
    }
}