/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.analysis;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class SpanishAnalyzer extends Analyzer {
/**
* An array containing some common Spanish words that are usually not
* useful for searching. Imported from http://www.unine.ch/info/clef/.
* http://members.unine.ch/jacques.savoy/clef/spanishSmart.txt
*/
private static final String SPANISH_STOP_WORDS[] = {
"él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió",
"aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún",
"algo", "alguna", "algunas", "alguno", "algunos", "alrededor", "ambos", "ante", "anterior", "antes",
"apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "bajo", "bien", "buen",
"buena", "buenas", "bueno", "buenos", "cómo", "cada", "casi", "cerca", "cierto", "cinco", "comentó",
"como", "con", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cual", "cuales",
"cualquier", "cuando", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe",
"deben", "debido", "decir", "dejó", "del", "demás", "dentro", "desde", "después", "dice", "dicen",
"dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante",
"e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", "embargo", "en", "encuentra", "entonces",
"entre", "era", "eran", "es", "esa", "esas", "ese", "eso", "esos", "está", "están", "esta", "estaba",
"estaban", "estamos", "estar", "estará", "estas", "este", "esto", "estos", "estoy", "estuvo", "ex",
"existe", "existen", "explicó", "expresó", "fin", "fue", "fuera", "fueron", "gran", "grandes", "ha",
"había", "habían", "haber", "habrá", "hace", "hacen", "hacer", "hacerlo", "hacia", "haciendo",
"han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual",
"incluso", "indicó", "informó", "junto", "la", "lado", "las", "le", "les", "llegó", "lleva",
"llevar", "lo", "los", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante",
"mejor", "mencionó", "menos", "mi", "mientras", "misma", "mismas", "mismo", "mismos", "momento",
"mucha", "muchas", "mucho", "muchos", "muy", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas",
"ninguno", "ningunos", "no", "nos", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro",
"nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otro",
"otros", "para", "parece", "parte", "partir", "pasada", "pasado", "pero", "pesar", "poca", "pocas",
"poco", "pocos", "podemos", "podrá", "podrán", "podría", "podrían", "poner", "por", "porque",
"posible", "próximo", "próximos", "primer", "primera", "primero", "primeros", "principalmente",
"propia", "propias", "propio", "propios", "pudo", "pueda", "puede", "pueden", "pues", "qué", "que",
"quedó", "queremos", "quién", "quien", "quienes", "quiere", "realizó", "realizado", "realizar",
"respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis",
"ser", "será", "serán", "sería", "si", "sido", "siempre", "siendo", "siete", "sigue", "siguiente",
"sin", "sino", "sobre", "sola", "solamente", "solas", "solo", "solos", "son", "su", "sus", "tal",
"también", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenemos", "tener", "tenga",
"tengo", "tenido", "tercera", "tiene", "tienen", "toda", "todas", "todavía", "todo", "todos",
"total", "tras", "trata", "través", "tres", "tuvo", "un", "una", "unas", "uno", "unos", "usted",
"va", "vamos", "van", "varias", "varios", "veces", "ver", "vez", "y", "ya", "yo" };
/**
* Contains the stopwords used with the StopFilter.
*/
private Set<Object> stopTable = new HashSet<Object>();
/**
* Contains words that should be indexed but not stemmed.
*/
@SuppressWarnings("unused")
private Set<Object> exclTable = new HashSet<Object>();
/**
* Builds an analyzer with the default stop words.
*/
@SuppressWarnings("unchecked")
public SpanishAnalyzer() {
stopTable = StopFilter.makeStopSet(SPANISH_STOP_WORDS);
}
/** Builds an analyzer with the given stop words. */
@SuppressWarnings("unchecked")
public SpanishAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopSet(stopWords);
}
/**
* Builds an analyzer with the given stop words from file.
* @throws IOException
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public SpanishAnalyzer(File stopWords) throws IOException {
stopTable = new HashSet(WordlistLoader.getWordSet(stopWords));
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
* StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
* and a {@link SpanishStemFilter}. */
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopTable);
result = new SpanishStemFilter(result);
return result;
}
}