package org.ansj.solr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnsjTokenizerFactory extends TokenizerFactory {

    public final Logger logger = LoggerFactory.getLogger(getClass());

    private final boolean pstemming;
    private final boolean isQuery;
    private final String stopwordsDir;
    public Set<String> filter;

    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        // "isQuery" selects the query-time segmenter (ToAnalysis) over the
        // index-time segmenter (IndexAnalysis); defaults to true.
        isQuery = getBoolean(args, "isQuery", true);
        pstemming = getBoolean(args, "pstemming", false);
        stopwordsDir = get(args, "words");
        addStopwords(stopwordsDir);
    }

    /**
     * Loads the stopword list, one word per line (UTF-8), from the given file.
     *
     * @param dir path to the stopword file, or null to skip stopword filtering
     */
    private void addStopwords(String dir) {
        if (dir == null) {
            logger.info("no stopwords dir");
            return;
        }
        logger.info("stopwords: {}", dir);
        filter = new HashSet<String>();
        File file = new File(dir);
        // try-with-resources closes the whole reader chain, including the
        // underlying FileInputStream, even when an exception is thrown.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String word = br.readLine();
            while (word != null) {
                filter.add(word);
                word = br.readLine();
            }
        } catch (FileNotFoundException e) {
            logger.warn("stopword file not found: {}", dir);
        } catch (IOException e) {
            logger.warn("error reading stopword file: {}", dir, e);
        }
    }

    @Override
    public Tokenizer create(AttributeFactory factory, Reader input) {
        if (isQuery) {
            return new AnsjTokenizer(new ToAnalysis(input), input, filter, pstemming);
        } else {
            return new AnsjTokenizer(new IndexAnalysis(input), input, filter, pstemming);
        }
    }
}
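
/*
 * A minimal usage sketch, not part of this file's original source: how this
 * factory might be registered in a Solr schema.xml. The fieldType name
 * "text_ansj" and the stopword path "stopwords.txt" are illustrative
 * assumptions; the isQuery, pstemming, and words attributes map directly to
 * the arguments read in the constructor above.
 *
 * <fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer type="index">
 *     <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"
 *                isQuery="false" pstemming="false" words="stopwords.txt"/>
 *   </analyzer>
 *   <analyzer type="query">
 *     <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"
 *                isQuery="true" pstemming="false" words="stopwords.txt"/>
 *   </analyzer>
 * </fieldType>
 *
 * Using IndexAnalysis at index time and ToAnalysis at query time follows the
 * split this factory's isQuery flag encodes: finer-grained segmentation for
 * indexing, standard segmentation for queries.
 */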