SolrStopwordsCarrot2LexicalDataFactory.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;

/**
 * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
 * words from a field's StopFilter (and common words from a field's
 * CommonGramsFilter) to the default stop words used in Carrot2, for all
 * languages Carrot2 supports. Completely replacing Carrot2 stop words with
 * Solr's wouldn't make much sense because clustering needs more aggressive
 * stop words removal. In other words, if something is a stop word during
 * indexing, then it should also be a stop word during clustering, but not the
 * other way round.
 * 
 * @lucene.experimental
 */
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFactory {

  /**
   * The Solr core whose latest schema is consulted for the index analyzers of
   * the clustered fields. Bound from the {@code solrCore} attribute.
   */
  @Init
  @Input
  @Attribute(key = "solrCore")
  public SolrCore core;

  /**
   * Names of the Solr fields whose stop words should be honoured. Bound from
   * the {@code solrFieldNames} attribute; when left unbound ({@code null}),
   * only Carrot2's own stop words are used.
   */
  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  public Set<String> fieldNames;

  /**
   * A lazily-built cache of stop word sets per field name. An entry is only
   * added once the sets for that field have been collected successfully.
   * All access is guarded by synchronizing on the map itself.
   */
  private final HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  public DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains the stop word sets for a field, collecting them from the field's
   * index analyzer on first use and serving them from the cache afterwards.
   *
   * @param fieldName name of the Solr field to look up
   * @return the (possibly empty) list of word sets contributed by the field's
   *         {@link StopFilterFactory} and {@link CommonGramsFilterFactory}
   *         instances; never {@code null}
   */
  private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // The ILexicalData objects handed out by getLexicalData() call back into
    // this method, so the lock keeps the lazily-built map consistent should
    // they ever be used from more than one thread.
    synchronized (solrStopWords) {
      List<CharArraySet> cached = solrStopWords.get(fieldName);
      if (cached == null) {
        // Collect first, cache second: if the schema lookup throws, no
        // half-initialized (empty) entry is left behind to silently mask the
        // failure on subsequent calls.
        cached = collectStopWordsFromSchema(fieldName);
        solrStopWords.put(fieldName, cached);
      }
      return cached;
    }
  }

  /**
   * Walks the token filter chain of the field's index analyzer and gathers
   * the word sets of every {@link StopFilterFactory} and
   * {@link CommonGramsFilterFactory} found there. Analyzers that are not a
   * {@link TokenizerChain} contribute nothing.
   *
   * @param fieldName name of the Solr field to inspect
   * @return a new, possibly empty, list of word sets
   */
  private List<CharArraySet> collectStopWordsFromSchema(String fieldName) {
    final List<CharArraySet> collected = new ArrayList<>();

    final IndexSchema schema = core.getLatestSchema();
    final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
    if (!(fieldAnalyzer instanceof TokenizerChain)) {
      return collected;
    }

    final TokenFilterFactory[] filterFactories =
        ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
    for (TokenFilterFactory factory : filterFactories) {
      if (factory instanceof StopFilterFactory) {
        // StopFilterFactory holds the stop words in a CharArraySet
        addIfPresent(collected, ((StopFilterFactory) factory).getStopWords());
      }

      if (factory instanceof CommonGramsFilterFactory) {
        addIfPresent(collected, ((CommonGramsFilterFactory) factory).getCommonWords());
      }
    }
    return collected;
  }

  /**
   * Appends {@code words} to {@code target} unless it is {@code null}, so
   * that a factory without a word set cannot cause a failure later when the
   * sets are queried.
   */
  private static void addIfPresent(List<CharArraySet> target, CharArraySet words) {
    if (words != null) {
      target.add(words);
    }
  }

  /**
   * Returns lexical data that treats a word as common if any of the
   * configured Solr fields lists it as a stop/common word, or if Carrot2's
   * default lexical data for the language does. Stop labels are delegated to
   * Carrot2 unchanged.
   *
   * @param languageCode the language for which Carrot2's defaults are loaded
   * @return lexical data combining Solr's and Carrot2's stop words
   */
  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first. The field is
        // read on every call so that a rebinding of fieldNames is honoured;
        // an unbound (null) value simply means "no Solr fields".
        final Set<String> fields = fieldNames;
        if (fields != null) {
          for (String fieldName : fields) {
            for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
              if (stopWords.contains(word)) {
                return true;
              }
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}