/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
/**
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
* words from a field's StopFilter to the default stop words used in Carrot2,
* for all languages Carrot2 supports. Completely replacing Carrot2 stop words
* with Solr's wouldn't make much sense because clustering needs more aggressive
* stop words removal. In other words, if something is a stop word during
* indexing, then it should also be a stop word during clustering, but not the
* other way round.
*
* @lucene.experimental
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFactory {
@Init
@Input
@Attribute(key = "solrCore")
public SolrCore core;
@Processing
@Input
@Attribute(key = "solrFieldNames")
public Set<String> fieldNames;
/**
* A lazily-built cache of stop words per field.
*/
private HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();
/**
* Carrot2's default lexical resources to use in addition to Solr's stop
* words.
*/
public DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
/**
* Obtains stop words for a field from the associated
* {@link StopFilterFactory}, if any.
*/
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
// No need to synchronize here, Carrot2 ensures that instances
// of this class are not used by multiple threads at a time.
synchronized (solrStopWords) {
if (!solrStopWords.containsKey(fieldName)) {
solrStopWords.put(fieldName, new ArrayList<>());
IndexSchema schema = core.getLatestSchema();
final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
if (fieldAnalyzer instanceof TokenizerChain) {
final TokenFilterFactory[] filterFactories =
((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
for (TokenFilterFactory factory : filterFactories) {
if (factory instanceof StopFilterFactory) {
// StopFilterFactory holds the stop words in a CharArraySet
CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
solrStopWords.get(fieldName).add(stopWords);
}
if (factory instanceof CommonGramsFilterFactory) {
CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
solrStopWords.get(fieldName).add(commonWords);
}
}
}
}
return solrStopWords.get(fieldName);
}
}
@Override
public ILexicalData getLexicalData(LanguageCode languageCode) {
final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
.getLexicalData(languageCode);
return new ILexicalData() {
@Override
public boolean isStopLabel(CharSequence word) {
// Nothing in Solr maps to the concept of a stop label,
// so return Carrot2's default here.
return carrot2LexicalData.isStopLabel(word);
}
@Override
public boolean isCommonWord(MutableCharArray word) {
// Loop over the fields involved in clustering first
for (String fieldName : fieldNames) {
for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
if (stopWords.contains(word)) {
return true;
}
}
}
// Check default Carrot2 stop words too
return carrot2LexicalData.isCommonWord(word);
}
};
}
}