/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.solr.step;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
import no.trank.openpipe.api.MultiInputFieldPipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.document.AnnotatedField;
import no.trank.openpipe.api.document.Annotation;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.config.annotation.NotNull;
import no.trank.openpipe.solr.analysis.AnnotationTokenStream;
import no.trank.openpipe.solr.analysis.TokenStreamAnnotation;
/**
 * A <tt>PipelineStep</tt> for running Solr tokenizer-/analyzer-chains in OpenPipe.<br/>
 * <br/>
 * Similar to the Solr config, this step takes zero or one <tt>TokenizerFactory</tt> and zero or more
 * <tt>TokenFilterFactories</tt>. At least one of the two must be configured, otherwise {@link #prepare()} fails.<br/>
 * <br/>
 * As an alternative to specifying a <tt>TokenizerFactory</tt>, you may set the annotation set (for instance if you
 * use another tokenizer, but still want to use filters from Solr). When no <tt>TokenizerFactory</tt> is set, the
 * token stream is built from the field's existing annotations and the configured annotation set instead of from
 * the raw field text.<br/>
 * <br/>
 * If you want to tokenize in the pipeline, the Solr <tt>schema.xml</tt> is a good source of examples of how to set
 * up a tokenizer and filter chain.<br/>
 * <br/>
 * If you are using Spring to supply the <tt>TokenFilterFactories</tt>, you can use
 * <tt>no.trank.openpipe.solr.util.TokenFilterFactoryFactory</tt> to initialize the filter factories.
 *
 * @see no.trank.openpipe.solr.util.TokenFilterFactoryFactory
 * @version $Revision$
 */
public class SolrAnalyzerStep extends MultiInputFieldPipelineStep {
    /** Optional tokenizer; when <tt>null</tt> the token stream is built from existing annotations. */
    private TokenizerFactory tokenizerFactory;
    /** Filters applied, in list order, on top of the token source. Never <tt>null</tt>. */
    private List<TokenFilterFactory> filterFactories = Collections.emptyList();
    /** Annotation types handed to the <tt>AnnotationTokenStream</tt> when no tokenizer is configured. */
    @NotNull
    private Set<String> annotations = Collections.emptySet();

    /**
     * Runs the configured analyzer chain on every value of the given field.
     *
     * @param doc the document being processed (not used directly by this step)
     * @param fieldName the name of the field being processed (not used directly by this step)
     * @param fieldValues the field values to analyze; each one receives the produced annotations
     * @throws PipelineException if the underlying token stream throws an <tt>IOException</tt>
     */
    @Override
    protected void process(Document doc, String fieldName, List<AnnotatedField> fieldValues) throws PipelineException {
        try {
            for (AnnotatedField value : fieldValues) {
                processFilters(value);
            }
        } catch (IOException e) {
            throw new PipelineException(e);
        }
    }

    /**
     * Builds the token stream for a single field value, runs it through the filter chain and stores the
     * resulting annotations back on the field value.
     *
     * @param value the field value to analyze and annotate
     * @throws IOException if processing the token stream fails
     */
    private void processFilters(AnnotatedField value) throws IOException {
        // Token source: the configured tokenizer over the raw text, or the field's existing annotations.
        final TokenStream source;
        if (tokenizerFactory != null) {
            source = tokenizerFactory.create(new StringReader(value.getValue()));
        } else {
            source = new AnnotationTokenStream(value, annotations);
        }

        final TokenStreamAnnotation collector = new TokenStreamAnnotation(createFilters(source));
        collector.process();

        // Named differently from the 'annotations' field on purpose, to avoid shadowing it.
        final Map<String, List<Annotation>> produced = collector.getAnnotations();
        for (Map.Entry<String, List<Annotation>> entry : produced.entrySet()) {
            value.set(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Wraps the given stream in all configured filters, in list order.
     *
     * @param stream the token source to wrap
     * @return the outermost filter, or <tt>stream</tt> itself if no filters are configured
     */
    private TokenStream createFilters(TokenStream stream) {
        TokenStream filter = stream;
        for (TokenFilterFactory factory : filterFactories) {
            filter = factory.create(filter);
        }
        return filter;
    }

    /**
     * Validates the configuration after delegating to the superclass.
     *
     * @throws PipelineException if neither a tokenizer factory nor any filter factory has been configured
     */
    @Override
    public void prepare() throws PipelineException {
        super.prepare();
        if (tokenizerFactory == null && filterFactories.isEmpty()) {
            throw new PipelineException("Either tokenizerFactory or filterFactories must be provided");
        }
    }

    @Override
    public String getRevision() {
        return "$Revision$";
    }

    /**
     * Gets the Solr <tt>TokenizerFactory</tt> that is used.
     *
     * @return the TokenizerFactory, or <tt>null</tt> if none has been set
     */
    public TokenizerFactory getTokenizerFactory() {
        return tokenizerFactory;
    }

    /**
     * Sets the Solr <tt>TokenizerFactory</tt> to use.
     *
     * @param tokenizerFactory a TokenizerFactory
     */
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /**
     * Gets the list of Solr <tt>TokenFilterFactories</tt> that is used.
     *
     * @see #setFilterFactories(java.util.List)
     * @return a list of TokenFilterFactory objects, never <tt>null</tt>
     */
    public List<TokenFilterFactory> getFilterFactories() {
        return filterFactories;
    }

    /**
     * Sets the list of initialized Solr <tt>TokenFilterFactories</tt> to use for analysis.
     * <tt>null</tt> is treated as an empty list, so that {@link #prepare()} reports a missing
     * configuration with a <tt>PipelineException</tt> instead of failing with a <tt>NullPointerException</tt>.
     *
     * @param filterFactories the filterFactories to use
     */
    public void setFilterFactories(List<TokenFilterFactory> filterFactories) {
        if (filterFactories == null) {
            this.filterFactories = Collections.emptyList();
        } else {
            this.filterFactories = filterFactories;
        }
    }

    /**
     * Gets the set of annotation types to treat as tokens. Used only if <tt>TokenizerFactory</tt> is not set.
     *
     * @return a set of Strings
     */
    public Set<String> getAnnotations() {
        return annotations;
    }

    /**
     * Sets the set of annotation types to treat as tokens. Used only if <tt>TokenizerFactory</tt> is not set.
     *
     * @param annotations a set of Strings
     */
    public void setAnnotations(Set<String> annotations) {
        this.annotations = annotations;
    }
}