/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.filter.MailtoFilter;
import org.sindice.siren.analysis.filter.URIDecodingFilter;
import org.sindice.siren.analysis.filter.URILocalnameFilter;
import org.sindice.siren.analysis.filter.URINormalisationFilter;
import org.sindice.siren.analysis.filter.URITrailingSlashFilter;
/**
* Analyzer designed to deal with any kind of URIs and perform some
* post-processing on URIs.
* <p>
* The URI normalisation can be configured using
* {@link #setUriNormalisation(URINormalisation)}. You can disable it, activate
* it only on URI local name, or on the full URI. However, URI normalisation on the
* full URI is costly in terms of CPU at indexing time, and can double the size
* of the index, since each URI is duplicated by n tokens. By default, the URI
* normalisation is disabled.
*
* @see URINormalisationFilter
* @see URILocalnameFilter
*/
public class AnyURIAnalyzer extends Analyzer {

  /**
   * An unmodifiable set containing some common English words that are usually
   * not useful for searching.
   */
  public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /** Types of URI normalisation */
  public enum URINormalisation {NONE, LOCALNAME, FULL}

  /** Stop words removed from the token stream by the {@link StopFilter}. */
  private final CharArraySet stopSet;

  /** Lucene version passed to the version-dependent tokenizer and filters. */
  private final Version matchVersion;

  /** Selected URI normalisation; never {@code null}. */
  private URINormalisation normalisationType = URINormalisation.NONE;

  /**
   * Creates an analyzer that uses the default {@link #STOP_WORDS_SET}.
   *
   * @param version the Lucene version handed to the tokenizer and filters
   */
  public AnyURIAnalyzer(final Version version) {
    this(version, STOP_WORDS_SET);
  }

  /**
   * Creates an analyzer that uses the given stop word set. The set is stored
   * as is (no copy is made).
   *
   * @param version the Lucene version handed to the tokenizer and filters
   * @param stopWords the stop words to remove from the token stream
   */
  public AnyURIAnalyzer(final Version version, final CharArraySet stopWords) {
    stopSet = stopWords;
    matchVersion = version;
  }

  /**
   * Creates an analyzer whose stop word set is built from the given array
   * with {@link StopFilter#makeStopSet}.
   *
   * @param version the Lucene version handed to the tokenizer and filters
   * @param stopWords the stop words to remove from the token stream
   */
  public AnyURIAnalyzer(final Version version, final String[] stopWords) {
    matchVersion = version;
    stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
  }

  /**
   * Creates an analyzer whose stop words are loaded from the given file.
   * <p>
   * The file is opened with a {@link FileReader}, i.e., it is decoded with the
   * default charset of the JVM, and then processed like in
   * {@link #AnyURIAnalyzer(Version, Reader)}.
   *
   * @param version the Lucene version handed to the tokenizer and filters
   * @param stopwords the file containing the stop words
   * @throws IOException if the file cannot be opened or read
   */
  public AnyURIAnalyzer(final Version version, final File stopwords) throws IOException {
    this(version, new FileReader(stopwords));
  }

  /**
   * Creates an analyzer whose stop words are loaded from the given reader
   * with {@link WordlistLoader#getWordSet}; the content must be in the format
   * expected by that method.
   *
   * @param version the Lucene version handed to the tokenizer and filters
   * @param stopWords the reader providing the stop words
   * @throws IOException if the stop words cannot be read
   */
  public AnyURIAnalyzer(final Version version, final Reader stopWords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopWords, version);
    matchVersion = version;
  }

  /**
   * Selects the type of URI normalisation applied by this analyzer.
   * <p>
   * NOTE(review): the new value is only read when
   * {@link #createComponents(String, Reader)} is invoked; if Lucene reuses
   * previously created components, they presumably keep the old setting, so
   * this should be called before the analyzer is first used - to be confirmed
   * against the reuse strategy in use.
   *
   * @param n the normalisation type, must not be {@code null}
   * @throws IllegalArgumentException if {@code n} is {@code null}
   */
  public void setUriNormalisation(final URINormalisation n) {
    // Fail fast: a null type would otherwise only surface later as a
    // NullPointerException in the switch of applyURINormalisation.
    if (n == null) {
      throw new IllegalArgumentException("URI normalisation type must not be null");
    }
    normalisationType = n;
  }

  /**
   * Builds the analysis chain: whitespace tokenisation, URI decoding (UTF-8),
   * the configured URI normalisation, mailto filtering, lower-casing, stop
   * word removal and finally a length filter with bounds 2 and 256.
   */
  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final WhitespaceTokenizer source = new WhitespaceTokenizer(matchVersion, reader);

    TokenStream sink = new URIDecodingFilter(source, "UTF-8");
    sink = this.applyURINormalisation(sink);
    sink = new MailtoFilter(sink);
    sink = new LowerCaseFilter(matchVersion, sink);
    sink = new StopFilter(matchVersion, sink, stopSet);
    sink = new LengthFilter(true, sink, 2, 256);
    return new TokenStreamComponents(source, sink);
  }

  /**
   * Given the type of URI normalisation, apply the right sequence of operations
   * and filters to the token stream.
   *
   * @param in the token stream to wrap
   * @return the token stream wrapped with the filters matching the current
   *         normalisation type
   */
  private TokenStream applyURINormalisation(TokenStream in) {
    switch (normalisationType) {
      case NONE:
        return new URITrailingSlashFilter(in);

      case LOCALNAME:
        // here, trailing slash filter is after localname filtering, in order to
        // avoid filtering subdirectory instead of localname
        in = new URILocalnameFilter(in);
        return new URITrailingSlashFilter(in);

      case FULL:
        // here, trailing slash filter is before localname filtering, in order to
        // avoid trailing slash checking on every tokens generated by the
        // URI normalisation filter
        in = new URITrailingSlashFilter(in);
        return new URINormalisationFilter(in);

      default:
        throw new EnumConstantNotPresentException(URINormalisation.class,
          normalisationType.toString());
    }
  }

}