// NAnalyzer.java example

/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi.server;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

import de.spartusch.Resources;
import de.spartusch.StringMethods;

/**
 * Lucene analyzer that selects a token pipeline by field name: content is
 * analyzed as a natural language text, as a simple text or as a single token.
 *
 * <p>The stop words and the compound dictionary are loaded once in the
 * constructor and are never reassigned afterwards.</p>
 *
 * @author Stefan Partusch
 */
public class NAnalyzer extends Analyzer {
	/** Lucene compatibility version passed to every tokenizer and filter. */
	private static final Version MATCH_VERSION = Version.LUCENE_35;

	/** Name of the Snowball stemmer applied to both kinds of text. */
	private static final String STEMMER_NAME = "German2";

	/** Charset used to read the word list files. */
	private static final String WORDLIST_CHARSET = "UTF-8";

	/** Position increment gap reported for every field. */
	private static final int POSITION_INCREMENT_GAP = 100;

	/** Names of fields that are analyzed as natural language texts. */
	private static final String[] NATURAL_TEXT_FIELDS
		= new String[] {"titel", "beschreibung", "typ"};

	/** Name of the field whose content is kept as one single token. */
	private static final String SINGLE_TOKEN_FIELD = "semester";

	/** Stop words removed from texts; assigned once in the constructor. */
	private final Set<String> stopWords;
	/** Dictionary for compound splitting; assigned once in the constructor. */
	private final Set<String> compounds;

	/**
	 * Creates a new NAnalyzer and loads its word lists.
	 * @param res Resources to use to locate stop words and compounds
	 * @throws IllegalArgumentException if <code>res</code> returns
	 * <code>null</code> for the stop words file or for the compounds file
	 * @throws RuntimeException if an <code>IOException</code> occurs while
	 * locating or reading the word lists; the original exception is the cause
	 */
	public NAnalyzer(final Resources res) {
		try {
			File swFile =
				res.getFile("nasfvi.StopWords", "/WEB-INF/stopwords.txt");
			if (swFile == null) {
				throw new IllegalArgumentException("Stop words not found");
			}

			File cFile =
				res.getFile("nasfvi.Compounds", "/WEB-INF/komposita.txt");
			if (cFile == null) {
				throw new IllegalArgumentException("Compounds not found");
			}

			stopWords = Resources.getWordlist(swFile, WORDLIST_CHARSET);
			compounds = Resources.getWordlist(cFile, WORDLIST_CHARSET);
		} catch (IOException e) {
			// Keep the unchecked exception type callers already see, but
			// say what failed and preserve the cause.
			throw new RuntimeException(
					"Failed to load word lists for NAnalyzer", e);
		}
	}

	/**
	 * Builds the part of the pipeline shared by natural and simple texts:
	 * standard tokenization, standard filtering, lower-casing and stop word
	 * removal, in that order.
	 * @param reader Input to analyze
	 * @return Lower-cased tokens from <code>reader</code> without stop words
	 */
	private TokenStream lowerCasedWithoutStopWords(final Reader reader) {
		TokenStream result = new StandardTokenizer(MATCH_VERSION, reader);

		result = new StandardFilter(MATCH_VERSION, result);
		result = new LowerCaseFilter(MATCH_VERSION, result);
		result = new StopFilter(MATCH_VERSION, result, stopWords);

		return result;
	}

	/**
	 * Appends the Snowball stemmer as the last stage of a pipeline.
	 * @param input Tokens to stem
	 * @return <code>input</code> wrapped in the stemming filter
	 */
	private TokenStream stemmed(final TokenStream input) {
		return new SnowballFilter(input, STEMMER_NAME);
	}

	/**
	 * Analyzes the input as a text with natural language. In this
	 * implementation natural language refers to German only. Compared to
	 * {@link #simpleText(Reader)} the tokens additionally pass the
	 * dictionary based compound filter before they are stemmed.
	 * @param reader Input to analyze
	 * @return Analyzed and processed tokens from <code>reader</code>
	 */
	private TokenStream naturalText(final Reader reader) {
		TokenStream result = lowerCasedWithoutStopWords(reader);

		// The compound filter has to run before stemming, as it did in the
		// original chain.
		result = new DictionaryCompoundWordTokenFilter(MATCH_VERSION,
				result, compounds);

		return stemmed(result);
	}

	/**
	 * Returns the content of the input as a single token.
	 * @param reader Input to analyze
	 * @return Content of <code>reader</code> as a single token
	 */
	private TokenStream singleToken(final Reader reader) {
		return new KeywordTokenizer(reader);
	}

	/**
	 * Analyzes the input as a simple text. In this implementation this
	 * includes stop word removal and stemming but no compound handling.
	 * @param reader Input to analyze
	 * @return Analyzed and processed tokens from <code>reader</code>
	 */
	private TokenStream simpleText(final Reader reader) {
		return stemmed(lowerCasedWithoutStopWords(reader));
	}

	/**
	 * Returns the same position increment gap for every field.
	 * @param fieldName Name of the field; not used
	 * @return Always {@value #POSITION_INCREMENT_GAP}
	 */
	@Override
	public final int getPositionIncrementGap(final String fieldName) {
		return POSITION_INCREMENT_GAP;
	}

	/**
	 * Creates the token stream for a field. Fields listed in
	 * {@link #NATURAL_TEXT_FIELDS} are analyzed as natural language texts,
	 * the field <code>semester</code> is returned as a single token and
	 * every other field is analyzed as a simple text.
	 * @param fieldName Name of the field the content belongs to
	 * @param reader Content of the field
	 * @return Token stream matching the kind of field
	 */
	@Override
	public final TokenStream tokenStream(final String fieldName,
			final Reader reader) {
		if (StringMethods.equalsOneOf(fieldName, NATURAL_TEXT_FIELDS)) {
			return naturalText(reader);
		} else if (SINGLE_TOKEN_FIELD.equals(fieldName)) {
			return singleToken(reader);
		} else {
			return simpleText(reader);
		}
	}
}