/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc.
 */
package com.bigdata.search;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * An analyzer intended for the term-completion use case, particularly
 * for technical vocabularies and concept schemes.
 * <p>
 * This analyzer generates several index terms for each word in the input.
 * These are intended to match short sequences (e.g. three or more characters)
 * of user input, to then give the user a drop-down list of matching terms.
 * <p>
 * This can be set up to address issues like matching <q>half-time</q> when the user types
 * <q>tim</q>, or when the user types <q>halft</q> (treating the hyphen as a soft hyphen); or
 * to match <q>TermCompletionAnalyzer</q> when the user types <q>Ana</q>.
 * <p>
 * In contrast, the standard Lucene analyzers are mainly geared around the free-text
 * search use case.
 * <p>
 * The intended use cases will typically involve a prefix query of the form:
 * <pre>
 * ?t bds:search "prefix*" .
 * </pre>
 * to find all literals in the selected graphs which are indexed by a term starting in <q>prefix</q>.
 * The problem this class addresses is thus finding the appropriate index terms to allow
 * matching, at sensible points, mid-way through words (such as at hyphens).
 * <p>
 * To get maximum effectiveness it may be best to use private-use language subtags (see RFC 5646),
 * e.g. <code>"x-term"</code>,
 * which are mapped to this class by {@link ConfigurableAnalyzerFactory} for
 * the data being loaded into the store, and linked to some very simple process
 * like {@link KeywordAnalyzer} for queries, which are tagged with a different language tag
 * that is only used for <code>bds:search</code>, e.g. <code>"x-query"</code>.
 * The above prefix query then becomes:
 * <pre>
 * ?t bds:search "prefix*"@x-query .
 * </pre>
 *
 * @author jeremycarroll
 */
public class TermCompletionAnalyzer extends Analyzer {

    private final Pattern wordBoundary;
    private final Pattern subWordBoundary;
    private final Pattern discard;
    private final boolean alwaysDiscard;
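    /*
     * A minimal usage sketch, added for illustration (not part of the
     * original class). The patterns are assumptions chosen to match the
     * <q>half-time</q> example in the class comment: whitespace separates
     * words, and a hyphen is both the sub-word boundary and the soft hyphen.
     * Expected output: halftime, half-time, time.
     */
    public static void main(String[] args) throws IOException {
        final Analyzer analyzer = new TermCompletionAnalyzer(
                Pattern.compile("\\s+"), // word boundary: whitespace
                Pattern.compile("-"),    // sub-word boundary: hyphen
                Pattern.compile("-"),    // soft hyphen: discarded from matches
                false);                  // false: also emit the undiscarded form
        final TokenStream ts = analyzer.tokenStream("field", new StringReader("half-time"));
        final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }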
    /**
     * Divide the input into words and short tokens
     * as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}.
     * Each term is generated, and then an additional term
     * is generated with the soft hyphens (defined by the pattern)
     * removed. If the alwaysRemoveSoftHyphens flag is true,
     * then the first term (before the removal) is suppressed.
     *
     * @param wordBoundary The definition of space (e.g. " ")
     * @param subWordBoundary Also index after matches to this (e.g. "-")
     * @param softHyphens Discard these characters from matches
     * @param alwaysRemoveSoftHyphens If false, the discard step is optional.
     */
    public TermCompletionAnalyzer(Pattern wordBoundary,
            Pattern subWordBoundary,
            Pattern softHyphens,
            boolean alwaysRemoveSoftHyphens) {
        this.wordBoundary = wordBoundary;
        this.subWordBoundary = subWordBoundary;
        if (softHyphens != null) {
            discard = softHyphens;
            alwaysDiscard = alwaysRemoveSoftHyphens;
        } else {
            discard = Pattern.compile("(?!)"); // never matches
            alwaysDiscard = true;
        }
    }

    /**
     * Divide the input into words, separated by the wordBoundary,
     * and return a token for each whole word, then
     * generate further tokens for each word by removing prefixes
     * up to and including each successive match of
     * subWordBoundary.
     *
     * @param wordBoundary
     * @param subWordBoundary
     */
    public TermCompletionAnalyzer(Pattern wordBoundary, Pattern subWordBoundary) {
        this(wordBoundary, subWordBoundary, null, true);
    }

    /**
     * This class has three processes going on,
     * all driven from the {@link #incrementToken()} method:
     *
     * - iterating over the words in the input: the words are identified
     *   in the constructor, and the iteration is performed by
     *   {@link #nextWord()};
     *
     * - identifying the subword boundaries, in {@link #next()}: we then
     *   set up {@link #found} to contain the most recently found subword;
     *
     * - discarding soft hyphens, in {@link #maybeDiscardHyphens()}: if we
     *   are not {@link #alwaysDiscard}ing, then {@link #afterDiscard} can
     *   be set to null to return the non-discarded version on the next
     *   cycle.
     */
    private class TermCompletionTokenStream extends TokenStream {

        final String[] words;
        final CharTermAttribute termAtt;

        char currentWord[] = new char[]{};
        Matcher softMatcher;
        int currentWordIx = -1;
        int charPos = 0;

        private String afterDiscard;
        private CharBuffer found;

        public TermCompletionTokenStream(final Reader reader) {
            termAtt = addAttribute(CharTermAttribute.class);
            words = wordBoundary.split(getStringReaderContents(reader));
        }

        @Override
        public boolean incrementToken() throws IOException {
            clearAttributes(); // per the TokenStream contract
            if (next()) {
                if (afterDiscard != null) {
                    // Emit the variant with the soft hyphens removed.
                    int lg = afterDiscard.length();
                    afterDiscard.getChars(0, lg, termAtt.buffer(), 0);
                    termAtt.setLength(lg);
                } else {
                    // Emit the current (sub)word as-is.
                    int lg = found.length();
                    found.get(termAtt.buffer(), 0, lg);
                    termAtt.setLength(lg);
                }
                return true;
            } else {
                return false;
            }
        }
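        /*
         * Illustrative trace, added as commentary (not from the original
         * source). Assuming subWordBoundary and softHyphens are both "-",
         * with alwaysRemoveSoftHyphens == false, the single word
         * <q>half-time</q> drives next() through these states:
         *
         *   call 1: nextWord() -> found = "half-time", afterDiscard = "halftime"; emits "halftime"
         *   call 2: afterDiscard reset to null; emits "half-time"
         *   call 3: softMatcher.find(1) matches "-" at index 4 -> charPos = 5,
         *           found = "time", nothing to discard; emits "time"
         *   call 4: no more boundaries and no more words -> returns false
         */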
        private boolean next() {
            if (currentWordIx >= words.length) {
                return false;
            }
            if (!alwaysDiscard) {
                // Last match was the discarded version,
                // now do the non-discarded version.
                if (afterDiscard != null) {
                    afterDiscard = null;
                    return true;
                }
            }
            afterDiscard = null;
            if (charPos + 1 < currentWord.length && softMatcher.find(charPos + 1)) {
                // Step past the next sub-word boundary within the current word.
                charPos = softMatcher.end();
                maybeDiscardHyphens();
                return true;
            } else {
                return nextWord();
            }
        }

        void maybeDiscardHyphens() {
            // The candidate token is the tail of the current word, from charPos onward.
            found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
            Matcher discarding = discard.matcher(found);
            if (discarding.find()) {
                // Prepare the variant with the soft hyphens stripped out.
                afterDiscard = discarding.replaceAll("");
            }
        }

        private boolean nextWord() {
            currentWordIx++;
            if (currentWordIx >= words.length) {
                return false;
            }
            currentWord = words[currentWordIx].toCharArray();
            termAtt.resizeBuffer(currentWord.length);
            charPos = 0;
            softMatcher = subWordBoundary.matcher(words[currentWordIx]);
            maybeDiscardHyphens();
            return true;
        }
    }

    static String getStringReaderContents(Reader reader) {
        try {
            // Measure the stream by skipping to the end, then rewind (via
            // mark/reset, which e.g. StringReader supports) and read the
            // whole content into a single String.
            reader.mark(Integer.MAX_VALUE);
            int length = (int) reader.skip(Integer.MAX_VALUE);
            reader.reset();
            char fileContent[] = new char[length];
            reader.read(fileContent);
            reader.reset();
            return new String(fileContent);
        } catch (IOException e) {
            throw new RuntimeException("Impossible", e);
        }
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        return new TokenStreamComponents(source) {
            private Reader reader;

            @Override
            protected void setReader(Reader reader) {
                // Capture the reader so that getTokenStream() can hand it
                // to our custom stream instead of the StandardTokenizer.
                this.reader = reader;
                super.setReader(reader);
            }

            @Override
            public TokenStream getTokenStream() {
                return new TermCompletionTokenStream(reader);
            }
        };
    }
}
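/*
 * Configuration sketch, added for illustration: the property keys below
 * follow the ConfigurableAnalyzerFactory option style mentioned in the
 * class comment, but are assumptions and should be checked against that
 * class before use. The idea is to index literals tagged "x-term" with
 * this analyzer, while "x-query" (used only in bds:search queries) is
 * handled by the much simpler KeywordAnalyzer.
 *
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.analyzerClass=com.bigdata.search.TermCompletionAnalyzer
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.wordBoundary=\s
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.subWordBoundary=[-]
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.softHyphens=[-]
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.alwaysRemoveSoftHyphens=false
 * com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-query.analyzerClass=org.apache.lucene.analysis.core.KeywordAnalyzer
 */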