/* * Copyright 2014, Stratio. * Modification and adapations - Copyright 2014, Tuplejump Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.tuplejump.stargate.lucene.query; import com.tuplejump.stargate.lucene.Options; import com.tuplejump.stargate.lucene.Properties; import com.tuplejump.stargate.lucene.Type; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.codehaus.jackson.annotate.JsonCreator; import org.codehaus.jackson.annotate.JsonProperty; /** * A {@link Condition} that implements the fuzzy search query. The similarity measurement is based on the * Damerau-Levenshtein (optimal string alignment) algorithm, though you can explicitly choose classic Levenshtein by * passing {@code false} to the {@code transpositions} parameter. */ public class FuzzyCondition extends Condition implements Selector { public final static int DEFAULT_MAX_EDITS = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int DEFAULT_PREFIX_LENGTH = 0; public final static int DEFAULT_MAX_EXPANSIONS = 50; public final static boolean DEFAULT_TRANSPOSITIONS = true; /** * The field name */ private final String field; /** * The field value */ private String value; private final Integer maxEdits; private final Integer prefixLength; private final Integer maxExpansions; private final Boolean transpositions; /** * Returns a new {@link FuzzyCondition}. * * @param boost The boost for this query clause. Documents matching this clause will (in addition to the normal * weightings) have their score multiplied by {@code boost}. If {@code null}, then DEFAULT_BOOST * is used as default. * @param field The field name. * @param value The field fuzzy value. * @param maxEdits Must be greater or equal to 0 and less than or equal to {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}'. * @param prefixLength Length of common (non-fuzzy) prefix * @param maxExpansions The maximum number of terms to match. If this number is greater than * {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, then the maxClauseCount will be * used instead. * @param transpositions True if transpositions should be treated as a primitive edit operation. If this is false, comparisons * will implement the classic Levenshtein algorithm. */ @JsonCreator public FuzzyCondition(@JsonProperty("boost") Float boost, @JsonProperty("field") String field, @JsonProperty("value") String value, @JsonProperty("maxEdits") Integer maxEdits, @JsonProperty("prefixLength") Integer prefixLength, @JsonProperty("maxExpansions") Integer maxExpansions, @JsonProperty("transpositions") Boolean transpositions) { super(boost); this.field = field != null ? field.toLowerCase() : null; this.value = value; this.maxEdits = maxEdits == null ? DEFAULT_MAX_EDITS : maxEdits; this.prefixLength = prefixLength == null ? DEFAULT_PREFIX_LENGTH : prefixLength; this.maxExpansions = maxExpansions == null ? DEFAULT_MAX_EXPANSIONS : maxExpansions; this.transpositions = transpositions == null ? DEFAULT_TRANSPOSITIONS : transpositions; } @Override public Automaton getAutomaton(Options schema) { Properties properties = schema.getProperties(field); String message; Type fieldType = properties != null ? properties.getType() : Type.text; if (fieldType == Type.string || fieldType == Type.text) { String analyzedValue = analyze(field, value, schema.analyzer); LevenshteinAutomata levenshteinAutomata = new LevenshteinAutomata(analyzedValue, transpositions); return levenshteinAutomata.toAutomaton(maxEdits); } message = String.format("Fuzzy queries cannot be supported for field type %s", fieldType); throw new UnsupportedOperationException(message); } /** * Returns the field name. * * @return the field name. */ public String getField() { return field; } /** * Returns the field value. * * @return the field value. */ public String getValue() { return value; } /** * Returns the Damerau-Levenshtein max distance. * * @return The Damerau-Levenshtein max distance. */ public Integer getMaxEdits() { return maxEdits; } /** * Returns the length of common (non-fuzzy) prefix. * * @return The length of common (non-fuzzy) prefix. */ public Integer getPrefixLength() { return prefixLength; } /** * Returns the maximum number of terms to match. * * @return The maximum number of terms to match. */ public Integer getMaxExpansions() { return maxExpansions; } /** * Returns if transpositions should be treated as a primitive edit operation. * * @return If transpositions should be treated as a primitive edit operation. */ public Boolean getTranspositions() { return transpositions; } /** * {@inheritDoc} */ @Override public Query query(Options schema) { if (field == null || field.trim().isEmpty()) { throw new IllegalArgumentException("Field name required"); } if (value == null || value.trim().isEmpty()) { throw new IllegalArgumentException("Field value required"); } if (maxEdits < 0 || maxEdits > 2) { throw new IllegalArgumentException("max_edits must be between 0 and 2"); } if (prefixLength < 0) { throw new IllegalArgumentException("prefix_length must be positive."); } if (maxExpansions < 0) { throw new IllegalArgumentException("max_expansions must be positive."); } Properties properties = schema.getProperties(field); String message; Type fieldType = properties != null ? properties.getType() : Type.text; if (fieldType == Type.string || fieldType == Type.text) { String analyzedValue = analyze(field, value, schema.analyzer); Term term = new Term(field, analyzedValue); Query query = new FuzzyQuery(term, maxEdits, prefixLength, maxExpansions, transpositions); return query; } message = String.format("Fuzzy queries cannot be supported for field type %s", fieldType); throw new UnsupportedOperationException(message); } @Override public String getType() { return "fuzzy"; } /** * {@inheritDoc} */ @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append(getClass().getSimpleName()); builder.append(" [boost="); builder.append(boost); builder.append(", field="); builder.append(field); builder.append(", value="); builder.append(value); builder.append(", maxEdits="); builder.append(maxEdits); builder.append(", prefixLength="); builder.append(prefixLength); builder.append(", maxExpansions="); builder.append(maxExpansions); builder.append(", transpositions="); builder.append(transpositions); builder.append("]"); return builder.toString(); } }