// Copyright 2017 JanusGraph Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package org.janusgraph.core.attribute;

import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import org.janusgraph.graphdb.query.JanusGraphPredicate;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.tinkerpop.gremlin.process.traversal.P;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;

/**
 * Comparison relations for text objects.
 * <p>
 * The {@code CONTAINS*} predicates are based on a tokenized representation of the text,
 * i.e. the text is considered as a set of word tokens (see {@link #tokenize(String)}).
 * {@link #PREFIX}, {@link #REGEX} and {@link #FUZZY} compare against the text as a whole.
 *
 * @author Matthias Broecheler (me@matthiasb.com)
 */
public enum Text implements JanusGraphPredicate {

    /**
     * Whether the text contains a given term as a token in the text (case insensitive).
     * If the condition consists of several tokens, all of them must be present in the text.
     */
    CONTAINS {
        @Override
        public boolean evaluateRaw(String value, String terms) {
            final Set<String> tokens = Sets.newHashSet(tokenize(lowerCase(value)));
            terms = terms.trim();
            final List<String> tokenTerms = tokenize(lowerCase(terms));
            // A non-empty condition that yields no token (e.g. a single character) can never match.
            if (!terms.isEmpty() && tokenTerms.isEmpty()) {
                return false;
            }
            return tokens.containsAll(tokenTerms);
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String && StringUtils.isNotBlank((String) condition);
        }
    },

    /**
     * Whether the text contains a token that starts with a given term (case insensitive)
     */
    CONTAINS_PREFIX {
        @Override
        public boolean evaluateRaw(String value, String prefix) {
            // Loop invariant: lower-case the prefix once instead of once per token.
            final String lowerPrefix = lowerCase(prefix);
            for (String token : tokenize(lowerCase(value))) {
                if (PREFIX.evaluateRaw(token, lowerPrefix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String;
        }
    },

    /**
     * Whether the text contains a token that matches a regular expression.
     * The tokens are lower-cased before matching; the regular expression is used as given.
     */
    CONTAINS_REGEX {
        @Override
        public boolean evaluateRaw(String value, String regex) {
            for (String token : tokenize(lowerCase(value))) {
                if (REGEX.evaluateRaw(token, regex)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String && StringUtils.isNotBlank((String) condition);
        }
    },

    /**
     * Whether the text starts with a given prefix (case sensitive).
     * The prefix is trimmed before the comparison.
     */
    PREFIX {
        @Override
        public boolean evaluateRaw(String value, String prefix) {
            return value.startsWith(prefix.trim());
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String;
        }
    },

    /**
     * Whether the text matches a regular expression (case sensitive)
     */
    REGEX {
        @Override
        public boolean evaluateRaw(String value, String regex) {
            return value.matches(regex);
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String && StringUtils.isNotBlank((String) condition);
        }
    },

    /**
     * Whether the text is within Levenshtein distance X of the given term (case sensitive)
     * with X=:
     * - 0 for terms of one or two characters
     * - 1 for terms of three, four or five characters
     * - 2 for terms of more than five characters
     */
    FUZZY {
        @Override
        public boolean evaluateRaw(String value, String term) {
            return isFuzzy(term.trim(), value.trim());
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String && StringUtils.isNotBlank((String) condition);
        }
    },

    /**
     * Whether the text contains a token that is within Levenshtein distance X of the given term
     * (case insensitive) with X=:
     * - 0 for terms of one or two characters
     * - 1 for terms of three, four or five characters
     * - 2 for terms of more than five characters
     */
    CONTAINS_FUZZY {
        @Override
        public boolean evaluateRaw(String value, String term) {
            // Loop invariant: lower-case the term once instead of once per token.
            final String lowerTerm = lowerCase(term);
            for (String token : tokenize(lowerCase(value))) {
                if (isFuzzy(lowerTerm, token)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public boolean isValidCondition(Object condition) {
            return condition instanceof String && StringUtils.isNotBlank((String) condition);
        }
    };

    private static final Logger log = LoggerFactory.getLogger(Text.class);

    /**
     * Tokens must be strictly longer than this many characters to be kept by
     * {@link #tokenize(String)}.
     */
    private static final int MIN_TOKEN_LENGTH = 1;

    /**
     * Evaluates this predicate for the given value and condition.
     * <p>
     * The condition is validated first; a {@code null} value never matches. Any other value is
     * converted with {@code toString()} and handed to {@link #evaluateRaw(String, String)}.
     *
     * @param value     the value to test, may be {@code null}
     * @param condition the condition to test against; must satisfy {@code isValidCondition}
     * @return true if the value satisfies this predicate for the given condition
     * @throws IllegalArgumentException if the condition is not valid for this predicate
     */
    @Override
    public boolean test(Object value, Object condition) {
        preevaluate(value, condition);
        if (value == null) {
            return false;
        }
        // The cast is safe: every isValidCondition implementation only accepts Strings.
        return evaluateRaw(value.toString(), (String) condition);
    }

    /**
     * Validates the condition and logs (at debug level) values that are not strings.
     *
     * @param value     the value about to be evaluated
     * @param condition the condition about to be evaluated
     * @throws IllegalArgumentException if the condition is not valid for this predicate
     */
    public void preevaluate(Object value, Object condition) {
        Preconditions.checkArgument(this.isValidCondition(condition), "Invalid condition provided: %s", condition);
        if (!(value instanceof String)) {
            // Parameterized logging avoids building the message when debug logging is disabled.
            log.debug("Value not a string: {}", value);
        }
    }

    /**
     * Evaluates this predicate on the string form of the value and an already validated condition.
     *
     * @param value     the text to test
     * @param condition the term, prefix or regular expression to test against
     * @return true if the text satisfies this predicate
     */
    abstract boolean evaluateRaw(String value, String condition);

    /**
     * Splits the given string into tokens at every character that is neither a letter nor a digit.
     * <p>
     * Only tokens of at least two characters are returned; single-character tokens are dropped.
     * The case of the input is left untouched.
     *
     * @param str the string to tokenize
     * @return the tokens in order of appearance, possibly empty
     */
    public static List<String> tokenize(String str) {
        final List<String> tokens = new ArrayList<>();
        int previous = 0;
        for (int p = 0; p < str.length(); p++) {
            if (!Character.isLetterOrDigit(str.charAt(p))) {
                if (p > previous + MIN_TOKEN_LENGTH) {
                    tokens.add(str.substring(previous, p));
                }
                previous = p + 1;
            }
        }
        // Trailing token that is not followed by a separator.
        if (previous + MIN_TOKEN_LENGTH < str.length()) {
            tokens.add(str.substring(previous));
        }
        return tokens;
    }

    @Override
    public boolean isValidValueType(Class<?> clazz) {
        Preconditions.checkNotNull(clazz);
        return clazz.equals(String.class);
    }

    @Override
    public boolean hasNegation() {
        return false;
    }

    @Override
    public JanusGraphPredicate negate() {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean isQNF() {
        return true;
    }

    /**
     * Whether {@code term} is within Levenshtein distance X of {@code value}
     * with X=:
     * - 0 for terms of one or two characters
     * - 1 for terms of three, four or five characters
     * - 2 for terms of more than five characters
     * The length is measured on the trimmed term.
     *
     * @param term  the search term; it is trimmed before use
     * @param value the text to compare the term with
     * @return true if {@code term} is similar to {@code value}
     */
    private static boolean isFuzzy(String term, String value) {
        term = term.trim();
        final int maxDistance;
        if (term.length() < 3) {
            maxDistance = 0;
        } else if (term.length() < 6) {
            maxDistance = 1;
        } else {
            maxDistance = 2;
        }
        return LevenshteinDistance.getDefaultInstance().apply(value, term) <= maxDistance;
    }

    /**
     * Lower-cases the text with {@link Locale#ROOT} so that case-insensitive matching does not
     * depend on the default locale of the JVM (e.g. the Turkish dotless i).
     */
    private static String lowerCase(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    //////////////// statics

    public static <V> P<V> textContains(final V value) {
        return predicate(Text.CONTAINS, value);
    }

    public static <V> P<V> textContainsPrefix(final V value) {
        return predicate(Text.CONTAINS_PREFIX, value);
    }

    public static <V> P<V> textContainsRegex(final V value) {
        return predicate(Text.CONTAINS_REGEX, value);
    }

    public static <V> P<V> textPrefix(final V value) {
        return predicate(Text.PREFIX, value);
    }

    public static <V> P<V> textRegex(final V value) {
        return predicate(Text.REGEX, value);
    }

    public static <V> P<V> textContainsFuzzy(final V value) {
        return predicate(Text.CONTAINS_FUZZY, value);
    }

    public static <V> P<V> textFuzzy(final V value) {
        return predicate(Text.FUZZY, value);
    }

    /**
     * Wraps a text predicate and its condition into a TinkerPop {@link P}.
     * The raw constructor call is confined to this helper so the warning suppression stays local.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static <V> P<V> predicate(final Text text, final V value) {
        return new P(text, value);
    }
}