/* * Copyright 2011 Peter Karich, jetwick_@_pannous_._info. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.jetwick.es; import de.jetwick.data.JTweet; import de.jetwick.tw.cmd.TermCreateCommand; import de.jetwick.util.Helper; import java.util.Collection; import java.util.LinkedHashSet; import java.util.Map.Entry; import java.util.Set; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; /** * * @author Peter Karich, jetwick_@_pannous_._info */ public class SimilarTweetQuery extends TweetQuery { private double mmBorder = 0.7; private JTweet tweet; /* for tests */ public SimilarTweetQuery() { } public SimilarTweetQuery(JTweet tweet, boolean facets) { super(facets); this.tweet = tweet; if (this.tweet == null) throw new IllegalArgumentException("Tweet cannot be null"); new TermCreateCommand().calcTermsWithoutNoise(tweet); getFilterQueries().clear(); addFilterQuery(ElasticTweetSearch.IS_RT, false); } public double getMmBorder() { return mmBorder; } /** * Set minimal match (percentage) for similar tweet detection when querying */ public SimilarTweetQuery setMmBorder(double mmBorder) { this.mmBorder = mmBorder; return this; } public Collection<String> calcTerms() { Set<String> res = new LinkedHashSet<String>(); for (Entry<String, Integer> e : getTerms()) { res.add(e.getKey()); } return res; } Collection<Entry<String, Integer>> getTerms() { return tweet.getTextTerms().getSortedTermLimited(8); } @Override protected QueryBuilder createQuery(String queryStr) { // use configured stemmer, but querying seems to be slower! // BoolQueryBuilder bqb = QueryBuilders.boolQuery().minimumNumberShouldMatch(minMatchNumber); // for (Entry<String, Integer> entry : terms) { // bqb.should(QueryBuilders.queryString(ElasticTweetSearch.TWEET_TEXT + ":" + Solr2ElasticTweet.escapeQuery(entry.getKey()))); // } // // qb = bqb; Collection<Entry<String, Integer>> terms = getTerms(); int minMatchNumber = (int) Math.round(terms.size() * mmBorder); // maximal 6 terms minMatchNumber = Math.min(6, minMatchNumber); // minimal 4 terms minMatchNumber = Math.max(4, minMatchNumber); // do we need to escape the terms when querying? Collection<String> coll = doSnowballTermsStemming(terms); return QueryBuilders.termsQuery(ElasticTweetSearch.TWEET_TEXT, Helper.toStringArray(coll)). minimumMatch(minMatchNumber); } }