/**
* Query
* Copyright 26.04.2015 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.objects;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.json.JSONException;
import org.json.JSONObject;
import org.loklak.data.Classifier;
import org.loklak.data.DAO;
import org.loklak.geo.GeoLocation;
import org.loklak.geo.GeoMark;
import org.loklak.tools.DateParser;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
/**
* A Query is a recording of a search result based on the query.
* THIS IS NOT RECORDED TO TRACK USER ACTIONS, THIS IS USED TO RE-SEARCH A QUERY INDEFINITELY!
* Each query will be stored in elasticsearch and retrieved by the caretaker process in
* order of the retrieval_next field. That date is calculated based on the number of search results
* in the last time; the retrieval_next is estimated based on the time interval of all tweets in
* the search results of the last query.
*
* Privacy is important:
* TO ALL COMMITTERS: please do not add any user-identification details to the data structures
* to protect the privacy of the users; TO CODE EVALUATORS: please look for yourself that this
* code does not contain any user-related information (like IP, user agent etc.).
*/
public class QueryEntry extends AbstractObjectEntry implements ObjectEntry {
private final static long DAY_MILLIS = 1000L * 60L * 60L * 24L; // milliseconds of one day, used for all per-day frequency estimations
private final static int RETRIEVAL_CONSTANT = 20; // the number of messages that we get with each retrieval at maximum

protected String query; // the query in the exact way as the user typed it in
protected int query_length; // the length in the query, number of characters
public SourceType source_type; // the (external) retrieval system where that query was submitted
protected int timezoneOffset; // the timezone offset of the user
protected Date query_first; // the date when this query was submitted by the user the first time
protected Date query_last; // the date when this query was submitted by the user the last time
protected Date retrieval_last; // the last time when this query was submitted to the external system
protected Date retrieval_next; // the estimated next time when the query should be submitted to get all messages
protected Date expected_next; // the estimated next time when one single message will appear
protected int query_count; // the number of queries by the user of that query done so far
protected int retrieval_count; // the number of retrievals of that query done so far to the external system
protected long message_period; // the estimated period length between two messages, in milliseconds
protected int messages_per_day; // a message frequency based on the last query
protected long score_retrieval; // score for the retrieval order
protected long score_suggest; // score for the suggest order
/**
 * This initializer can only be used for first-time creation of a query track.
 * All counters start at zero/unknown and are then initialized by a first call to update().
 * @param query the query string in the exact way as the user typed it in
 * @param timezoneOffset the timezone offset of the user
 * @param message_period the estimated period between two messages in milliseconds; may be Long.MAX_VALUE if unknown
 * @param source_type the (external) retrieval system where that query was submitted
 * @param byUserQuery true if the query was submitted by a human user, false for automatic retrieval systems
 */
public QueryEntry(final String query, final int timezoneOffset, final long message_period, final SourceType source_type, final boolean byUserQuery) {
    this.query = query;
    this.query_length = query.length();
    this.timezoneOffset = timezoneOffset;
    this.source_type = source_type;
    this.retrieval_count = 0; // will be set to 1 with first update
    this.message_period = 0; // means: unknown
    this.messages_per_day = 0; // means: unknown
    this.score_retrieval = 0;
    this.score_suggest = 0;
    update(message_period, byUserQuery);
    this.query_first = retrieval_last; // update() has just set retrieval_last to 'now'
}
/**
 * Restore a query entry from its JSON representation (as produced by toJSON()).
 * @param json the JSON object holding the persisted query attributes
 * @throws IllegalArgumentException if an attribute value cannot be parsed
 * @throws JSONException if a required attribute is missing
 */
public QueryEntry(JSONObject json) throws IllegalArgumentException, JSONException {
    init(json);
}
/**
 * (Re-)initialize this entry from a JSON object as produced by toJSON().
 * Missing date attributes default to 'now'; a missing source_type defaults to TWITTER.
 * @param json the JSON object holding the persisted query attributes
 * @throws IllegalArgumentException if an attribute value cannot be parsed
 * @throws JSONException if a required attribute is missing
 */
public void init(JSONObject json) throws IllegalArgumentException, JSONException {
    this.query = (String) json.get("query");
    this.query_length = (int) parseLong((Number) json.get("query_length"));
    String source_type_string = (String) json.get("source_type");
    if (source_type_string == null) source_type_string = SourceType.TWITTER.toString();
    this.source_type = SourceType.byName(source_type_string);
    this.timezoneOffset = (int) parseLong((Number) json.get("timezoneOffset"));
    // one single 'now' is used both as parse fallback and as default for absent date attributes,
    // so all unparsed dates of one entry carry exactly the same timestamp
    Date now = new Date();
    this.query_first = json.has("query_first") ? parseDate(json.get("query_first"), now) : now;
    this.query_last = json.has("query_last") ? parseDate(json.get("query_last"), now) : now;
    this.retrieval_last = json.has("retrieval_last") ? parseDate(json.get("retrieval_last"), now) : now;
    this.retrieval_next = json.has("retrieval_next") ? parseDate(json.get("retrieval_next"), now) : now;
    this.expected_next = json.has("expected_next") ? parseDate(json.get("expected_next"), now) : now;
    this.query_count = (int) parseLong((Number) json.get("query_count"));
    this.retrieval_count = (int) parseLong((Number) json.get("retrieval_count"));
    this.message_period = parseLong((Number) json.get("message_period"));
    this.messages_per_day = (int) parseLong((Number) json.get("messages_per_day"));
    // the score fields are declared as long: do not truncate them with an (int) cast
    this.score_retrieval = parseLong((Number) json.get("score_retrieval"));
    this.score_suggest = parseLong((Number) json.get("score_suggest"));
}
/**
 * Update the retrieval-tracking attributes after a retrieval to the external system.
 * Based on the given message period, the next expected message time and the next
 * retrieval time are estimated.
 * @param message_period the estimated period between two messages in milliseconds;
 *        may be Long.MAX_VALUE if search requests have been empty and a period cannot be computed
 * @param byUserQuery is true, if the query was submitted by the user; false if the query was submitted by an automatic system
 */
public void update(final long message_period, final boolean byUserQuery) {
    // message_period may have the value Long.MAX_VALUE if search requests have been empty and a message period cannot be computed
    this.retrieval_last = new Date();
    this.retrieval_count++;
    if (byUserQuery) {
        this.query_count++;
        this.query_last = this.retrieval_last;
    }
    long new_message_period = message_period; // can be Long.MAX_VALUE if less than 2 messages are in timeline!
    // guard against division by zero: a non-positive period means 'unknown' and is treated like MAX_VALUE
    if (new_message_period <= 0) new_message_period = Long.MAX_VALUE;
    int new_messages_per_day = (int) (DAY_MILLIS / new_message_period); // this is an interpolation based on the last tweet list, can be 0!
    if (new_message_period == Long.MAX_VALUE || new_messages_per_day == 0) {
        this.message_period = DAY_MILLIS; // unknown frequency: assume one message per day
    } else {
        // smooth the estimation: average the new period with the previously stored one
        this.message_period = this.message_period == 0 ? new_message_period : (this.message_period + new_message_period) / 2;
    }
    this.messages_per_day = (int) (DAY_MILLIS / this.message_period);
    double ttl_factor = DAO.getConfig("retrieval.queries.ttlfactor", 0.75d);
    long pivot_period = DAO.getConfig("retrieval.queries.pivotfrequency", 10000);
    // the next single message is expected after roughly one message period
    this.expected_next = new Date(this.retrieval_last.getTime() + ((long) (ttl_factor * this.message_period)));
    long strategic_period = // if the period is far below the minimum, we apply a penalty
        (this.message_period < pivot_period ?
            pivot_period + 1000 * (long) Math.pow((pivot_period - this.message_period) / 1000, 3) :
            this.message_period);
    // never wait longer than one day until the next retrieval
    long waitingtime = Math.min(DAY_MILLIS, (long) (ttl_factor * RETRIEVAL_CONSTANT * strategic_period));
    this.retrieval_next = new Date(this.retrieval_last.getTime() + waitingtime);
}
// to check the retrieval order created by the update method, call
// http://localhost:9000/api/suggest.json?orderby=retrieval_next&order=asc
/**
 * A 'blind' update can be done if the user submits a query but there are rules which prevent that the target system is queried
 * as well. Then the query result is calculated using the already stored messages. To reflect this, only the query-related
 * attributes are changed; all retrieval_* attributes stay untouched.
 */
public void update() {
    this.query_count++;
    this.query_last = new Date();
}
/** @return the query string in the exact way as the user typed it in */
public String getQuery() {
    return this.query;
}

/** @return the number of characters of the query string */
public int getQueryLength() {
    return this.query_length;
}

/** @return the (external) retrieval system where that query was submitted */
public SourceType getSourceType() {
    return this.source_type;
}

/** @return the date when this query was submitted by the user the first time */
public Date getQueryFirst() {
    return this.query_first;
}

/** @return the date when this query was submitted by the user the last time */
public Date getQueryLast() {
    return this.query_last;
}

/** @return the last time when this query was submitted to the external system */
public Date getRetrievalLast() {
    return this.retrieval_last;
}

/** @return the estimated next time when the query should be submitted to get all messages */
public Date getRetrievalNext() {
    return this.retrieval_next;
}

/** @return the estimated next time when one single new message will appear */
public Date getExpectedNext() {
    return this.expected_next;
}

/** @return the timezone offset of the user */
public int getTimezoneOffset() {
    return this.timezoneOffset;
}

/** @return the number of queries by the user of that query done so far */
public int getQueryCount() {
    return this.query_count;
}

/** @return the number of retrievals of that query done so far to the external system */
public int getRetrievalCount() {
    return this.retrieval_count;
}

/** @return the estimated message frequency based on the last query */
public int getMessagesPerDay() {
    return this.messages_per_day;
}
/**
 * Serialize all attributes of this query track into a JSON object.
 * Date attributes are written only when they are set; they are formatted with the UTC formatter.
 * @return a JSON object holding all persisted attributes
 */
@Override
public JSONObject toJSON() {
    JSONObject json = new JSONObject();
    json.put("query", this.query);
    json.put("query_length", this.query_length);
    json.put("source_type", this.source_type.toString());
    json.put("timezoneOffset", this.timezoneOffset);
    putDate(json, "query_first", this.query_first);
    putDate(json, "query_last", this.query_last);
    putDate(json, "retrieval_last", this.retrieval_last);
    putDate(json, "retrieval_next", this.retrieval_next);
    putDate(json, "expected_next", this.expected_next);
    json.put("query_count", this.query_count);
    json.put("retrieval_count", this.retrieval_count);
    json.put("message_period", this.message_period);
    json.put("messages_per_day", this.messages_per_day);
    json.put("score_retrieval", this.score_retrieval);
    json.put("score_suggest", this.score_suggest);
    return json;
}

/** write one date attribute in UTC format; unset (null) dates are omitted */
private void putDate(JSONObject json, String key, Date date) {
    if (date != null) json.put(key, utcFormatter.print(date.getTime()));
}
// tokenizes Strings into terms; a term is either a non-quoted word or a "quoted phrase" which is kept as one token
private final static Pattern tokenizerPattern = Pattern.compile("([^\"]\\S*|\".+?\")\\s*");
/**
 * A Constraint maps a constraint name (as written in the query syntax with a '/' prefix,
 * or negated with '-/') to the elasticsearch index field which must exist (or not exist)
 * in matching messages.
 */
private static enum Constraint {
    image("images"),
    audio("audio"),
    video("videos"),
    place("place_name"),
    location("location_point"),
    link("links"),
    mention("mentions"),
    source_type("source_type"),
    hashtag("hashtags"),
    emotion("classifier_emotion"),
    profanity("classifier_profanity"),
    language("classifier_language"),
    pure(""), // special constraint without an index field: messages without any attachments or marks
    len25("text_length"), len50("text_length"), len75("text_length"), len100("text_length"); // text length thresholds
    protected String field_name; // the index field checked by this constraint (empty for 'pure')
    protected Pattern pattern; // matches this constraint (with optional '-' negation) inside a raw query string
    private Constraint(String field_name) {
        this.field_name = field_name;
        this.pattern = Pattern.compile("\\s?\\-?/" + this.name() + "\\S*");
    }
}
/**
 * Tokens is the parsed form of a raw user query: the query is split into terms and the
 * terms are sorted into constraints ('/name'), negative constraints ('-/name'),
 * key:value modifiers (like from:user or near:place) and plain search terms. Plain terms
 * are re-joined into the 'raw' query string which is handed over to scrapers/search backends.
 */
public static class Tokens {

    public final String original; // the unmodified query string as typed by the user
    public String raw; // the query with constraints removed; modifiers and plain terms remain
    public final HashSet<String> constraints_positive, constraints_negative; // constraint names without the '/' resp. '-/' prefix
    public Multimap<String, String> modifier; // key:value modifiers, key lowercased
    public PlaceContext place_context; // ABOUT if the '/about' constraint was given, otherwise FROM
    public double[] bbox; // double[]{lon_west,lat_south,lon_east,lat_north}

    /**
     * Parse the given raw query into its token classes.
     * @param q the raw query as typed by the user
     */
    public Tokens(final String q) {
        this.original = q;
        // split into terms; quoted phrases stay together as one token
        List<String> tokens = new ArrayList<String>();
        Matcher m = tokenizerPattern.matcher(q);
        while (m.find()) tokens.add(m.group(1));
        this.constraints_positive = new HashSet<>();
        this.constraints_negative = new HashSet<>();
        this.modifier = HashMultimap.create();
        StringBuilder rawb = new StringBuilder(q.length() + 1);
        Set<String> hashtags = new HashSet<>();
        // sort each token into constraint / negative constraint / modifier / plain term
        for (String t: tokens) {
            if (t.startsWith("/")) {
                constraints_positive.add(t.substring(1));
                continue;
            } else if (t.startsWith("-/")) {
                constraints_negative.add(t.substring(2));
                continue;
            } else if (t.indexOf(':') > 0) {
                // a key:value modifier; the key is stored lowercased, the token is also kept in 'raw'
                int p = t.indexOf(':');
                modifier.put(t.substring(0, p).toLowerCase(), t.substring(p + 1));
                rawb.append(t).append(' ');
                continue;
            } else {
                // a plain term; remember hashtags for the mistake-fix below
                if (t.startsWith("#")) hashtags.add(t.substring(1));
                rawb.append(t).append(' ');
            }
        }
        // the special '/about' constraint switches the place context; '-/about' forces FROM
        this.place_context = this.constraints_positive.remove("about") ? PlaceContext.ABOUT : PlaceContext.FROM;
        if (this.constraints_negative.remove("about")) this.place_context = PlaceContext.FROM;
        // drop the trailing space appended by the loop above
        if (rawb.length() > 0 && rawb.charAt(rawb.length() - 1) == ' ') rawb.setLength(rawb.length() - 1);
        this.raw = rawb.toString();
        // fix common mistake using hashtags in combination with their words without hashtag:
        // "word #word" and "#word word" are rewritten into an explicit OR of both forms
        for (String h: hashtags) {
            int p = this.raw.indexOf(h + " #" + h);
            if (p >= 0) this.raw = this.raw.substring(0, p) + h + " OR #" + h + this.raw.substring(p + h.length() * 2 + 2);
            p = this.raw.indexOf("#" + h + " " + h);
            if (p >= 0) this.raw = this.raw.substring(0, p) + "#" + h + " OR " + h + this.raw.substring(p + h.length() * 2 + 2);
        }
        // find bbox: a '/location=lon_west,lat_south,lon_east,lat_north' constraint defines it directly
        this.bbox = null;
        bboxsearch: for (String cs: this.constraints_positive) {
            if (cs.startsWith(Constraint.location.name() + "=")) {
                String params = cs.substring(Constraint.location.name().length() + 1);
                String[] coord = params.split(",");
                if (coord.length == 4) {
                    this.bbox = new double[4];
                    for (int i = 0; i < 4; i++) this.bbox[i] = Double.parseDouble(coord[i]);
                    break bboxsearch;
                }
            }
        }
        // a 'near:' modifier produces a 2x2 degree bbox around the resolved location
        if (modifier.containsKey("near")) {
            // either check coordinates or name
            String near_name = modifier.get("near").iterator().next();
            GeoMark loc = DAO.geoNames.analyse(near_name, null, 10, Long.toString(System.currentTimeMillis()));
            if (loc != null) {
                // NOTE(review): this stores {lon_west, lat_north, lon_east, lat_south} — the latitude order
                // is inverted compared to the documented bbox layout above and to the /location= constraint;
                // applyConstraint() reads it in this inverted order but translate4scraper() does not — verify
                this.bbox = new double[]{loc.lon() - 1.0, loc.lat() + 1.0, loc.lon() + 1.0, loc.lat() - 1.0};
            }
        }
    }

    /**
     * Translate the parsed query into the query syntax of the external (twitter) scraper:
     * the internal since:&lt;period&gt; shortcuts are removed and a bbox (when present) is
     * replaced with a 'near:"city" within:Xmi' clause around the largest city inside the bbox.
     * @return the query string to be submitted to the scraper
     */
    public String translate4scraper() {
        // check if a location constraint was given
        String dateclean = this.raw.replace(" since:hour", "").replace(" since:day", "").replace(" since:week", "").replace(" since:all", "").replace(" since:alltime", "").replace(" since:wholetime", "");
        if (this.bbox == null || this.original.indexOf("near:") > 0) {
            return dateclean;
        }
        // find place within the bbox
        double lon_west = this.bbox[0];
        double lat_south = this.bbox[1];
        double lon_east = this.bbox[2];
        double lat_north = this.bbox[3];
        assert lon_west < lon_east;
        assert lat_north > lat_south;
        // find largest city around to compute a 'near:' operator for twitter
        // 40000 km earth circumference over 360 degrees approximates degrees -> kilometers
        double lon_km = 40000 / 360 * (lon_east - lon_west);
        double lat_km = 40000 / 360 * (lat_north- lat_south);
        double within_km = Math.max(2.0, Math.max(lon_km, lat_km) / 2);
        // search for the city in the inner third of the bbox first, then widen
        double lon_border = (lon_east - lon_west) / 3;
        double lat_border = (lat_north - lat_south) / 3;
        GeoLocation largestCity = DAO.geoNames.getLargestCity(lon_west + lon_border, lat_south + lat_border, lon_east - lon_border, lat_north - lat_border);
        if (largestCity == null) largestCity = DAO.geoNames.getLargestCity(lon_west, lat_south, lon_east, lat_north);
        if (largestCity == null) largestCity = DAO.geoNames.cityNear((lat_north + lat_south) / 2.0, (lon_east + lon_west) / 2.0);
        String q = dateclean + " near:\"" + largestCity.getNames().iterator().next() + "\" within:" + ((int) (within_km / 1.609344)) + "mi"; // stupid imperial units are stupid
        return q;
    }
}
/**
 * Apply the constraints, modifiers and (optionally) the location bounding box of a parsed
 * query to an already-retrieved timeline: only messages which satisfy all positive
 * constraints and violate no negative constraint are copied into the result.
 * @param tl0 the unfiltered timeline
 * @param tokens the parsed query holding constraints, modifiers and bbox
 * @param applyLocationConstraint true if the bbox (when present) shall be checked as well
 * @return a new timeline with the same order containing only the matching messages
 */
public static Timeline applyConstraint(Timeline tl0, Tokens tokens, boolean applyLocationConstraint) {
    // fast path: no constraints and no modifiers, nothing to filter
    if (tokens.constraints_positive.size() == 0 && tokens.constraints_negative.size() == 0 && tokens.modifier.size() == 0) return tl0;
    Timeline tl1 = new Timeline(tl0.getOrder());
    messageloop: for (MessageEntry message: tl0) {
        // check modifier: from:/-from: restrict by author screen name
        if (tokens.modifier.containsKey("from")) {
            for (String screen_name: tokens.modifier.get("from")) {
                if (!message.getScreenName().equals(screen_name)) continue messageloop;
            }
        }
        if (tokens.modifier.containsKey("-from")) {
            for (String screen_name: tokens.modifier.get("-from")) {
                if (message.getScreenName().equals(screen_name)) continue messageloop;
            }
        }
        if (applyLocationConstraint && tokens.bbox != null) {
            if (message.location_point == null || message.location_point.length < 2) continue messageloop; //longitude, latitude
            // NOTE(review): the latitude comparison treats bbox[1] as the northern and bbox[3] as the
            // southern bound; this matches the inverted bbox produced by the near: modifier in Tokens,
            // but contradicts the documented layout {lon_west,lat_south,lon_east,lat_north} used by
            // the /location= constraint — verify which convention is intended
            if (message.location_point[0] < tokens.bbox[0] || message.location_point[0] > tokens.bbox[2] || // double[]{lon_west,lat_south,lon_east,lat_north}
                message.location_point[1] > tokens.bbox[1] || message.location_point[1] < tokens.bbox[3]) continue messageloop;
        }
        // check constraints
        // '/pure': messages without any attachments, mentions, links or hashtags
        if (tokens.constraints_positive.contains("pure") && (
            message.getImages().size() != 0 ||
            message.getMentions().length != 0 ||
            message.getLinks().length != 0 ||
            message.getHashtags().length != 0
            )) continue;
        // '/lenN': keep only messages with a text length of at most N characters
        if (tokens.constraints_positive.contains("len25") && message.getTextLength() > 25) continue;
        if (tokens.constraints_positive.contains("len50") && message.getTextLength() > 50) continue;
        if (tokens.constraints_positive.contains("len75") && message.getTextLength() > 75) continue;
        if (tokens.constraints_positive.contains("len100") && message.getTextLength() > 100) continue;
        // field existence constraints, positive and negative
        if (tokens.constraints_positive.contains("image") && message.getImages().size() == 0) continue;
        if (tokens.constraints_negative.contains("image") && message.getImages().size() != 0) continue;
        if (tokens.constraints_positive.contains("place") && message.getPlaceName().length() == 0) continue;
        if (tokens.constraints_negative.contains("place") && message.getPlaceName().length() != 0) continue;
        if (tokens.constraints_positive.contains("location") && (message.getLocationPoint() == null || message.getPlaceContext() != tokens.place_context)) continue;
        if (tokens.constraints_negative.contains("location") && message.getLocationPoint() != null) continue;
        if (tokens.constraints_positive.contains("link") && message.getLinks().length == 0) continue;
        if (tokens.constraints_negative.contains("link") && message.getLinks().length != 0) continue;
        if (tokens.constraints_positive.contains("mention") && message.getMentions().length == 0) continue;
        if (tokens.constraints_negative.contains("mention") && message.getMentions().length != 0) continue;
        if (tokens.constraints_positive.contains("hashtag") && message.getHashtags().length == 0) continue;
        if (tokens.constraints_negative.contains("hashtag") && message.getHashtags().length != 0) continue;
        // classifier constraints: require (or forbid) a classification for each named context
        for (Classifier.Context context: Classifier.Context.values()) {
            if (tokens.constraints_positive.contains(context.name()) && message.getClassifier(context) == null) continue messageloop;
            if (tokens.constraints_negative.contains(context.name()) && message.getClassifier(context) != null) continue messageloop;
        }
        // special treatment of location and link constraint
        constraintCheck: for (String cs: tokens.constraints_positive) {
            // '/location=lon_west,lat_south,lon_east,lat_north': the message coordinate must be inside the box
            if (cs.startsWith(Constraint.location.name() + "=")) {
                if (message.getLocationPoint() == null) continue messageloop;
                if (message.getPlaceContext() != tokens.place_context) continue messageloop;
                String params = cs.substring(Constraint.location.name().length() + 1);
                String[] coord = params.split(",");
                if (coord.length == 4) {
                    double lon = message.getLocationPoint()[0];
                    double lon_west = Double.parseDouble(coord[0]);
                    double lon_east = Double.parseDouble(coord[2]);
                    assert lon_west < lon_east;
                    if (lon < lon_west || lon > lon_east) continue messageloop;
                    double lat = message.getLocationPoint()[1];
                    double lat_south = Double.parseDouble(coord[1]);
                    double lat_north = Double.parseDouble(coord[3]);
                    assert lat_north > lat_south;
                    if (lat < lat_south || lat > lat_north) continue messageloop;
                }
            }
            // '/link=regex': at least one link of the message must match the given regex
            if (cs.startsWith(Constraint.link.name() + "=")) {
                if (message.getLinks().length == 0) continue messageloop;
                Pattern regex = Pattern.compile(cs.substring(Constraint.link.name().length() + 1));
                for (String link: message.getLinks()) {
                    if (regex.matcher(link).matches()) continue constraintCheck;
                }
                // no match
                continue messageloop;
            }
        }
        tl1.add(message, tl0.getUser(message.getScreenName()));
    }
    return tl1;
}
// matches one group of terms connected with ' OR ', i.e. "a OR b OR c", inside a larger query
private final static Pattern term4ORPattern = Pattern.compile("(?:^| )(\\S*(?: OR \\S*)+)(?: |$)"); // Pattern.compile("(^\\s*(?: OR ^\\s*+)+)");

/**
 * Split a query into its OR-connected term groups. " AND " junctors are removed first
 * because AND is the default term combination.
 * @param q the raw query
 * @return a list where the first element (if any) holds all plain AND terms and every
 *         further element is one complete OR group like "a OR b OR c"
 */
private static List<String> splitIntoORGroups(String q) {
    // detect usage of OR junctor usage. Right now we cannot have mixed AND and OR usage. Thats a hack right now
    q = q.replaceAll(" AND ", " "); // AND is default
    // tokenize the query
    ArrayList<String> list = new ArrayList<>();
    Matcher m = term4ORPattern.matcher(q);
    while (m.find()) {
        String d = m.group(1);
        // cut the OR group out of the query and collapse the double space left behind by the removal
        q = q.replace(d, "").replace("  ", " ");
        list.add(d);
        m = term4ORPattern.matcher(q); // the string changed, the matcher must be re-created
    }
    q = q.trim();
    if (q.length() > 0) list.add(0, q); // the remaining plain AND terms come first
    return list;
}
/**
 * Ad-hoc manual test entry point for splitIntoORGroups; the result is discarded,
 * so this is only useful when run under a debugger or extended with output.
 */
public static void main(String[] args) {
    splitIntoORGroups("Alpha OR Beta AND Gamma /constraint sand OR kies OR wasser skilanglauf");
}
/**
 * Fixing a query mistake covers the most common wrong queries from the user:
 * the 'hashtag:' prefix is rewritten into the '#' syntax and the redundant
 * " AND " junctor is removed because AND is the default term combination.
 * @param q the raw query as typed by the user
 * @return the fixed query
 */
public static String fixQueryMistakes(String q) {
    String fixed = q.replace("hashtag:", "#"); // 'hashtag:word' means the same as '#word'
    fixed = fixed.replace(" AND ", " "); // AND is default and must not be written out
    return fixed;
}
public static class ElasticsearchQuery {
public QueryBuilder queryBuilder; // the elasticsearch query derived from the parsed query string
public Date since; // lower bound of the created_at time range, set from the since: modifier during parsing
public Date until; // upper bound of the created_at time range, set from the until: modifier during parsing

/**
 * Parse a raw search query into an elasticsearch QueryBuilder; as a side effect
 * the since/until time range bounds are extracted from the query modifiers.
 * @param q the raw search query
 * @param timezoneOffset the timezone offset of the user, used to parse since:/until: dates
 */
public ElasticsearchQuery(String q, int timezoneOffset) {
    // default values for since and until: the whole representable time range
    this.since = new Date(0);
    this.until = new Date(Long.MAX_VALUE);
    // parse the query
    this.queryBuilder = preparse(q, timezoneOffset);
}
/**
 * First parse stage: fix common query mistakes, then split the query into its
 * OR groups; each group is parsed separately and the partial queries are
 * combined with filter (AND) semantics.
 * @param q the raw query
 * @param timezoneOffset the timezone offset of the user, passed through to parse()
 * @return the combined elasticsearch query; a match-all query for an empty input
 */
private QueryBuilder preparse(String q, int timezoneOffset) {
    // detect usage of OR connector usage.
    q = QueryEntry.fixQueryMistakes(q);
    List<String> terms = splitIntoORGroups(q); // OR binds stronger than AND
    if (terms.size() == 0) return QueryBuilders.constantScoreQuery(QueryBuilders.matchAllQuery());
    // special handling: a single group needs no boolean wrapper
    if (terms.size() == 1) return parse(terms.get(0), timezoneOffset);
    // generic handling: combine all OR groups with AND (filter) semantics
    BoolQueryBuilder aquery = QueryBuilders.boolQuery();
    for (String t: terms) {
        QueryBuilder partial = parse(t, timezoneOffset);
        aquery.filter(partial);
    }
    return aquery;
}
private QueryBuilder parse(String q, int timezoneOffset) {
// detect usage of OR ORconnective usage. Because of the preparse step we will have only OR or only AND here.
q = q.replaceAll(" AND ", " "); // AND is default
boolean ORconnective = q.indexOf(" OR ") >= 0;
q = q.replaceAll(" OR ", " "); // if we know that all terms are OR, we remove that and apply it later
// tokenize the query
Set<String> qe = new LinkedHashSet<String>();
Matcher m = tokenizerPattern.matcher(q);
while (m.find()) qe.add(m.group(1));
// twitter search syntax:
// term1 term2 term3 - all three terms shall appear
// "term1 term2 term3" - exact match of all terms
// term1 OR term2 OR term3 - any of the three terms shall appear
// from:user - tweets posted from that user
// to:user - tweets posted to that user
// @user - tweets which mention that user
// near:"location" within:xmi - tweets that are near that location
// #hashtag - tweets containing the given hashtag
// since:2015-04-01 until:2015-04-03 - tweets within given time range
// additional constraints:
// /image /audio /video /place - restrict to tweets which have attached images, audio, video or place
ArrayList<String> text_positive_match = new ArrayList<>();
ArrayList<String> text_negative_match = new ArrayList<>();
ArrayList<String> text_positive_filter = new ArrayList<>();
ArrayList<String> text_negative_filter = new ArrayList<>();
ArrayList<String> users_positive = new ArrayList<>();
ArrayList<String> users_negative = new ArrayList<>();
ArrayList<String> hashtags_positive = new ArrayList<>();
ArrayList<String> hashtags_negative = new ArrayList<>();
Multimap<String, String> modifier = HashMultimap.create();
Set<String> constraints_positive = new HashSet<>();
Set<String> constraints_negative = new HashSet<>();
for (String t: qe) {
if (t.length() == 0) continue;
if (t.startsWith("@")) {
users_positive.add(t.substring(1));
continue;
} else if (t.startsWith("-@")) {
users_negative.add(t.substring(2));
continue;
} else if (t.startsWith("#")) {
hashtags_positive.add(t.substring(1));
continue;
} else if (t.startsWith("-#")) {
hashtags_negative.add(t.substring(2));
continue;
} else if (t.startsWith("/")) {
constraints_positive.add(t.substring(1));
continue;
} else if (t.startsWith("-/")) {
constraints_negative.add(t.substring(2));
continue;
} else if (t.indexOf(':') > 0) {
int p = t.indexOf(':');
modifier.put(t.substring(0, p).toLowerCase(), t.substring(p + 1));
continue;
} else {
// patch characters that will confuse elasticsearch or have a different meaning
boolean negative = t.startsWith("-");
if (negative) t = t.substring(1);
if (t.length() == 0) continue;
if ((t.charAt(0) == '"' && t.charAt(t.length() - 1) == '"') || (t.charAt(0) == '\'' && t.charAt(t.length() - 1) == '\'')) {
t = t.substring(1, t.length() - 1);
if (negative) text_negative_filter.add(t); else text_positive_filter.add(t);
} else if (t.indexOf('-') > 0) {
// this must be handled like a quoted string without the minus
t = t.replaceAll("-", " ");
if (negative) text_negative_filter.add(t); else text_positive_filter.add(t);
} else {
if (negative) text_negative_match.add(t); else text_positive_match.add(t);
}
continue;
}
}
if (modifier.containsKey("to")) users_positive.addAll(modifier.get("to"));
if (modifier.containsKey("-to")) users_negative.addAll(modifier.get("-to"));
// special constraints
boolean constraint_about = constraints_positive.remove("about");
if (constraints_negative.remove("about")) constraint_about = false;
// compose query for text
List<QueryBuilder> ops = new ArrayList<>();
List<QueryBuilder> nops = new ArrayList<>();
List<QueryBuilder> filters = new ArrayList<>();
if (text_positive_match.size() == 1) {
BoolQueryBuilder disjunction = QueryBuilders.boolQuery();
disjunction.should(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", text_positive_match.get(0))));
disjunction.should(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("text", text_positive_match.get(0))));
disjunction.minimumNumberShouldMatch(1);
ops.add(disjunction);
} else {
for (String text: text_positive_match) {
ops.add(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("text", text)));
}
}
if (text_negative_match.size() == 1) {
BoolQueryBuilder disjunction = QueryBuilders.boolQuery();
disjunction.should(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", text_negative_match.get(0))));
disjunction.should(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("text", text_negative_match.get(0))));
disjunction.minimumNumberShouldMatch(1);
ops.add(disjunction);
} else {
for (String text: text_negative_match) {
// negation of terms in disjunctions would cause to retrieve almost all documents
// this cannot be the requirement of the user. It may be valid in conjunctions, but not in disjunctions
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("text", text)));
}
}
// apply modifiers
if (modifier.containsKey("id")) {
ops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("id_str", modifier.get("id"))));
}
if (modifier.containsKey("-id")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("id_str", modifier.get("-id"))));
}
for (String user: users_positive) {
ops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("mentions", user)));
}
for (String user: users_negative) nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("mentions", user)));
for (String hashtag: hashtags_positive) {
ops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("hashtags", hashtag.toLowerCase())));
}
for (String hashtag: hashtags_negative) nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("hashtags", hashtag.toLowerCase())));
if (constraints_positive.remove("pure")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.image.field_name)));
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.audio.field_name)));
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.video.field_name)));
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.link.field_name)));
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.mention.field_name)));
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.existsQuery(Constraint.hashtag.field_name)));
};
if (constraints_positive.remove("len25")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders
.rangeQuery(Constraint.len25.field_name).from(26).to(1000).includeLower(true).includeUpper(false)));
};
if (constraints_positive.remove("len50")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders
.rangeQuery(Constraint.len50.field_name).from(51).to(1000).includeLower(true).includeUpper(false)));
};
if (constraints_positive.remove("len75")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders
.rangeQuery(Constraint.len75.field_name).from(76).to(1000).includeLower(true).includeUpper(false)));
};
if (constraints_positive.remove("len100")) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders
.rangeQuery(Constraint.len100.field_name).from(101).to(1000).includeLower(true).includeUpper(false)));
};
if (modifier.containsKey("from")) {
for (String screen_name: modifier.get("from")) {
if (screen_name.indexOf(',') < 0) {
ops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", screen_name)));
} else {
String[] screen_names = screen_name.split(",");
BoolQueryBuilder disjunction = QueryBuilders.boolQuery();
for (String name: screen_names) disjunction.should(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", name)));
disjunction.minimumNumberShouldMatch(1);
ops.add(disjunction);
}
}
}
if (modifier.containsKey("-from")) {
for (String screen_name: modifier.get("-from")) {
if (screen_name.indexOf(',') < 0) {
nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", screen_name)));
} else {
String[] screen_names = screen_name.split(",");
for (String name: screen_names) nops.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("screen_name", name)));
}
}
}
if (modifier.containsKey("near")) {
// either check coordinates or name
String near_name = modifier.get("near").iterator().next();
GeoMark loc = DAO.geoNames.analyse(near_name, null, 10, Long.toString(System.currentTimeMillis()));
if (loc == null) {
BoolQueryBuilder nearquery = QueryBuilders.boolQuery()
.should(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("place_name", near_name)))
.should(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("text", near_name)));
nearquery.minimumNumberShouldMatch(1);
ops.add(QueryBuilders.boolQuery().filter(nearquery).filter(QueryBuilders.constantScoreQuery(QueryBuilders.matchQuery("place_context", PlaceContext.FROM.name()))));
} else {
filters.add(QueryBuilders.geoDistanceQuery("location_point").distance(100.0, DistanceUnit.KILOMETERS).lat(loc.lat()).lon(loc.lon()));
}
}
if (modifier.containsKey("since")) try {
Calendar since = DateParser.parse(modifier.get("since").iterator().next(), timezoneOffset);
this.since = since.getTime();
RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery(AbstractObjectEntry.CREATED_AT_FIELDNAME).from(this.since);
if (modifier.containsKey("until")) {
Calendar until = DateParser.parse(modifier.get("until").iterator().next(), timezoneOffset);
if (until.get(Calendar.HOUR) == 0 && until.get(Calendar.MINUTE) == 0) {
// until must be the day which is included in results.
// To get the result within the same day, we must add one day.
until.add(Calendar.DATE, 1);
}
this.until = until.getTime();
rangeQuery.to(this.until);
} else {
this.until = new Date(Long.MAX_VALUE);
}
ops.add(rangeQuery);
} catch (ParseException e) {} else if (modifier.containsKey("until")) try {
Calendar until = DateParser.parse(modifier.get("until").iterator().next(), timezoneOffset);
if (until.get(Calendar.HOUR) == 0 && until.get(Calendar.MINUTE) == 0) {
// until must be the day which is included in results.
// To get the result within the same day, we must add one day.
until.add(Calendar.DATE, 1);
}
this.until = until.getTime();
RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery(AbstractObjectEntry.CREATED_AT_FIELDNAME).to(this.until);
ops.add(rangeQuery);
} catch (ParseException e) {}
// apply the ops and nops
QueryBuilder bquery = QueryBuilders.boolQuery();
if (ops.size() == 1 && nops.size() == 0)
bquery = ops.iterator().next();
else if (ops.size() == 0 && nops.size() == 1)
bquery = QueryBuilders.boolQuery().mustNot(ops.iterator().next());
else {
for (QueryBuilder qb: ops) {
if (ORconnective) ((BoolQueryBuilder) bquery).should(qb); else ((BoolQueryBuilder) bquery).filter(qb);
}
if (ORconnective) ((BoolQueryBuilder) bquery).minimumNumberShouldMatch(1);
for (QueryBuilder nqb: nops) {
((BoolQueryBuilder) bquery).mustNot(nqb);
}
}
// apply constraints as filters
for (String text: text_positive_filter) {
filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("text", text)));
}
for (String text: text_negative_filter) filters.add(QueryBuilders.boolQuery().mustNot(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("text", text))));
for (Constraint c: Constraint.values()) {
if (c.field_name != null && c.field_name.length() > 0) {
if (constraints_positive.contains(c.name())) {
filters.add(QueryBuilders.existsQuery(c.field_name));
}
if (constraints_negative.contains(c.name())) {
filters.add(QueryBuilders.boolQuery().mustNot(QueryBuilders.existsQuery(c.field_name)));
}
}
}
if (constraints_positive.contains("location")) {
filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("place_context", (constraint_about ? PlaceContext.ABOUT : PlaceContext.FROM).name())));
}
// special treatment of location constraints of the form /location=lon-west,lat-south,lon-east,lat-north i.e. /location=8.58,50.178,8.59,50.181
// source_type constraint of the form /source_type=FOSSASIA_API -> search exact term (source_type must exists in SourceType enum)
for (String cs: constraints_positive) {
if (cs.startsWith(Constraint.location.name() + "=")) {
String params = cs.substring(Constraint.location.name().length() + 1);
String[] coord = params.split(",");
if (coord.length == 1) {
filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("location_source", coord[0])));
} else if (coord.length == 2) {
double lon = Double.parseDouble(coord[0]);
double lat = Double.parseDouble(coord[1]);
// ugly way to search exact geo_point : using geoboundingboxfilter, with two identical bounding points
// geoshape filter can search for exact point shape but it can't be performed on geo_point field
filters.add(QueryBuilders.geoBoundingBoxQuery("location_point")
.topLeft(lat, lon)
.bottomRight(lat, lon));
}
else if (coord.length == 4 || coord.length == 5) {
double lon_west = Double.parseDouble(coord[0]);
double lat_south = Double.parseDouble(coord[1]);
double lon_east = Double.parseDouble(coord[2]);
double lat_north = Double.parseDouble(coord[3]);
PlaceContext context = constraint_about ? PlaceContext.ABOUT : PlaceContext.FROM;
filters.add(QueryBuilders.existsQuery(Constraint.location.field_name));
filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("place_context", context.name())));
filters.add(QueryBuilders.geoBoundingBoxQuery("location_point")
.topLeft(lat_north, lon_west)
.bottomRight(lat_south, lon_east));
if (coord.length == 5) filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("location_source", coord[4])));
}
} else if (cs.startsWith(Constraint.link.name() + "=")) {
String regexp = cs.substring(Constraint.link.name().length() + 1);
filters.add(QueryBuilders.existsQuery(Constraint.link.field_name));
filters.add(QueryBuilders.regexpQuery(Constraint.link.field_name, regexp));
} else if (cs.startsWith(Constraint.source_type.name() + "=")) {
String regexp = cs.substring(Constraint.source_type.name().length() + 1);
if (SourceType.isValid(regexp)) {
filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("_type", regexp)));
}
}
}
for (String cs : constraints_negative) {
if (cs.startsWith(Constraint.source_type.name() + "=")) {
String regexp = cs.substring(Constraint.source_type.name().length() + 1);
if (SourceType.isValid(regexp)) {
filters.add(QueryBuilders.boolQuery().mustNot(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("_type", regexp))));
}
}
}
BoolQueryBuilder queryFilter = QueryBuilders.boolQuery();
for(QueryBuilder filter : filters){
queryFilter.filter(filter);
}
QueryBuilder cquery = filters.size() == 0 ? bquery : QueryBuilders.boolQuery().filter(bquery).filter(queryFilter);
return cquery;
}
}
/**
 * Describes how a place reference relates to a message: the message was
 * either authored at the place, or it merely mentions/discusses the place.
 * Used as the value of the "place_context" field in search filters.
 */
public enum PlaceContext { // nested enums are implicitly static; redundant modifier removed
    FROM,  // the message was made at that place
    ABOUT; // the message is about that place
}
}