package me.osm.gazetteer.web.api.query;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import me.osm.gazetteer.web.GazetteerWeb;
import me.osm.gazetteer.web.imp.IndexHolder;
import me.osm.gazetteer.web.imp.Replacer;
import me.osm.gazetteer.web.utils.ReplacersCompiler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link QueryAnalyzer} implementation that normalizes a raw search string and
 * splits it into {@link QToken}s wrapped in a {@link Query}.
 *
 * <p>Separators, removed characters, character replaces, optional words and
 * search replacers are all loaded by static initializers, i.e. once, when the
 * class is initialized.
 */
public class QueryAnalyzerImpl implements QueryAnalyzer {

    private static final Logger log = LoggerFactory.getLogger(QueryAnalyzerImpl.class);

    /** Classpath resource with optional words and {@code ~}-prefixed optional patterns. */
    private static final String OPTIONALS_RESOURCE = "/optional";

    /** Prefix of the placeholder that temporarily stands in for a replacer group key inside the query. */
    private static final String GROUP_ALIAS_PREFIX = "GROUP";

    /** Separator characters handed to {@code StringUtils.split} when tokenizing. */
    private static final String tokenSeparators = GazetteerWeb.config().getQueryAnalyzerSeparators();

    /** Characters stripped from the query before any other processing. */
    private static final String removeChars = GazetteerWeb.config().getRemoveCharacters();

    /** Matches a group placeholder: the alias prefix followed by one or more digits. */
    private static final Pattern groupPattern = Pattern.compile(GROUP_ALIAS_PREFIX + "[0-9]+");

    /**
     * Replacement pairs: element 0 is the text to search for, element 1 its replacement.
     * See: gazetteer_schema.json settings.analysis.char_filter.*.mappings
     */
    private static final List<String[]> charReplaces = IndexHolder.getCharFilterReplaces();

    /** Lower-cased plain entries of the {@code /optional} resource; a token equal to one of them is optional. */
    public static final Set<String> optionals = new HashSet<String>();

    /**
     * Alternation of all {@code ~}-prefixed entries of the {@code /optional} resource,
     * or {@code null} when the resource declares no such entry.
     */
    public static Pattern optRegexp = null;

    static {
        readOptionals();
    }

    /** Replacers applied to every query to discover replaceable groups and their variants. */
    public static final List<Replacer> searchReplacers = new ArrayList<>();

    static {
        ReplacersCompiler.compile(searchReplacers, new File("config/replacers/search/requiredSearchReplacers"));
    }

    /**
     * Loads the {@code /optional} classpath resource into {@link #optionals} and {@link #optRegexp}.
     *
     * <p>Empty lines and lines starting with {@code #} are skipped. A line starting with
     * {@code ~} contributes the text after the first {@code ~} as a regular expression;
     * any other line is stored lower-cased as a literal optional word.
     *
     * @throws IllegalStateException if the resource is not on the classpath
     * @throws RuntimeException if the resource cannot be read
     */
    private static void readOptionals() {
        Set<String> patterns = new HashSet<>();

        // try-with-resources: the stream used to be left open after reading.
        try (InputStream in = QueryAnalyzerImpl.class.getResourceAsStream(OPTIONALS_RESOURCE)) {
            if (in == null) {
                // getResourceAsStream signals a missing resource with null, not an exception.
                throw new IllegalStateException(
                        "Classpath resource " + OPTIONALS_RESOURCE + " not found");
            }

            // NOTE(review): readLines(InputStream) decodes with the platform default charset.
            @SuppressWarnings("unchecked")
            List<String> lines = (List<String>) IOUtils.readLines(in);

            for (String option : lines) {
                if (StringUtils.isEmpty(option) || StringUtils.startsWith(option, "#")) {
                    continue;
                }
                if (StringUtils.startsWith(option, "~")) {
                    patterns.add(StringUtils.substringAfter(option, "~"));
                } else {
                    optionals.add(StringUtils.lowerCase(option));
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to read classpath resource " + OPTIONALS_RESOURCE, e);
        }

        if (!patterns.isEmpty()) {
            // Each pattern gets its own group so a top-level '|' inside one cannot leak into the others.
            List<String> wrapped = new ArrayList<>(patterns.size());
            for (String pattern : patterns) {
                wrapped.add("(" + pattern + ")");
            }
            optRegexp = Pattern.compile(StringUtils.join(wrapped, "|"));
        }
    }

    /**
     * Parses a raw query string into a {@link Query}.
     *
     * <p>Steps: normalize the string, collect replacer groups, substitute every group key
     * with a {@code GROUP<n>} placeholder so that it survives tokenization as one piece,
     * split into tokens, then restore the group key (plus its variants) for placeholder tokens.
     *
     * @param q raw query, may be {@code null}
     * @return the parsed query, or {@code null} when {@code q} is {@code null}
     * @see QueryAnalyzer#getQuery(String)
     */
    @Override
    public Query getQuery(String q) {
        if (q == null) {
            return null;
        }

        String original = q;
        q = normalize(q);

        LinkedHashMap<String, Collection<String>> groups = new LinkedHashMap<>();
        for (Replacer replacer : searchReplacers) {
            groups.putAll(replacer.replaceGroups(q));
        }

        // alias -> group key; aliases are numbered in the iteration order of groups.
        Map<String, String> groupAliases = new HashMap<>();
        int aliasIndex = 0;
        for (Entry<String, Collection<String>> group : groups.entrySet()) {
            String alias = GROUP_ALIAS_PREFIX + aliasIndex++;
            groupAliases.put(alias, group.getKey());
            q = StringUtils.replace(q, group.getKey(), alias);
        }

        Set<String> matchedOptTokens = findOptionalPatternTokens(q);

        String[] tokens = StringUtils.split(q, tokenSeparators);
        List<QToken> result = new ArrayList<QToken>(tokens.length);

        for (String token : tokens) {
            List<String> variants = new ArrayList<>();

            if (StringUtils.startsWith(token, GROUP_ALIAS_PREFIX)) {
                String alias = findGroupAlias(token, groupAliases);
                if (alias != null) {
                    String groupKey = groupAliases.get(alias);
                    // Whatever was glued to the group in the query is re-attached to the key and to every variant.
                    String tail = StringUtils.remove(token, alias);
                    token = groupKey + tail;
                    for (String variant : groups.get(groupKey)) {
                        variants.add(variant + tail);
                    }
                }
            }

            String withoutNumbers = StringUtils.replaceChars(token, "0123456789", "");
            boolean hasNumbers = withoutNumbers.length() != token.length();
            boolean numbersOnly = StringUtils.isBlank(withoutNumbers);

            // Optional: listed as an optional word, or a digit-free token shorter than 3 chars,
            // or part of a match of the optional regexp.
            boolean optional = optionals.contains(StringUtils.lowerCase(token))
                    || (!hasNumbers && withoutNumbers.length() < 3)
                    || matchedOptTokens.contains(token);

            result.add(new QToken(token, variants, hasNumbers, numbersOnly, optional));
        }

        Query query = new Query(result, original, varyOriginal(original));
        if (log.isTraceEnabled()) {
            // Guarded so that print() is not evaluated when trace logging is off.
            log.trace("Query: {}", query.print());
        }
        return query;
    }

    /**
     * Strips the configured characters, lower-cases the string and applies the
     * character replaces, in that order.
     *
     * @param q non-null raw query
     * @return the normalized query
     */
    private static String normalize(String q) {
        String normalized = StringUtils.replaceChars(q, removeChars, null);
        normalized = normalized.toLowerCase();
        // See: gazetteer_schema.json settings.analysis.char_filter.*.mappings
        for (String[] replace : charReplaces) {
            normalized = StringUtils.replace(normalized, replace[0], replace[1]);
        }
        return normalized;
    }

    /**
     * Collects the tokens of every match of {@link #optRegexp} in the given string.
     *
     * @param q query with group placeholders already substituted
     * @return tokens of all matches; empty when {@link #optRegexp} is {@code null} or nothing matches
     */
    private static Set<String> findOptionalPatternTokens(String q) {
        Set<String> matched = new HashSet<>();
        if (optRegexp == null) {
            return matched;
        }
        Matcher matcher = optRegexp.matcher(q);
        while (matcher.find()) {
            for (String token : StringUtils.split(matcher.group(0), tokenSeparators)) {
                matched.add(token);
            }
        }
        return matched;
    }

    /**
     * Resolves the group placeholder contained in a token to a registered alias.
     *
     * <p>The placeholder pattern is greedy, so digits that directly follow a placeholder in
     * the query (alias {@code GROUP0} followed by {@code 12} reads as {@code GROUP012}) are
     * swallowed into the match. When the full match is not a registered alias, trailing
     * digits are dropped one at a time until a registered alias is found; the dropped digits
     * then become part of the token's tail instead of the placeholder leaking into the result.
     * The longest registered alias wins, so a match that is itself a registered alias is
     * resolved exactly as before.
     *
     * @param token token that may contain a group placeholder
     * @param groupAliases alias to group key mapping
     * @return the registered alias, or {@code null} when none applies
     */
    private static String findGroupAlias(String token, Map<String, String> groupAliases) {
        Matcher matcher = groupPattern.matcher(token);
        if (!matcher.find()) {
            return null;
        }
        String candidate = matcher.group();
        while (candidate.length() > GROUP_ALIAS_PREFIX.length()) {
            if (groupAliases.get(candidate) != null) {
                return candidate;
            }
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        return null;
    }

    /**
     * Builds spelling variants of the untouched query string, in this order: the original,
     * the original with character replaces applied, that string without {@code .} and
     * {@code ,}, and its capitalized, upper-cased and lower-cased forms. Duplicates are kept.
     *
     * @param original raw query exactly as received
     * @return the variants in the order described above
     */
    private Collection<String> varyOriginal(String original) {
        Collection<String> result = new ArrayList<>();
        result.add(original);

        String replaced = original;
        for (String[] replace : charReplaces) {
            replaced = StringUtils.replace(replaced, replace[0], replace[1]);
        }
        result.add(replaced);

        replaced = StringUtils.replaceChars(replaced, ".,", "");
        result.add(replaced);
        result.add(StringUtils.capitalize(replaced));
        result.add(StringUtils.upperCase(replaced));
        result.add(StringUtils.lowerCase(replaced));

        return result;
    }
}