package org.apache.lucene.queryparser.flexible.aqp.processors;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys;
import org.apache.lucene.queryparser.flexible.standard.nodes.PrefixWildcardQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.solr.analysis.author.AuthorNormalizeFilter;
import org.apache.solr.analysis.author.AuthorUtils;
import org.apache.solr.analysis.author.PythonicAuthorNormalizerFilter;
/**
 * Inspects the query nodes and, for nodes that are author searches,
 * adds ADS-specific post-analysis expansions to them (synonyms,
 * prefix wildcards and regular expressions).
 *
 * @see AqpFieldMapperProcessor
 * @see QueryConfigHandler
 *
 */
public class AqpAdsabsExpandAuthorSearchProcessor extends QueryNodeProcessorImpl {
private Map<String, int[]> fields;
/**
 * Creates the processor. No state is set up here; the author field
 * configuration is read later, in {@link #process(QueryNode)}.
 */
public AqpAdsabsExpandAuthorSearchProcessor() {
  super();
}
/**
 * Processes the tree only when the configuration defines
 * {@code AUTHOR_FIELDS}; otherwise the tree is handed back untouched.
 *
 * @param queryTree root of the query tree
 * @return the result of the superclass processing, or {@code queryTree}
 *         itself when no author fields are configured
 * @throws QueryNodeException propagated from the superclass processing
 */
@Override
public QueryNode process(QueryNode queryTree) throws QueryNodeException {
  QueryConfigHandler config = getQueryConfigHandler();
  if (!config.has(AqpAdsabsQueryConfigHandler.ConfigurationKeys.AUTHOR_FIELDS)) {
    return queryTree;
  }
  // remembered for doExpansion(), which only touches nodes of these fields
  fields = config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.AUTHOR_FIELDS);
  return super.process(queryTree);
}
/**
 * No work is done on the way down the tree.
 *
 * @param node the node being visited
 * @return the same node, unchanged
 */
@Override
protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException {
  return node;
}
/**
 * Expands a node that carries the {@code ORIGINAL_VALUE} tag. The tagged
 * value is normalized and the node is expanded once per normalized form;
 * each round works on the result of the previous one.
 *
 * @param node the node being visited
 * @return the (possibly replaced) node; the input node when it has no tag
 * @throws QueryNodeException if normalization or expansion fails
 */
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
  Object tag = node.getTag(AqpAdsabsAnalyzerProcessor.ORIGINAL_VALUE);
  if (tag == null) {
    return node;
  }

  String original = (String) tag;
  QueryNode result = node;
  for (String normalized : normalizeAuthorName(original)) {
    // the one-element array is a mutable counter shared by the whole
    // expandNodes()/doExpansion() recursion of this round
    result = expandNodes(result, new NameInfo(normalized), new int[] {0});
  }
  return result;
}
/**
 * Leaves the order of the children as it is.
 *
 * @param children the children of the node being processed
 * @return the same list, unchanged
 */
@Override
protected List<QueryNode> setChildrenOrder(List<QueryNode> children) throws QueryNodeException {
  return children;
}
/**
 * Expands a node, or each child of a non-leaf node.
 *
 * <p>A leaf that produced expansions is returned wrapped, together with
 * those expansions, in a {@code GroupQueryNode(BooleanQueryNode)} with the
 * leaf first. For a non-leaf node the expansions of every child are
 * inserted right behind that child and the node itself is returned.
 *
 * @param node         node to expand
 * @param origNameInfo description of the name the user typed
 * @param level        one-element counter shared across the recursion
 * @return the node, or the group that replaces it
 * @throws QueryNodeException if an expansion fails
 */
private QueryNode expandNodes(QueryNode node, NameInfo origNameInfo, int[] level) throws QueryNodeException {
  ArrayList<QueryNode> expansions = new ArrayList<QueryNode>();

  if (node.isLeaf()) {
    doExpansion(origNameInfo, node, expansions, level);
    if (expansions.isEmpty()) {
      return node;
    }
    expansions.add(0, node);
    return new GroupQueryNode(new BooleanQueryNode(expansions));
  }

  List<QueryNode> children = node.getChildren();
  boolean modified = false;
  int idx = 0;
  while (idx < children.size()) {
    doExpansion(origNameInfo, children.get(idx), expansions, level);
    if (!expansions.isEmpty()) {
      // Interlace the new nodes right behind their source so the result
      // reads naturally; the index then jumps over what was just inserted
      // so that freshly added nodes are not expanded again.
      modified = true;
      children.addAll(idx + 1, expansions);
      idx += expansions.size();
      expansions.clear();
    }
    idx++;
  }
  if (modified) {
    node.set(children);
  }
  return node;
}
/**
 * Collects the expansions of one node into {@code parentChildren}.
 *
 * <p>Every textable node bumps the shared {@code level} counter. Fuzzy,
 * regexp and wildcard nodes are then left alone. A remaining node whose
 * field is one of the configured author fields gets, in this order:
 * synonyms (only for the first textable node seen and only when the
 * original name is not in long form), a prefix wildcard, and possibly
 * regular expressions. Any other non-leaf node is descended into.
 *
 * @param origNameInfo   description of the name the user typed
 * @param node           node to inspect
 * @param parentChildren receives the newly created nodes
 * @param level          one-element counter shared across the recursion
 * @throws QueryNodeException if the synonym lookup fails
 */
private void doExpansion(NameInfo origNameInfo, QueryNode node, List<QueryNode> parentChildren, int[] level)
    throws QueryNodeException {
  if (node instanceof TextableQueryNode) {
    level[0] = level[0] + 1; // anything above 1 switches the synonym lookup off

    if (node instanceof FuzzyQueryNode || node instanceof RegexpQueryNode
        || node instanceof WildcardQueryNode) {
      return;
    }

    FieldQueryNode fqn = (FieldQueryNode) node;
    if (fields.containsKey(fqn.getFieldAsString())) {
      // 'name upgrade'
      if (level[0] == 1 && !isLongForm(origNameInfo.origName)) {
        addSynonymNodes(origNameInfo.origName, fqn, parentChildren);
      }
      addPatternNodes(origNameInfo, fqn, parentChildren);
      return;
    }
  }
  if (!node.isLeaf()) {
    expandNodes(node, origNameInfo, level);
  }
}

/** Adds one field node per synonym of the original name (none when there are no synonyms). */
private void addSynonymNodes(String origName, FieldQueryNode fqn, List<QueryNode> out)
    throws QueryNodeException {
  String[] synonyms;
  try {
    synonyms = getSynonyms(origName);
  } catch (IOException e) {
    // keep the I/O failure as the cause so the stack trace is not lost
    throw new QueryNodeException(
        new MessageImpl("Wonky, wonky, bong, bong - synonym expansion failed..." + e.getMessage()), e);
  }
  if (synonyms == null) {
    return;
  }
  for (String syn : synonyms) {
    out.add(new FieldQueryNode(fqn.getField(), syn, fqn.getBegin(), fqn.getEnd()));
  }
}

/** Adds the prefix-wildcard node and, where applicable, the regex nodes for the node's text. */
private void addPatternNodes(NameInfo origNameInfo, FieldQueryNode fqn, List<QueryNode> out)
    throws QueryNodeException {
  String v = fqn.getTextAsString();
  String[] nameParts = AuthorUtils.splitName(v);

  // a name with fewer parts than the original is never broadened
  if (nameParts.length < origNameInfo.noOfParts) {
    return;
  }

  CharSequence field = fqn.getField();
  int begin = fqn.getBegin();
  int end = fqn.getEnd();

  if (nameParts.length == 1) { // the new name is just a surname
    if (origNameInfo.containsOnlySurname) { // and so was the original
      out.add(new PrefixWildcardQueryNode(field, v + "*", begin, end));
    }
    return;
  }

  int last = nameParts.length - 1;

  // "name*" is used only when both the original and the new name end with
  // an initial; every other case (including an original lone surname, which
  // probably reached us through synonym expansion and may carry initials)
  // requires a further name part: "name *"
  boolean extendLastInitial = !origNameInfo.containsOnlySurname
      && origNameInfo.lastPartWasAcronym
      && isInitial(nameParts, last);
  out.add(new PrefixWildcardQueryNode(field, v + (extendLastInitial ? "*" : " *"), begin, end));

  // special regular expression cases, only happen if the new name and the
  // original have an initial somewhere in the middle (at the same position)
  if (!regexIsPossible(nameParts, origNameInfo.parts)) {
    return;
  }

  StringBuilder nn = new StringBuilder(nameParts[0]);
  for (int i = 1; i < last; i++) {
    nn.append(' ').append(nameParts[i]);
    // nameParts may be longer than the original's parts, hence the
    // bounds-checked helper instead of indexing the original directly
    if (isInitial(nameParts, i) && isInitial(origNameInfo.parts, i)) {
      nn.append("[^\\s]+");
    }
  }
  nn.append(' ').append(nameParts[last]);

  String pattern = nn.toString();
  if (isInitial(nameParts, last) && isInitial(origNameInfo.parts, last)) {
    out.add(new AqpAdsabsRegexQueryNode(field, pattern + ".*", begin, end));
  } else {
    out.add(new AqpAdsabsRegexQueryNode(field, pattern, begin, end));
    out.add(new AqpAdsabsRegexQueryNode(field, pattern + " .*", begin, end));
  }
}

/** Tells whether {@code parts[idx]} exists and is a single character. */
private static boolean isInitial(String[] parts, int idx) {
  return idx >= 0 && idx < parts.length && parts[idx].length() == 1;
}
/**
 * Tells whether both arrays hold a single-character element at the same
 * position, where that position is neither the first nor the last one of
 * either array.
 *
 * @param orig    first array of name parts
 * @param newName second array of name parts
 * @return {@code true} if such a shared position exists
 */
private boolean regexIsPossible(String[] orig, String[] newName) {
  int idx = 1;
  while (idx < orig.length - 1 && idx < newName.length) {
    boolean initialInBoth = orig[idx].length() == 1 && newName[idx].length() == 1;
    if (initialInBoth && idx + 1 < newName.length) {
      return true;
    }
    idx++;
  }
  return false;
}
/**
 * Runs the input through the configured analyzer (field
 * {@code author_short_name_rage}) and returns every token after the first.
 *
 * <p>The stream is consumed following the TokenStream workflow
 * (reset, incrementToken, end, close) and is closed even when reading
 * fails. Every I/O failure is reported as {@link IOException}.
 *
 * @param origInput the name to look up
 * @return the tokens following the first one, or {@code null} when the
 *         analyzer produced fewer than two tokens
 * @throws IOException if the token stream cannot be reset, read or closed
 */
private String[] getSynonyms(String origInput) throws IOException {
  Analyzer analyzer = getQueryConfigHandler().get(ConfigurationKeys.ANALYZER);
  List<String> tokens = new ArrayList<String>();

  try (TokenStream source = analyzer.tokenStream("author_short_name_rage", new StringReader(origInput))) {
    CharTermAttribute termAtt = source.getAttribute(CharTermAttribute.class);
    source.reset();
    while (source.incrementToken()) {
      tokens.add(termAtt.toString());
    }
    source.end();
  }

  if (tokens.size() < 2) { // the first one is the original
    return null;
  }
  return tokens.subList(1, tokens.size()).toArray(new String[0]);
}
/**
 * Tells whether a name is written in long form, i.e. none of its
 * space-separated parts is an initial or an empty string.
 *
 * <p>The previous implementation counted long parts starting at index 1
 * but compared the count with the total number of parts, so it answered
 * {@code false} for every name that had at least one part.
 *
 * @param name the name to check
 * @return {@code true} if every part is longer than one character; a name
 *         that splits into no parts at all (only spaces) still yields
 *         {@code true}, as before
 */
private boolean isLongForm(String name) {
  for (String part : name.split(" ")) {
    if (part.length() <= 1) {
      return false;
    }
  }
  return true;
}
/*
 * This reproduces the first stage of the author tokenizer chain. It is
 * very important: without it the search may be *slightly* different.
 * Whenever the tokenizer chain is updated, this analyzer should be
 * reviewed as well.
 */
Analyzer authorNameAnalyzer = new Analyzer() {
  @Override
  public TokenStreamComponents createComponents(String fieldName) {
    // the whole input is kept as one token and then normalized twice
    Tokenizer keywordSource = new KeywordTokenizer();
    TokenStream normalized =
        new AuthorNormalizeFilter(new PythonicAuthorNormalizerFilter(keywordSource));
    return new TokenStreamComponents(keywordSource, normalized);
  }
};
/**
 * Normalizes an author name with {@code authorNameAnalyzer} and returns
 * every token the analyzer emits.
 *
 * <p>The stream is consumed following the TokenStream workflow
 * (reset, incrementToken, end, close). It is closed even when reading
 * fails, so a failure does not leave an unclosed stream behind.
 *
 * @param input the raw author name
 * @return the normalized forms, in the order the analyzer produced them
 * @throws QueryNodeException wrapping the {@link IOException} (kept as the
 *         cause) if the stream cannot be read
 */
private List<String> normalizeAuthorName(String input) throws QueryNodeException {
  List<String> out = new ArrayList<String>();
  try (TokenStream ts = authorNameAnalyzer.tokenStream("foo", input)) {
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      out.add(termAtt.toString());
    }
    ts.end();
  } catch (IOException e) {
    // the message is built exactly as before; the exception is now also
    // attached as the cause instead of being only a message argument
    throw new QueryNodeException(new MessageImpl("Error parsing: " + input, e), e);
  }
  return out;
}
/**
 * Facts about the name as the user typed it (after normalization), used
 * to decide how far the analyzed forms of that name may be broadened.
 */
class NameInfo {
  /** The name exactly as passed to the constructor. */
  public String origName;
  /** Whether the last part of the name is a single character. */
  public boolean lastPartWasAcronym;
  /** Number of elements in {@link #parts}. */
  public int noOfParts;
  /** The name as split by {@code AuthorUtils.splitName}. */
  public String[] parts;
  /** Whether the name ends with a comma or has no comma at all. */
  public boolean containsOnlySurname = false;

  /**
   * @param name the normalized author name
   */
  public NameInfo(String name) {
    origName = name;

    // lone surnames get always expanded
    if (name.endsWith(",") || !name.contains(",")) {
      containsOnlySurname = true;
    }

    // whether to add a space, ie. Kurtz, Michael J -> Kurtz, Michael J*
    // but Kurtz, Michael Julian -> Kurtz, Michael Julian *
    parts = AuthorUtils.splitName(name);
    noOfParts = parts.length;

    // NOTE(review): splitName is not visible here; if it can return an empty
    // array (e.g. for punctuation-only input) the unguarded index would throw
    lastPartWasAcronym = noOfParts > 0 && parts[noOfParts - 1].length() == 1;
  }
}
/**
 * A {@link Reader} over a string that can be re-targeted with
 * {@link #setValue(String)} instead of being re-allocated.
 *
 * <p>The reference to the string is dropped once the end is reached or the
 * reader is closed. After that, reads return {@code -1} until a new value
 * is set.
 */
static final class ReusableStringReader extends Reader {
  private int pos = 0, size = 0;
  private String s = null;

  /** Points the reader at the start of {@code s}. */
  void setValue(String s) {
    this.s = s;
    this.size = s.length();
    this.pos = 0;
  }

  /** Returns the next character, or {@code -1} at the end (which also releases the string). */
  @Override
  public int read() {
    if (pos < size) {
      return s.charAt(pos++);
    } else {
      s = null;
      return -1;
    }
  }

  /**
   * Copies up to {@code len} characters into {@code c} starting at
   * {@code off}.
   *
   * @return the number of characters copied, or {@code -1} at the end
   */
  @Override
  public int read(char[] c, int off, int len) {
    if (pos < size) {
      len = Math.min(len, size - pos);
      s.getChars(pos, pos + len, c, off);
      pos += len;
      return len;
    } else {
      s = null;
      return -1;
    }
  }

  @Override
  public void close() {
    pos = size; // this prevents NPE when reading after close!
    s = null;
  }

  /**
   * Rewinds to the start of the current string. When the string has
   * already been released (end reached or reader closed) there is nothing
   * to rewind to: the reader is left empty instead of throwing a
   * {@code NullPointerException}.
   */
  @Override
  public void reset() {
    if (s == null) {
      size = 0;
      pos = 0;
      return;
    }
    size = s.length();
    pos = 0;
  }
}
}