package org.apache.solr.analysis.author;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/*
* Inspect the query form and decide whether we should generate the ADS
* fail-safe query additions. If the author search is:
*
* "Kurtz, Michael"
*
* then the old records can be simply under
*
* "Kurtz, M"
* "Kurtz,"
*
* This filter expects NORMALIZED form of the author name.
*
*/
public final class AuthorCreateQueryVariationsFilter extends TokenFilter {
private final String tokenType;
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private boolean lookAtPayloadForOrigAuthor;
private int maxNumberOfNames = 6; // safety precaution, we are pretty efficient but one should be careful...
private boolean plainSurname;
private int createVariations;
private boolean addWildcards;
private boolean shortenMultiname;
public AuthorCreateQueryVariationsFilter(TokenStream input, String tokenType,
boolean surname, int variate, boolean addWildcards,
boolean shortenMultiname, boolean lookAtPayloadForOrigAuthor) {
super(input);
this.variationStack = new Stack<String>();
this.createVariations = variate;
this.plainSurname = surname;
this.tokenType = tokenType;
this.addWildcards = addWildcards;
this.shortenMultiname = shortenMultiname;
this.lookAtPayloadForOrigAuthor = lookAtPayloadForOrigAuthor;
}
private Stack<String> variationStack;
private AttributeSource.State current;
private String origAuthorName = null;
@Override
public boolean incrementToken() throws IOException {
if (this.variationStack.size() > 0) {
String syn = this.variationStack.remove(0);
this.restoreState(this.current);
this.termAtt.setEmpty();
this.termAtt.append(syn);
this.posIncrAtt.setPositionIncrement(0);
this.typeAtt.setType(AuthorUtils.AUTHOR_QUERY_VARIANT);
//System.out.println("var:" + termAtt.toString());
return true;
}
if (!input.incrementToken()) return false;
// sort of hack, we want to know what the original input was
// but it can't be done otherwise because the SynonynFilter
// is resetting all attributes!!! This will work only for the
// first author in the token list, but since we use the filter
// only for the query, it should be fine
if (origAuthorName==null) origAuthorName=termAtt.toString();
if ((tokenType==null || typeAtt.type().equals(tokenType)) && this.genVariations()) {
this.current = this.captureState();
}
//System.out.println("var:" + termAtt.toString());
return true;
}
private boolean genVariations() {
String authorName = termAtt.toString();
if (lookAtPayloadForOrigAuthor && payloadAtt.getPayload() != null) {
origAuthorName = payloadAtt.getPayload().utf8ToString();
}
if (!authorName.contains(",")) return false;
if (authorName.endsWith(",") && addWildcards) {
variationStack.push(authorName + " *");
return true;
}
String[] parts = null;
String[] origParts = null;
if (authorName.contains(",")) {
String[] authorParts = authorName.split(",\\s*", 2);
String[] origAuthorParts = origAuthorName.split(",\\s*", 2);
if (authorParts.length > 1) {
String[] nameParts = authorParts[1].split(" ", maxNumberOfNames-1 );
String[] origNameParts = origAuthorParts[1].split(" ", maxNumberOfNames-1 );
parts = new String[nameParts.length+1];
parts[0] = authorParts[0] + ",";
for (int i=1;i<nameParts.length+1;i++) {
parts[i] = nameParts[i-1];
}
origParts = new String[origNameParts.length+1];
origParts[0] = origAuthorParts[0] + ",";
for (int i=1;i<origNameParts.length+1;i++) {
origParts[i] = origNameParts[i-1];
}
}
else {
parts = authorParts;
origParts = origAuthorParts;
}
}
else {
parts = authorName.split(" ", maxNumberOfNames );
origParts = origAuthorName.split(" ", maxNumberOfNames );
}
// this is an important indicator that influences how the wildcard variant
// is generated (if there is only acronym in the input, we do prefix search,
// if there is more than 2 characters, we append a space - ie. "kurtz, mi *"
// as opposed to "kurtz, m*"
boolean lastPartWasAcronym = origParts[origParts.length-1].length() == 1;
if (createVariations > 0 && parts.length >= createVariations) {
List<Integer> ids = new ArrayList<Integer>();
for (int i=1;i<parts.length;i++) {
if (parts[i].length()>1) ids.add(i);
}
StringBuilder output = new StringBuilder();
if (ids.size()>0) { // number of to-be-shortened name parts
int[] idx = new int[ids.size()];
for (int i=0;i<idx.length;i++) {
idx[i] = ids.get(i);
}
List<int[]> combinations = comb(idx);
for (int[] comb: combinations) {
String[] newParts = Arrays.copyOf(parts, parts.length);
for (int x: comb) {
newParts[x] = parts[x].substring(0,1);
}
output = new StringBuilder();
boolean notFirst = false;
for (int i=0;i<newParts.length;i++) {
if (notFirst) output.append(" ");
output.append(newParts[i]);
notFirst=true;
}
if (lastPartWasAcronym) {
if (addWildcards) {
variationStack.push(output.toString() + "*"); // prefix search
}
else {
variationStack.push(output.toString());
}
}
else {
if (addWildcards) {
variationStack.push(output.toString());
variationStack.push(output.toString() + " *");
}
else {
variationStack.push(output.toString());
}
}
}
}
else {
if (addWildcards) {
variationStack.push(authorName + (lastPartWasAcronym ? "*" : " *"));
}
}
}
if (shortenMultiname && parts.length > 2) {
variationStack.push(parts[0] + " " + parts[1]);
if (parts[1].length()>1) {
variationStack.push(parts[0] + " " + parts[1].substring(0,1));
}
}
if (plainSurname) variationStack.push(parts[0]);
return variationStack.size() > 0;
}
@Override
public void reset() throws IOException {
super.reset();
variationStack.clear();
current = null;
origAuthorName=null;
}
private List<int[]> comb(int... items) {
ArrayList<int[]> comb = new ArrayList<int[]>();
Arrays.sort(items);
for (int k = 1; k <= items.length; k++) {
kcomb(items, 0, k, new int[k], comb);
}
return comb;
}
public void kcomb(int[] items, int n, int k, int[] arr, List<int[]> comb) {
if (k == 0) {
comb.add(Arrays.copyOf(arr, arr.length));
} else {
for (int i = n; i <= items.length - k; i++) {
arr[arr.length - k] = items[i];
kcomb(items, i + 1, k - 1, arr, comb);
}
}
}
}