package org.apache.solr.analysis.author;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.PersistingMapTokenFilterFactory;
import org.apache.solr.common.util.StrUtils;
import org.apache.lucene.analysis.synonym.NewSolrSynonymParser;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory.SynonymParser;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.util.CharsRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is a trickster class - it modifies the synonym input on the fly, so that
* we don't need to bother with producing the expanded (multiplied) data from the
* author synonyms. But obviously, this could introduce some bugs...
*/
public class AuthorShortNameUpgradeFilterFactory extends PersistingMapTokenFilterFactory implements ResourceLoaderAware {
/**
 * Creates the factory, handing the configuration arguments straight to the
 * parent {@link PersistingMapTokenFilterFactory} constructor.
 *
 * @param args factory configuration arguments
 */
public AuthorShortNameUpgradeFilterFactory(Map<String,String> args) {
super(args);
}
/** Logger for this factory. */
public static final Logger log = LoggerFactory.getLogger(AuthorShortNameUpgradeFilterFactory.class);
/*
* If we were insane (and we apparently are!) we would want to generate all combinations
* of the name, ie.
*
* Surname, One Two Three; Foo, Bar Baz
*
* Results in:
*
* Surname, One T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O Two T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O T Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, One T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, One Two T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O Two Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, One T Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
*
* PLUS!
*
* Surname, One T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O Two => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, One => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
*
* and this happens for every name in the list!!!
*
* OK, I honestly think this is too many combinations and most of them are going to be
* useless. Especially because these combinations ARE ALREADY generated during the
* search for synonyms (but NOT used in the query if there is no EXACT match!)
*
* I secretly hope, that if a user typed "Foo, Boo Boz" she REALLY doesn't want
* to get "Foo, Baz Bar" -- and this is exactly what they would get IFF this logic is
* implemented - ie. if the synonym file contained another line with:
*
* "Foo, Boo Boz; Surname, One Boo"
*
* The combination "Foo, B B" will cause all the patterns of these two lines to be merged
* with "Foo, Bar Baz" and FALSE HITS are returned. And I don't like this idea and I
* think it is wrong.
*
* So, I am going to implement a middle path that creates ONLY the
* initial and surname variations. And if we discover that in fact this behaviour is not
* desirable, then I can change it.
*
* So, to summarize, I want to generate only these patterns for now:
*
* Surname, => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
* Surname, O T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
*
*/
/**
 * Synonym builder factory that rewrites the synonym source before it is
 * parsed: for every long-form name it emits explicit mappings from each
 * progressively longer initials-only variant ("Surname,", "Surname, O",
 * "Surname, O T", ...) to the full synonym group.
 */
public static class MakeAllShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory {

  /**
   * @param args factory arguments; {@code format=semicolon} switches the
   *             name separator of the input from ',' to ';'
   */
  public MakeAllShortNames(Map<String,String> args) {
    super(args);
  }

  /**
   * Returns a Solr-format synonym parser whose {@code add(Reader)} expands
   * the input with the generated short-name rules before delegating to the
   * stock parser.
   *
   * @param analyzer analyzer handed to the underlying parser
   * @return the customised parser
   */
  protected SynonymParser getParser(Analyzer analyzer) {
    final char charSeparator = "semicolon".equals(args.get("format")) ? ';' : ',';

    return new NewSolrSynonymParser(true, true, analyzer) {

      /**
       * Reads the synonym rules line by line, generates the short-name
       * mappings and passes ONLY the generated rules to the parent parser.
       *
       * @throws ParseException when an {@link IllegalArgumentException} is
       *         raised while processing a line (reported with its line number)
       */
      public void add(Reader in) throws IOException, ParseException {
        LineNumberReader br = new LineNumberReader(in);
        StringBuilder newBr = new StringBuilder();
        // short forms already emitted for the current input line
        HashSet<String> seen = new HashSet<String>();
        String line;
        try {
          while ((line = br.readLine()) != null) {
            if (line.length() == 0 || line.charAt(0) == '#') {
              continue; // ignore empty lines and comments
            }
            seen.clear();
            String[] sides = line.split("=>");
            if (sides.length > 1) { // explicit mapping: key=>name;name...
              String[] names = getNames(sides[1]);
              String[] parts = AuthorUtils.splitName(sides[0]);
              if (isLongForm(parts) && containsLongForm(names) > 0) {
                // NOTE(review): sides[0] is emitted verbatim, so it is assumed
                // to already be in escaped Solr format -- confirm.
                String target = sides[0] + "," + buildLine(names);
                for (String shortForm : getAllShortForms(parts)) {
                  if (seen.add(shortForm)) {
                    newBr.append(escape(shortForm)).append("=>").append(target).append("\n");
                  }
                }
              }
            } else { // equivalence list: name;name;...
              String[] names = getNames(sides[0]);
              if (containsLongForm(names) > 1) {
                String newLine = buildLine(names);
                for (String name : names) {
                  String[] parts = AuthorUtils.splitName(name);
                  if (!isLongForm(parts)) {
                    continue;
                  }
                  for (String shortForm : getAllShortForms(parts)) {
                    if (seen.add(shortForm)) {
                      newBr.append(escape(shortForm)).append("=>").append(newLine).append("\n");
                    }
                  }
                }
              }
            }
          }
        } catch (IllegalArgumentException e) {
          ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
          ex.initCause(e);
          throw ex;
        } finally {
          br.close();
        }
        // pass the modified synonyms to the builder to create a synonym map;
        // encode and decode with the same explicit charset
        Charset utf8 = Charset.forName("UTF-8");
        super.add(new InputStreamReader(
            new ByteArrayInputStream(newBr.toString().getBytes(utf8)), utf8.newDecoder()));
      }

      /**
       * Builds the initials-only variants of a name. Element 0 is
       * {@code parts[0]} alone; element {@code i} is {@code parts[0]}
       * followed by the first character of each of {@code parts[1..i]}.
       */
      private String[] getAllShortForms(String[] parts) {
        String[] forms = new String[parts.length];
        StringBuilder prefix = new StringBuilder();
        for (int i = 0; i < parts.length; i++) {
          if (i == 0) {
            prefix.append(parts[0]);
          } else {
            // each step adds the initial of the NEXT part (parts[i]), so the
            // i-th form carries the initials of parts[1]..parts[i]
            prefix.append(" ").append(parts[i].substring(0, 1));
          }
          forms[i] = prefix.toString();
        }
        return forms;
      }

      /** Forces {@code includeOrig} to true regardless of the value passed in. */
      @Override
      public void add(CharsRef input, CharsRef output, boolean includeOrig) {
        super.add(input, output, true);
      }

      /** Splits one side of a rule on the configured separator and unescapes every name. */
      private String[] getNames(String vals) {
        List<String> raw = StrUtils.splitSmart(vals, charSeparator);
        String[] names = new String[raw.size()];
        for (int i = 0; i < names.length; i++) {
          names[i] = unescape(raw.get(i));
        }
        return names;
      }

      /**
       * Joins the names, plus the short form of every long-form name, into a
       * comma-separated, escaped list. Because we subclass the Solr synonym
       * parser, the output must be in Solr format. Duplicates are dropped;
       * the order is whatever the backing {@link HashSet} yields.
       */
      private String buildLine(String[] names) {
        HashSet<String> set = new HashSet<String>();
        for (String name : names) {
          String[] p = AuthorUtils.splitName(name);
          if (isLongForm(p)) {
            set.add(makeShortForm(p));
          }
          set.add(name);
        }
        StringBuilder out = new StringBuilder();
        for (String name : set) {
          if (out.length() > 0) {
            out.append(",");
          }
          out.append(escape(name));
        }
        return out.toString();
      }

      /** Removes the backslash in front of spaces and in front of the configured separator. */
      private String unescape(String s) {
        return s.replace("\\ ", " ").replace("\\" + charSeparator, String.valueOf(charSeparator));
      }

      /** Escapes spaces and commas, as required by the Solr synonym format. */
      private String escape(String s) {
        return s.replace(" ", "\\ ").replace(",", "\\,");
      }

      /** Returns {@code parts[0]} followed by the first character of every remaining part. */
      private String makeShortForm(String[] parts) {
        StringBuilder out = new StringBuilder(parts[0]);
        for (int i = 1; i < parts.length; i++) {
          out.append(" ").append(parts[i].substring(0, 1));
        }
        return out.toString();
      }

      /** In this parser any name with more than one part counts as a long form. */
      private boolean isLongForm(String[] parts) {
        return parts.length > 1;
      }

      /** Counts how many of the given names are long forms. */
      private int containsLongForm(String[] names) {
        int count = 0;
        for (String name : names) {
          if (isLongForm(AuthorUtils.splitName(name))) {
            count++;
          }
        }
        return count;
      }
    };
  }
}
/*
* The following class will change (on-the-fly) the synonym file, for every
* rule which contains a fullname, ie.
*
* Surname, Name; Foo, Bar
*
* it will produce new mappings of the form
*
* Surname, N => Surname, Name; Foo, B; Foo, Bar
* Foo, B => Foo, Bar; Surname, Name; Surname, N
*
*
*
* This class was the first attempt I wrote on the synonym upgrade, however
* Alberto wants all combinations of names to be upgraded, i.e.
*
* Surname, => Surname, Name; Foo, B; Foo, Bar
* Surname, N => Surname, Name; Foo, B; Foo, Bar
* Foo, => Foo, Bar; Surname, Name; Surname, N
* Foo, B => Foo, Bar; Surname, Name; Surname, N
*
*/
/**
 * Synonym builder factory that rewrites the synonym source before it is
 * parsed: for every long-form name it emits one explicit mapping from the
 * fully abbreviated form ("Surname, N") to the whole synonym group.
 */
public static class MakeShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory {

  /**
   * @param args factory arguments; {@code format=semicolon} switches the
   *             name separator of the input from ',' to ';'
   */
  public MakeShortNames(Map<String,String> args) {
    super(args);
  }

  /**
   * Returns a Solr-format synonym parser whose {@code add(Reader)} replaces
   * the input with the generated short-name rules before delegating to the
   * stock parser.
   *
   * @param analyzer analyzer handed to the underlying parser
   * @return the customised parser
   */
  protected SynonymParser getParser(Analyzer analyzer) {
    final char charSeparator = "semicolon".equals(args.get("format")) ? ';' : ',';

    return new NewSolrSynonymParser(true, true, analyzer) {

      /**
       * Reads the synonym rules line by line, generates the short-name
       * mappings and passes ONLY the generated rules to the parent parser.
       *
       * @throws ParseException when an {@link IllegalArgumentException} is
       *         raised while processing a line (reported with its line number)
       */
      public void add(Reader in) throws IOException, ParseException {
        LineNumberReader br = new LineNumberReader(in);
        StringBuilder newBr = new StringBuilder();
        String line;
        try {
          while ((line = br.readLine()) != null) {
            if (line.length() == 0 || line.charAt(0) == '#') {
              continue; // ignore empty lines and comments
            }
            String[] sides = line.split("=>");
            if (sides.length > 1) { // explicit mapping: key=>name;name...
              String[] names = getNames(sides[1]);
              String[] parts = AuthorUtils.splitName(sides[0]);
              if (isLongForm(parts) && containsLongForm(names) > 0) {
                // NOTE(review): sides[0] is emitted verbatim, so it is assumed
                // to already be in escaped Solr format -- confirm.
                newBr.append(escape(makeShortForm(parts))).append("=>")
                    .append(sides[0]).append(",").append(buildLine(names)).append("\n");
              }
            } else { // equivalence list: name;name;...
              String[] names = getNames(sides[0]);
              if (containsLongForm(names) > 1) {
                String newLine = buildLine(names);
                // iterate over the parsed names; 'sides' has a single element
                // here and must not be indexed with the name position
                for (String name : names) {
                  String[] parts = AuthorUtils.splitName(name);
                  if (isLongForm(parts)) {
                    newBr.append(escape(makeShortForm(parts))).append("=>")
                        .append(newLine).append("\n");
                  }
                }
              }
            }
          }
        } catch (IllegalArgumentException e) {
          ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
          ex.initCause(e);
          throw ex;
        } finally {
          br.close();
        }
        // pass the modified synonyms to the builder to create a synonym map;
        // the bytes must be produced with the same charset that decodes them,
        // not with the platform default
        Charset utf8 = Charset.forName("UTF-8");
        super.add(new InputStreamReader(
            new ByteArrayInputStream(newBr.toString().getBytes(utf8)), utf8.newDecoder()));
      }

      /** Forces {@code includeOrig} to true regardless of the value passed in. */
      @Override
      public void add(CharsRef input, CharsRef output, boolean includeOrig) {
        super.add(input, output, true);
      }

      /** Splits one side of a rule on the configured separator and unescapes every name. */
      private String[] getNames(String vals) {
        List<String> raw = StrUtils.splitSmart(vals, charSeparator);
        String[] names = new String[raw.size()];
        for (int i = 0; i < names.length; i++) {
          names[i] = unescape(raw.get(i));
        }
        return names;
      }

      /**
       * Joins the names, plus the short form of every long-form name, into a
       * comma-separated, escaped list in Solr synonym format. Duplicates are
       * dropped; the order is whatever the backing {@link HashSet} yields.
       */
      private String buildLine(String[] names) {
        HashSet<String> set = new HashSet<String>();
        for (String name : names) {
          String[] p = AuthorUtils.splitName(name);
          if (isLongForm(p)) {
            set.add(makeShortForm(p));
          }
          set.add(name);
        }
        StringBuilder out = new StringBuilder();
        for (String name : set) {
          if (out.length() > 0) {
            out.append(",");
          }
          out.append(escape(name));
        }
        return out.toString();
      }

      /** Removes the backslash in front of spaces and in front of the configured separator. */
      private String unescape(String s) {
        return s.replace("\\ ", " ").replace("\\" + charSeparator, String.valueOf(charSeparator));
      }

      /** Escapes spaces and commas, as required by the Solr synonym format. */
      private String escape(String s) {
        return s.replace(" ", "\\ ").replace(",", "\\,");
      }

      /** Returns {@code parts[0]} followed by the first character of every remaining part. */
      private String makeShortForm(String[] parts) {
        StringBuilder out = new StringBuilder(parts[0]);
        for (int i = 1; i < parts.length; i++) {
          out.append(" ").append(parts[i].substring(0, 1));
        }
        return out.toString();
      }

      /** A name is a long form when any part after the first is longer than one character. */
      private boolean isLongForm(String[] parts) {
        for (int i = 1; i < parts.length; i++) {
          if (parts[i].length() > 1) {
            return true;
          }
        }
        return false;
      }

      /** Counts how many of the given names are long forms. */
      private int containsLongForm(String[] names) {
        int count = 0;
        for (String name : names) {
          if (isLongForm(AuthorUtils.splitName(name))) {
            count++;
          }
        }
        return count;
      }
    };
  }
}
/**
 * Resource-loader callback; simply delegates to the superclass
 * implementation and adds no behaviour of its own.
 *
 * @param loader resource loader passed through to the parent
 */
@Override
public void inform(ResourceLoader loader) {
super.inform(loader);
}
/**
 * Returns the given stream unchanged.
 *
 * @param input the incoming token stream
 * @return {@code input} itself, unwrapped
 */
@Override
public TokenStream create(TokenStream input) {
// this filter factory does nothing on its own, it is used by the
// NewSynonymFilterFactory
return input;
}
}