package org.apache.solr.handler.batch;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.SimpleCollector;
import org.apache.solr.analysis.WriteableExplicitSynonymMap;
import org.apache.solr.analysis.WriteableSynonymMap;
import org.apache.solr.analysis.author.AuthorUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
/**
 * Provider which collects all author names from a stored index field
 * and saves their ASCII => UTF8 synonym mapping to disk.
 *
 */
public class BatchProviderDumpAuthorNames extends BatchProvider {

  /** Stored field whose values are read back from the index. */
  public String sourceField = "author";
  /** Schema field whose query analyzer chain produces the transliterated tokens. */
  public String analyzerField = "author_collector";

  /**
   * Walks every document in the index, re-analyzes the stored author values
   * and persists the collected ASCII => UTF8 name mapping as a synonym file
   * under {@code #workdir/jobid}.
   *
   * @param req   the Solr request carrying {@code jobid} and {@code #workdir} params
   * @param queue batch queue; the popped entry is marked failed on misconfiguration
   * @throws Exception propagated from searching or persisting the synonym map
   */
  @Override
  public void run(SolrQueryRequest req, BatchHandlerRequestQueue queue) throws Exception {
    SolrParams params = req.getParams();
    String jobid = params.get("jobid");
    String workDir = params.get("#workdir");
    SolrCore core = req.getCore();
    BatchHandlerRequestData data = queue.pop();

    // The source field must be stored, otherwise there is nothing to read back.
    SchemaField field = core.getLatestSchema().getFieldOrNull(sourceField);
    if (field == null || !field.stored()) {
      data.setMsg("We cannot dump fields that are not stored: " + sourceField);
      queue.registerFailedBatch(this, data);
      return;
    }

    final Analyzer analyzer = core.getLatestSchema().getQueryAnalyzer();
    SchemaField tField = core.getLatestSchema().getFieldOrNull(analyzerField);
    if (tField == null) {
      data.setMsg("We cannot find analyzer for: " + analyzerField);
      queue.registerFailedBatch(this, data);
      return;
    }

    final String targetAnalyzer = analyzerField;
    SolrIndexSearcher se = req.getSearcher();
    final HashSet<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(sourceField);

    File jobFile = new File(workDir + "/" + jobid);
    final WriteableSynonymMap synMap = createSynonymMap();
    synMap.setOutput(jobFile.getAbsolutePath());

    // Visit every document; the collector feeds each stored value through the
    // analyzer chain and groups the emitted variants under the original name.
    se.search(new MatchAllDocsQuery(), new SimpleCollector() {
      private LeafReader reader;
      // Transliterated forms seen since the last AUTHOR_INPUT token;
      // LinkedHashSet keeps them unique while preserving emission order.
      private Set<String> tokenBuffer = new LinkedHashSet<String>();
      private CharTermAttribute termAtt;
      private TypeAttribute typeAtt;
      // The original (possibly UTF8) author name the buffered tokens belong to.
      private String authorInput;

      @Override
      protected void doSetNextReader(LeafReaderContext context) throws IOException {
        reader = context.reader();
      }

      @Override
      public void collect(int docId) {
        try {
          Document d = reader.document(docId, fieldsToLoad);
          for (String f : fieldsToLoad) {
            for (String s : d.getValues(f)) {
              // try-with-resources: the stream is closed even when reset()
              // or incrementToken() throws (the original leaked it here).
              try (TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s))) {
                ts.reset();
                while (ts.incrementToken()) {
                  termAtt = ts.getAttribute(CharTermAttribute.class);
                  typeAtt = ts.getAttribute(TypeAttribute.class);
                  if (typeAtt.type().equals(AuthorUtils.AUTHOR_INPUT)) {
                    // A new original name starts: flush what we gathered so far.
                    addTokensToSynMap();
                    authorInput = termAtt.toString();
                  } else {
                    tokenBuffer.add(termAtt.toString());
                  }
                }
                addTokensToSynMap();
                ts.end(); // complete the TokenStream contract before close()
              }
            }
          }
        } catch (IOException e) {
          // Best-effort dump: a document we cannot load/analyze is skipped
          // rather than aborting the whole batch job. Discard any partial
          // state so it cannot contaminate the next document's entry.
          tokenBuffer.clear();
          authorInput = null;
        }
      }

      /**
       * Flushes the buffered transliterations for the current author into the
       * synonym map; obviously broken inputs (shorter than 4 chars or more
       * than 5 space-separated parts) are ignored.
       */
      private void addTokensToSynMap() {
        if (!tokenBuffer.isEmpty()) {
          if (authorInput != null && authorInput.length() >= 4
              && authorInput.split(" ").length <= 5) { // ignore obvious mistakes
            synMap.add(authorInput, tokenBuffer);
          }
          tokenBuffer.clear();
          authorInput = null;
        }
      }

      @Override
      public boolean needsScores() {
        return false; // only stored fields are read; scoring is irrelevant
      }
    });
    synMap.persist();
  }

  @Override
  public String getDescription() {
    return "Collects author names and saves them to disk as synonym file";
  }

  /**
   * Builds the synonym map used for the dump. The map keys are ASCII forms
   * and the values are the upgraded UTF8 originals; on persist it generates
   * progressively shortened name variations, e.g. for {@code MÜLLER, BILL}:
   *
   * <pre>
   * MULLER, BILL=&gt;MÜLLER, BILL
   * MUELLER, BILL=&gt;MÜLLER, BILL
   * MULLER, B=&gt;MÜLLER, B
   * MUELLER, B=&gt;MÜLLER, B
   * MULLER,=&gt;MÜLLER,
   * </pre>
   */
  private WriteableSynonymMap createSynonymMap() {
    return new WriteableExplicitSynonymMap() { // no configuration allowed!

      /**
       * Re-populates the map from persisted {@code key=>v1;v2} rules by
       * replaying each value through {@link #add(String, Set)}.
       */
      @Override
      public void populateMap(List<String> rules) {
        HashSet<String> hs = new HashSet<String>();
        for (String rule : rules) {
          List<String> mapping = StrUtils.splitSmart(rule, "=>", false);
          if (mapping.size() != 2) {
            log.error("Invalid Synonym Rule:" + rule);
            continue;
          }
          // Unescape the persisted key before reusing it as a value.
          String key = mapping.get(0).trim().replace("\\,", ",").replace("\\ ", " ");
          hs.clear();
          hs.add(key);
          for (String val : splitValues(mapping.get(1))) {
            add(val, hs);
          }
        }
      }

      /**
       * Registers one original name under every transliterated key. All keys
       * of one author deliberately share a single value set, so later
       * additions through any of the keys are visible under all of them.
       *
       * @param origName the original author input (possibly with UTF8 characters)
       * @param values   set of transliterated forms used as map keys
       */
      @Override
      public void add(String origName, Set<String> values) {
        Set<String> masterSet = null;
        for (String key : values) {
          if (containsKey(key)) {
            masterSet = get(key);
            break;
          }
        }
        if (masterSet == null) {
          masterSet = new LinkedHashSet<String>();
        }
        masterSet.add(origName);
        for (String key : values) {
          put(key, masterSet);
        }
      }

      /**
       * Renders one map entry into zero or more {@code key=>v1;v2} rows,
       * emitting progressively shortened variants (surname + shrinking given
       * names). Rows made purely of ASCII carry no information and are
       * filtered out.
       */
      @Override
      public String formatEntry(String key, Set<String> values) {
        List<String> rows = new ArrayList<String>();
        // Remove all but the first comma: "agusan, Adrian, , Dr" -> "agusan, Adrian Dr"
        key = key.replaceAll("\\G((?!^).*?|[^,]*,.*?),", "$1");
        key = AuthorUtils.normalizeAuthor(key);
        String[] nameParts = key.split(" ");
        if (nameParts.length > 1) {
          String[][] otherNames = new String[values.size()][];
          int n = 0;
          for (String name : values) {
            // Remove all but the first comma, same as for the key.
            name = name.replaceAll("\\G((?!^).*?|[^,]*,.*?),", "$1");
            name = AuthorUtils.normalizeAuthor(name);
            otherNames[n++] = name.split(" ");
          }
          int cycle = 0;
          do {
            for (n = 0; n < nameParts.length; n++) {
              // After the first cycle the surname-only row was already emitted.
              if (cycle > 0 && n == 0) continue;
              StringBuilder out = new StringBuilder();
              out.append(join(nameParts, n));
              out.append("=>");
              boolean notFirst = false;
              for (String[] other : otherNames) {
                if (notFirst) out.append(";");
                out.append(join(other, n));
                notFirst = true;
              }
              rows.add(out.toString());
            }
            cycle++;
          } while (shortened(nameParts, otherNames));
        }
        // Cleanup: keep only rows containing at least one non-ASCII character.
        StringBuilder toReturn = new StringBuilder();
        for (String row : rows) {
          if (hasNonAscii(row)) {
            toReturn.append(row);
            toReturn.append("\n");
          }
        }
        return toReturn.toString();
      }

      /** @return true if any character lies outside the ASCII range 0-127. */
      private boolean hasNonAscii(String s) {
        for (char c : s.toCharArray()) {
          if (c > 127) { // fixed off-by-one: 128 (0x80) is already non-ASCII
            return true;
          }
        }
        return false;
      }

      /** Joins name parts 0..v (inclusive) with single spaces. */
      private String join(String[] name, int v) {
        StringBuilder out = new StringBuilder();
        boolean notFirst = false;
        for (int i = 0; i <= v; i++) {
          if (notFirst) out.append(" ");
          out.append(name[i]);
          notFirst = true;
        }
        return out.toString();
      }

      /**
       * Shortens (in place) the last still-long name part of the key and of
       * every synonym to its initial.
       *
       * @return true if something was shortened and another emit cycle is due
       */
      private boolean shortened(String[] nameParts, String[][] otherNames) {
        for (int i = nameParts.length - 1; i > 0; i--) {
          if (nameParts[i].length() > 1) {
            nameParts[i] = nameParts[i].substring(0, 1);
            for (String[] other : otherNames) {
              if (other[i] == null || other[i].length() < 2)
                return false; // this may happen if synonyms map the name to a shorter version, my solution is to stop processing (cheap?)
              other[i] = other[i].substring(0, 1);
            }
            return true;
          }
        }
        return false;
      }
    };
  }
}