package org.apache.solr.analysis.author;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.jython.JythonObjectFactory;
import org.jython.monty.interfaces.JythonNameParser;
/*
 * This filter calls the Python library http://code.google.com/p/python-nameparser/
 * to parse the input string, e.g.
 *
 *   Doe, Lt. Gen. John A. Kenneth IV
 *
 * is parsed as:
 *
 *   <HumanName : [
 *     Title: 'Lt. Gen.'
 *     First: 'John'
 *     Middle: 'A. Kenneth'
 *     Last: 'Doe'
 *     Suffix: 'IV'
 *   ]>
 *
 * The input can contain several author names, but these need to be separated
 * by semicolons.
 */
/**
 * Token filter that rewrites each incoming term (one or more author names
 * separated by {@code ;}) into normalized {@code "Last, First Middle"} tokens.
 *
 * <p>Every emitted token gets the type {@link AuthorUtils#AUTHOR_INPUT}. When the
 * normalized form differs from the input by more than spacing or a trailing
 * comma, both the original name and the normalized name are emitted.
 */
public final class PythonicAuthorNormalizerFilter extends TokenFilter {

  /** Matches runs of two or more whitespace characters; they are collapsed to one space. */
  private static final Pattern MULTI_SPACE = Pattern.compile("\\s\\s+");

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** Names produced from the current input token that still have to be emitted, in order. */
  private final List<String> buffer = new ArrayList<String>();

  private final JythonNameParser jythonParser;

  /**
   * Creates the filter and instantiates the Jython-backed name parser.
   *
   * @param input the upstream token stream whose terms hold the author names
   */
  public PythonicAuthorNormalizerFilter(TokenStream input) {
    super(input);
    JythonObjectFactory factory = new JythonObjectFactory(JythonNameParser.class, "jython_name_parser", "HumanParser");
    this.jythonParser = (JythonNameParser) factory.createObject();
  }

  /**
   * Resets the upstream stream and discards any names still queued, so that
   * leftovers from a previous use of this filter cannot leak into the next one.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    buffer.clear();
  }

  /**
   * Emits the next normalized author name.
   *
   * <p>Queued names are drained first. When the queue is empty, upstream tokens
   * are consumed until one of them yields at least one name; an upstream token
   * that yields nothing (for example a lone {@code ;}) is skipped instead of
   * ending the stream.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         upstream is exhausted and nothing is queued
   * @throws IOException if the upstream stream fails
   * @throws SolrException (BAD_REQUEST) if a name is parsed into first and
   *         middle parts without a last name
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (buffer.isEmpty()) {
      if (!input.incrementToken()) {
        return false;
      }
      String original = MULTI_SPACE.matcher(termAtt.toString()).replaceAll(" ");
      for (String individual : original.split(";")) {
        collectVariants(individual);
      }
    }
    termAtt.setEmpty().append(buffer.remove(0));
    typeAtt.setType(AuthorUtils.AUTHOR_INPUT);
    return true;
  }

  /** Parses one name and queues the token(s) that should be emitted for it. */
  private void collectVariants(String individual) {
    Map<String, String> parsedName = jythonParser.parse_human_name(individual);
    if (parsedName == null) {
      // the parser gave no result at all: pass the input through untouched
      buffer.add(individual);
      return;
    }

    String normalized = normalize(parsedName, individual);
    if (normalized == null) {
      // nothing usable was parsed; ignore this input completely
      return;
    }

    // Compare while ignoring spaces and an optional trailing comma. This also
    // covers the exact-match cases, which therefore need no separate check.
    String compactIndividual = individual.replace(" ", "");
    String compactNormalized = normalized.replace(" ", "");
    boolean unchanged = compactNormalized.equals(compactIndividual)
        || compactNormalized.equals(compactIndividual + ",");

    if (!unchanged) {
      // keep the original spelling too, making sure it carries a comma
      buffer.add(individual.indexOf(',') == -1 ? individual + "," : individual);
    }
    buffer.add(normalized);
  }

  /**
   * Builds the {@code "Last, First Middle"} form from the parsed name parts.
   *
   * @return the normalized name, or {@code null} when the parse contains no
   *         last name, first name or title to build it from
   */
  private static String normalize(Map<String, String> parsedName, String individual) {
    if (parsedName.containsKey("Last")) {
      return parsedName.get("Last") + ","
          + (parsedName.containsKey("First") ? " " + parsedName.get("First") : "")
          + (parsedName.containsKey("Middle") ? " " + parsedName.get("Middle") : "");
    }
    if (parsedName.containsKey("First")) {
      if (parsedName.containsKey("Middle")) {
        // should never happen
        throw new SolrException(ErrorCode.BAD_REQUEST, "We cannot reliably parse author name: " + individual);
      }
      return parsedName.get("First") + ","; // a lone first name is treated as the surname
    }
    // Only a title may be left; being the only thing we have, it is taken for the
    // surname. Without a title there is nothing to emit (previously "null,").
    String title = parsedName.get("Title");
    return title == null ? null : title + ",";
  }
}