// PythonicAuthorNormalizerFilter.java example
//
// Explorer
// montysolr-master
// - contrib
package org.apache.solr.analysis.author;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.jython.JythonObjectFactory;
import org.jython.monty.interfaces.JythonNameParser;

/*
 * This filter calls the Python library python-nameparser
 * (http://code.google.com/p/python-nameparser/) to parse the input string, e.g.
 * 
 * Doe, Lt. Gen. John A. Kenneth IV 
 *
 * is parsed as:
 * 
 * <HumanName : [
 *  Title: 'Lt. Gen.' 
 *  First: 'John' 
 *  Middle: 'A. Kenneth' 
 *  Last: 'Doe' 
 *  Suffix: 'IV'
 * ]>
 *
 * The input can contain several author names, but these need to be separated
 * by semicolons.
 */
/**
 * Token filter that rewrites author names into {@code "Last, First Middle"} form.
 *
 * <p>Every incoming term has runs of whitespace collapsed to a single space and is then
 * split on {@code ;}. Each piece is handed to the Jython name parser and produces zero,
 * one or two output tokens (see {@link #collectVariants(String)}). All tokens emitted by
 * this filter carry the type {@code AuthorUtils.AUTHOR_INPUT}.
 *
 * <p>Only the term text and the type are rewritten; every other attribute keeps whatever
 * value the wrapped stream set for the most recently consumed input token.
 */
public final class PythonicAuthorNormalizerFilter extends TokenFilter {

  /** Matches two or more consecutive whitespace characters. */
  private static final Pattern MULTI_SPACE = Pattern.compile("\\s\\s+");

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** Names produced from the current input token that are still waiting to be emitted. */
  private final List<String> buffer = new ArrayList<String>();

  private final JythonNameParser jythonParser;

  /**
   * Wraps {@code input} and instantiates the Jython {@code HumanParser} object from the
   * {@code jython_name_parser} module.
   *
   * @param input the token stream whose terms hold (semicolon separated) author names
   */
  public PythonicAuthorNormalizerFilter(TokenStream input) {
    super(input);
    JythonObjectFactory factory =
        new JythonObjectFactory(JythonNameParser.class, "jython_name_parser", "HumanParser");
    this.jythonParser = (JythonNameParser) factory.createObject();
  }

  /**
   * Emits the next pending name, pulling and parsing further input tokens as needed.
   *
   * @return {@code true} if a token was produced, {@code false} once the wrapped stream is
   *         exhausted and nothing is left in the buffer
   * @throws SolrException with {@code BAD_REQUEST} if a name is parsed into a first and a
   *         middle name but no last name
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (emitBuffered()) {
      return true;
    }

    // Keep pulling input: a token that produces no names (e.g. a lone ";") must not be
    // mistaken for the end of the stream.
    while (input.incrementToken()) {
      String original = MULTI_SPACE.matcher(termAtt.toString()).replaceAll(" ");
      for (String individual : original.split(";")) {
        collectVariants(individual);
      }
      if (emitBuffered()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Resets the wrapped stream and discards names left over from a previous,
   * possibly only partially consumed, pass.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    buffer.clear();
  }

  /**
   * Moves the oldest buffered name into the term attribute and sets the token type.
   *
   * @return {@code false} if the buffer was empty and nothing was emitted
   */
  private boolean emitBuffered() {
    if (buffer.isEmpty()) {
      return false;
    }
    termAtt.setEmpty().append(buffer.remove(0));
    typeAtt.setType(AuthorUtils.AUTHOR_INPUT);
    return true;
  }

  /**
   * Parses one name and appends the resulting token texts to {@link #buffer}.
   *
   * <ul>
   *   <li>parser returned {@code null}: the name is buffered unchanged;</li>
   *   <li>no usable name part was found: nothing is buffered;</li>
   *   <li>the normalized form equals the input (ignoring spaces and an optional trailing
   *       comma): only the normalized form is buffered;</li>
   *   <li>otherwise: the input (with a comma appended if it contains none) followed by
   *       the normalized form.</li>
   * </ul>
   */
  private void collectVariants(String individual) {
    Map<String,String> parsedName = jythonParser.parse_human_name(individual);
    if (parsedName == null) {
      buffer.add(individual);
      return;
    }

    String newIndividual = normalize(parsedName, individual);
    if (newIndividual == null) {
      // nothing usable was parsed; ignore this input completely
      return;
    }

    String ignSpaceIndividual = individual.replace(" ", "");
    String ignNewSpaceIndividual = newIndividual.replace(" ", "");

    boolean unchanged = newIndividual.equals(individual)
        || newIndividual.equals(individual + ",")
        || ignNewSpaceIndividual.equals(ignSpaceIndividual)
        || ignNewSpaceIndividual.equals(ignSpaceIndividual + ",");

    if (unchanged) {
      buffer.add(newIndividual); // no real modification, one token is enough
      return;
    }

    // some modifications happened: keep the original spelling next to the modified one
    buffer.add(individual.indexOf(',') == -1 ? individual + "," : individual);
    buffer.add(newIndividual);
  }

  /**
   * Builds the {@code "Last, First Middle"} string from the parsed name parts.
   *
   * @return the normalized name, or {@code null} if the map holds none of
   *         {@code Last}, {@code First} or a non-null {@code Title}
   * @throws SolrException if {@code First} and {@code Middle} are present without {@code Last}
   */
  private static String normalize(Map<String,String> parsedName, String individual) {
    if (parsedName.containsKey("Last")) {
      StringBuilder name = new StringBuilder();
      name.append(parsedName.get("Last")).append(',');
      if (parsedName.containsKey("First")) {
        name.append(' ').append(parsedName.get("First"));
      }
      if (parsedName.containsKey("Middle")) {
        name.append(' ').append(parsedName.get("Middle"));
      }
      return name.toString();
    }

    if (parsedName.containsKey("First")) {
      if (parsedName.containsKey("Middle")) {
        // should never happen
        throw new SolrException(ErrorCode.BAD_REQUEST,
            "We cannot reliably parse author name: " + individual);
      }
      return parsedName.get("First") + ","; // we treat it as surname
    }

    // Only a title was recognised; since it is the only thing we have, take it for the
    // surname. Without a title there is nothing to emit (previously this yielded "null,").
    String title = parsedName.get("Title");
    return title != null ? title + "," : null;
  }
}