package org.apache.solr.handler.batch;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.SimpleCollector;
import org.apache.solr.analysis.WriteableExplicitSynonymMap;
import org.apache.solr.analysis.WriteableSynonymMap;
import org.apache.solr.analysis.author.AuthorUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;

/**
 * Provider which collects all author names and saves their
 * ASCII => UTF-8 mapping to disk.
 */
public class BatchProviderDumpAuthorNames extends BatchProvider {

  public String sourceField = "author";
  public String analyzerField = "author_collector";

  public void run(SolrQueryRequest req, BatchHandlerRequestQueue queue) throws Exception {
    SolrParams params = req.getParams();
    String jobid = params.get("jobid");
    String workDir = params.get("#workdir");

    SolrCore core = req.getCore();
    BatchHandlerRequestData data = queue.pop();

    SchemaField field = core.getLatestSchema().getFieldOrNull(sourceField);
    if (field == null || !field.stored()) {
      data.setMsg("We cannot dump fields that are not stored: " + sourceField);
      queue.registerFailedBatch(this, data);
      return;
    }

    final Analyzer analyzer = core.getLatestSchema().getQueryAnalyzer();
    SchemaField tField = core.getLatestSchema().getFieldOrNull(analyzerField);
    if (tField == null) {
      data.setMsg("We cannot find the analyzer for: " + analyzerField);
      queue.registerFailedBatch(this, data);
      return;
    }
    final String targetAnalyzer = analyzerField;

    SolrIndexSearcher se = req.getSearcher();
    final HashSet<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(sourceField);

    File jobFile = new File(workDir + "/" + jobid);

    final WriteableSynonymMap synMap = createSynonymMap();
    synMap.setOutput(jobFile.getAbsolutePath());

    se.search(new MatchAllDocsQuery(), new SimpleCollector() {

      private LeafReader reader;
      private Set<String> tokenBuffer = new LinkedHashSet<String>();
      private String authorInput;

      @Override
      protected void doSetNextReader(LeafReaderContext context) throws IOException {
        reader = context.reader();
      }

      @Override
      public void collect(int doc) {
        try {
          Document d = reader.document(doc, fieldsToLoad);
          for (String f : fieldsToLoad) {
            for (String s : d.getValues(f)) {
              try (TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s))) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
                ts.reset();
                // The analyzer chain emits the original author name as a
                // token of type AUTHOR_INPUT, followed by its transliterated
                // variants; each new AUTHOR_INPUT token closes the group
                // collected so far.
                while (ts.incrementToken()) {
                  if (typeAtt.type().equals(AuthorUtils.AUTHOR_INPUT)) {
                    addTokensToSynMap();
                    authorInput = termAtt.toString();
                  } else {
                    tokenBuffer.add(termAtt.toString());
                  }
                }
                // flush the group belonging to the last AUTHOR_INPUT token
                addTokensToSynMap();
                ts.end();
              }
            }
          }
        } catch (IOException e) {
          // skip documents that cannot be read
        }
      }

      private void addTokensToSynMap() {
        if (tokenBuffer.size() > 0) {
          // ignore obvious mistakes: very short inputs and names of more
          // than five parts are almost certainly not real author names
          if (authorInput != null && authorInput.length() >= 4
              && authorInput.split(" ").length <= 5) {
            synMap.add(authorInput, tokenBuffer);
          }
          tokenBuffer.clear();
          authorInput = null;
        }
      }

      @Override
      public boolean needsScores() {
        return false;
      }
    });

    synMap.persist();
  }

  @Override
  public String getDescription() {
    return "Collects author names and saves them to disk as a synonym file";
  }

  private WriteableSynonymMap createSynonymMap() {
    return new WriteableExplicitSynonymMap() { // no configuration allowed!
      /*
       * This synonym map has the ASCII forms as keys and the upgraded
       * UTF-8 forms as values; when it is persisted, it generates the
       * variations of the names. I.e.
       *
       *   MÜLLER, BILL
       *
       * will become
       *
       *   MULLER, BILL=>MÜLLER, BILL
       *   MUELLER, BILL=>MÜLLER, BILL
       *   MULLER, B=>MÜLLER, B
       *   MUELLER, B=>MÜLLER, B
       *   MULLER,=>MÜLLER,
       *
       * (non-Javadoc)
       * @see org.apache.solr.analysis.WriteableExplicitSynonymMap#add(java.lang.String, java.util.Set)
       */
      @Override
      public void populateMap(List<String> rules) {
        HashSet<String> hs = new HashSet<String>();
        for (String rule : rules) {
          List<String> mapping = StrUtils.splitSmart(rule, "=>", false);
          if (mapping.size() != 2) {
            log.error("Invalid synonym rule: " + rule);
            continue;
          }
          String key = mapping.get(0).trim().replace("\\,", ",").replace("\\ ", " ");
          hs.clear();
          hs.add(key);
          for (String val : splitValues(mapping.get(1))) {
            add(val, hs);
          }
        }
      }

      @Override
      public void add(String origName, Set<String> values) {
        // origName = the original author input (possibly with UTF-8 characters)
        // values = set of transliterated values
        // Reuse the first existing master set we find, so that all
        // transliterated variants of a name share one set of originals.
        Set<String> masterSet = null;
        for (String key : values) {
          if (containsKey(key)) {
            masterSet = get(key);
            break;
          }
        }
        if (masterSet == null) {
          masterSet = new LinkedHashSet<String>();
        }
        masterSet.add(origName);
        for (String key : values) {
          put(key, masterSet);
        }
      }

      @Override
      public String formatEntry(String key, Set<String> values) {
        List<String> rows = new ArrayList<String>();

        // remove all but the first comma:
        // "agusan, Adrian, , Dr" -> "agusan, Adrian Dr"
        key = key.replaceAll("\\G((?!^).*?|[^,]*,.*?),", "$1");
        key = AuthorUtils.normalizeAuthor(key);

        String[] nameParts = key.split(" ");
        if (nameParts.length > 1) {
          String[][] otherNames = new String[values.size()][];
          int n = 0;
          for (String name : values) {
            // remove all but the first comma here as well
            name = name.replaceAll("\\G((?!^).*?|[^,]*,.*?),", "$1");
            name = AuthorUtils.normalizeAuthor(name);
            otherNames[n++] = name.split(" ");
          }
          // Emit one row per prefix length, then shorten the name parts to
          // initials and repeat until nothing can be shortened any further.
          int cycle = 0;
          do {
            for (n = 0; n < nameParts.length; n++) {
              if (cycle > 0 && n == 0) continue; // the surname alone never changes
              StringBuilder out = new StringBuilder();
              out.append(join(nameParts, n));
              out.append("=>");
              boolean notFirst = false;
              for (String[] other : otherNames) {
                if (notFirst) out.append(";");
                out.append(join(other, n));
                notFirst = true;
              }
              rows.add(out.toString());
            }
            cycle++;
          } while (shortened(nameParts, otherNames));
        }

        // clean up entries; keep only those that contain a non-ASCII character
        StringBuilder toReturn = new StringBuilder();
        for (String row : rows) {
          if (hasNonAscii(row)) {
            toReturn.append(row);
            toReturn.append("\n");
          }
        }
        return toReturn.toString();
      }

      private boolean hasNonAscii(String s) {
        for (char c : s.toCharArray()) {
          if ((int) c > 127) { // anything above the ASCII range
            return true;
          }
        }
        return false;
      }
      private String join(String[] name, int v) {
        StringBuilder out = new StringBuilder();
        boolean notFirst = false;
        for (int i = 0; i <= v; i++) {
          if (notFirst) out.append(" ");
          out.append(name[i]);
          notFirst = true;
        }
        return out.toString();
      }

      private boolean shortened(String[] nameParts, String[][] otherNames) {
        // Shorten the last name part that is still longer than one character
        // to its initial, doing the same to every synonym name in parallel.
        for (int i = nameParts.length - 1; i > 0; i--) {
          if (nameParts[i].length() > 1) {
            nameParts[i] = nameParts[i].substring(0, 1);
            for (String[] other : otherNames) {
              // this may happen if synonyms map the name to a shorter
              // version; my solution is to stop processing (cheap?)
              if (i >= other.length || other[i].length() < 2) return false;
              other[i] = other[i].substring(0, 1);
            }
            return true;
          }
        }
        return false;
      }
    };
  }
}
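
// A minimal, standalone sketch of the comma-collapsing regex used in
// formatEntry() above, for readers puzzled by the "\\G" construct. The class
// name and the sample input are hypothetical, not part of the handler.
// \G anchors each match at the end of the previous one: the first match uses
// the "[^,]*,.*?" branch, so the first comma lands inside group 1 and
// survives, while every later match starts past it and drops one comma.
class CommaCollapseDemo {
  public static void main(String[] args) {
    String key = "agusan, Adrian, , Dr";
    String collapsed = key.replaceAll("\\G((?!^).*?|[^,]*,.*?),", "$1");
    // prints "agusan, Adrian  Dr"; the handler then applies
    // AuthorUtils.normalizeAuthor, which per the comment in formatEntry
    // is expected to yield "agusan, Adrian Dr"
    System.out.println(collapsed);
  }
}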