// AuthorShortNameUpgradeFilterFactory.java example
//
// Explorer
// montysolr-master
// - contrib
package org.apache.solr.analysis.author;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.PersistingMapTokenFilterFactory;
import org.apache.solr.common.util.StrUtils;
import org.apache.lucene.analysis.synonym.NewSolrSynonymParser;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory.SynonymParser;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.util.CharsRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;



/**
 * This is a trickster class - it modifies the synonym input on the fly, so that
 * we don't need to bother with producing the expanded (multiplied-out) data from the
 * author synonyms. But obviously, this could introduce some bugs...
 */
public class AuthorShortNameUpgradeFilterFactory extends PersistingMapTokenFilterFactory implements ResourceLoaderAware {

  /**
   * Creates the factory, handing the configuration straight to the parent class.
   *
   * @param args factory arguments, passed unchanged to the superclass constructor
   */
  public AuthorShortNameUpgradeFilterFactory(Map<String,String> args) {
    super(args);
  }

  /** Logger bound to {@link AuthorShortNameUpgradeFilterFactory}. */
  public static final Logger log = LoggerFactory.getLogger(AuthorShortNameUpgradeFilterFactory.class);

  
  /*
   * If we were insane (and we apparently are!) we would want to generate all combinations
   * of the name, ie.
   * 
   *  Surname, One Two Three; Foo, Bar Baz
   *  
   * Results in:
   *  
   *  Surname, One T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O Two T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O T Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, One T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, One Two T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O Two Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, One T Three => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  
   *  PLUS!
   *  
   *  Surname, One T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O Two => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, One => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname,  => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  
   *  and this happens for every name in the list!!!
   *  
   *  OK, I honestly think this is too many combinations and most of them are going to be
   *  useless, especially because these combinations ARE ALREADY generated during the
   *  search for synonyms (but NOT used in the query if there is no EXACT match!) 
   *  
   *  I secretly hope, that if a user typed "Foo, Boo Boz" she REALLY doesn't want
   *  to get "Foo, Baz Bar" -- and this is exactly what they would get IFF this logic is
   *  implemented - ie. if the synonym file contained another line with:
   *  
   *  "Foo, Boo Boz; Surname, One Boo"
   *  
   *  The combination "Foo, B B" will cause all the patterns of these two lines to be merged
   *  with "Foo, Bar Baz", and FALSE HITS are returned. I don't like this idea and I 
   *  think it is wrong. 
   *  
   *  So, I am going to implement a middle path that creates ONLY the 
   *  initial and surname variations. If we discover that this behaviour is in fact not
   *  desirable, I can change it.
   *  
   *  So, to summarize, I want to generate only these patterns for now:
   *  
   *  Surname, => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  Surname, O T T => Surname, One Two Three; Surname, O T T; Foo, Bar Baz; Foo, B B
   *  
   */
  /**
   * Builder factory whose parser rewrites the synonym input on the fly: for
   * every rule that contains long-form names it emits explicit mappings from
   * the progressively longer initials-only variants of those names to the
   * whole synonym group (each name plus its fully abbreviated form).
   *
   * <p>Names inside a rule are separated by {@code ','} unless the
   * {@code format} argument equals {@code "semicolon"}, in which case the
   * separator is {@code ';'}. The generated rules are always comma separated,
   * because they are handed to the parent {@link NewSolrSynonymParser}.
   */
  public static class MakeAllShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory {

    /**
     * @param args builder arguments, passed unchanged to the superclass; the
     *             {@code format} entry is consulted by {@link #getParser(Analyzer)}
     */
    public MakeAllShortNames(Map<String,String> args) {
      super(args);
    }

    /**
     * Builds the parser that generates the short-name rules.
     *
     * @param analyzer analyzer handed to the {@link NewSolrSynonymParser} constructor
     * @return a parser that feeds only the generated rules to its parent implementation
     */
    protected SynonymParser getParser(Analyzer analyzer) {
      char sep = ',';
      // constant-first comparison: a "format" key with a null value no longer throws
      if ("semicolon".equals(args.get("format"))) {
        sep = ';';
      }

      final Character charSeparator = sep;

      return new NewSolrSynonymParser(true, true, analyzer) {

        /**
         * Reads the synonym rules from {@code in}, generates the short-name
         * mappings and passes only the generated rules to the parent parser.
         * Empty lines and lines starting with {@code #} are skipped.
         *
         * @param in source of the synonym rules; closed before the generated
         *           rules are parsed
         * @throws IOException if reading fails
         * @throws ParseException if an {@link IllegalArgumentException} is raised
         *         while a line is processed (or propagated from the parent parser)
         */
        public void add(Reader in) throws IOException, ParseException {
          LineNumberReader br = new LineNumberReader(in);
          StringBuilder newBr = new StringBuilder();
          String line;
          String[] parts;
          // short forms already emitted for the current input line
          HashSet<String> seen = new HashSet<String>();

          try {
            while ((line = br.readLine()) != null) {
              if (line.length() == 0 || line.charAt(0) == '#') {
                continue; // ignore empty lines and comments
              }
              seen.clear();

              String[] sides = line.split("=>");
              if (sides.length > 1) { // explicit mapping
                String[] names = getNames(sides[1]);
                parts = AuthorUtils.splitName(sides[0]);
                if (isLongForm(parts) && containsLongForm(names) > 0) {
                  String rightSide = sides[0] + "," + buildLine(names);
                  for (String shortForm : getAllShortForms(parts)) {
                    if (!seen.add(shortForm)) {
                      continue; // this variant was already written for the line
                    }
                    newBr.append(escape(shortForm)).append("=>").append(rightSide).append("\n");
                  }
                }
              }
              else { // equivalence group
                String[] names = getNames(sides[0]);
                if (containsLongForm(names) > 1) {
                  String newLine = buildLine(names);
                  for (int i = 0; i < names.length; i++) {
                    parts = AuthorUtils.splitName(names[i]);
                    if (!isLongForm(parts)) {
                      continue;
                    }
                    for (String shortForm : getAllShortForms(parts)) {
                      if (!seen.add(shortForm)) {
                        continue;
                      }
                      newBr.append(escape(shortForm)).append("=>").append(newLine).append("\n");
                    }
                  }
                }
              }
            }
          } catch (IllegalArgumentException e) {
            ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
            ex.initCause(e);
            throw ex;
          } finally {
            br.close();
          }

          // pass the modified synonyms to the builder to create a synonym map
          Charset utf8 = Charset.forName("UTF-8");
          super.add(new InputStreamReader(new ByteArrayInputStream(newBr.toString().getBytes(utf8)),
              utf8.newDecoder()));
        }

        /**
         * Produces one variant per element of {@code parts}: the first element
         * alone, then the first element followed by the initial of the second,
         * then by the initials of the second and third, and so on.
         * For parts such as {"Surname,", "One", "Two"} (assuming that is how
         * AuthorUtils.splitName splits - TODO confirm) the result is
         * "Surname,", "Surname, O", "Surname, O T".
         */
        private String[] getAllShortForms(String[] parts) {
          String[] names = new String[parts.length];
          for (int i = 0; i < parts.length; i++) {
            StringBuilder out = new StringBuilder();
            out.append(parts[0]);
            for (int j = 1; j <= i; j++) {
              out.append(" ");
              // initial of the j-th part; indexing with i repeated one initial i times
              out.append(parts[j].substring(0, 1));
            }
            names[i] = out.toString();
          }
          return names;
        }

        /** Forwards to the parent, always with {@code includeOrig} set to true. */
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          super.add(input, output, true);
        }

        /** Splits {@code vals} on the configured separator and unescapes every item. */
        private String[] getNames(String vals) {
          List<String> nn = StrUtils.splitSmart(vals, charSeparator);
          String[] names = new String[nn.size()];
          int j = 0;
          for (String n : nn) {
            names[j++] = unescape(n);
          }
          return names;
        }

        /**
         * Joins the names, plus the fully abbreviated form of every long name,
         * into one comma separated, escaped list without duplicates.
         * Because we subclass the Solr synonym parser, the output must be in
         * Solr format.
         */
        private String buildLine(String[] names) {
          HashSet<String> set = new HashSet<String>();
          for (String name : names) {
            String[] p = AuthorUtils.splitName(name);
            if (isLongForm(p)) {
              set.add(makeShortForm(p));
            }
            set.add(name);
          }

          StringBuilder out = new StringBuilder();
          boolean notFirst = false;
          for (String name : set) {
            if (notFirst) out.append(",");
            out.append(escape(name));
            notFirst = true;
          }
          return out.toString();
        }

        /** Reverses the escaping of spaces and of the configured separator. */
        private String unescape(String s) {
          // the escaped separator is backslash + separator (previously searched for "\," + separator)
          return s.replace("\\ ", " ").replace("\\" + charSeparator, charSeparator.toString());
        }

        /**
         * Backslash-escapes spaces and commas. Because we subclass the Solr
         * synonym parser, the output must be in Solr format.
         */
        private String escape(String s) {
          return s.replace(" ", "\\ ").replace(",", "\\,");
        }

        /** Returns the first part followed by the initial of every other part. */
        private String makeShortForm(String[] parts) {
          StringBuilder out = new StringBuilder();
          out.append(parts[0]);
          for (int i = 1; i < parts.length; i++) {
            out.append(" ");
            out.append(parts[i].substring(0, 1));
          }
          return out.toString();
        }

        /** A name counts as long form here as soon as it has more than one part. */
        private boolean isLongForm(String[] parts) {
          return parts.length > 1;
        }

        /** Counts the names for which {@link #isLongForm(String[])} is true. */
        private int containsLongForm(String[] names) {
          int count = 0;
          for (String name : names) {
            if (isLongForm(AuthorUtils.splitName(name))) {
              count++;
            }
          }
          return count;
        }
      };
    }

  }
  
  /*
   * The following class will change (on-the-fly) the synonym file, for every
   * rule which contains a fullname, ie. 
   * 
   *    Surname, Name; Foo, Bar
   *    
   * it will produce new mappings of the form
   * 
   *    Surname, N => Surname, Name; Foo, B; Foo, Bar
   *    Foo, B => Foo, Bar; Surname, Name; Surname, N
   *    
   * 
   *    
   * This class was my first attempt at the synonym upgrade; however,
   * Alberto wants all combinations of names to be upgraded, i.e.
   *    
   *    Surname, => Surname, Name; Foo, B; Foo, Bar
   *    Surname, N => Surname, Name; Foo, B; Foo, Bar
   *    Foo, => Foo, Bar; Surname, Name; Surname, N
   *    Foo, B => Foo, Bar; Surname, Name; Surname, N
   * 
   */
  /**
   * Builder factory whose parser rewrites the synonym input on the fly: for
   * every long-form name in a rule it emits one explicit mapping from the
   * fully abbreviated form of that name (first part plus the initials of the
   * remaining parts) to the whole synonym group.
   *
   * <p>Names inside a rule are separated by {@code ','} unless the
   * {@code format} argument equals {@code "semicolon"}, in which case the
   * separator is {@code ';'}. The generated rules are always comma separated,
   * because they are handed to the parent {@link NewSolrSynonymParser}.
   */
  public static class MakeShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory {

    /**
     * @param args builder arguments, passed unchanged to the superclass; the
     *             {@code format} entry is consulted by {@link #getParser(Analyzer)}
     */
    public MakeShortNames(Map<String,String> args) {
      super(args);
    }

    /**
     * Builds the parser that generates the short-name rules.
     *
     * @param analyzer analyzer handed to the {@link NewSolrSynonymParser} constructor
     * @return a parser that feeds only the generated rules to its parent implementation
     */
    protected SynonymParser getParser(Analyzer analyzer) {
      char sep = ',';
      // constant-first comparison: a "format" key with a null value no longer throws
      if ("semicolon".equals(args.get("format"))) {
        sep = ';';
      }

      final Character charSeparator = sep;

      return new NewSolrSynonymParser(true, true, analyzer) {

        /**
         * Reads the synonym rules from {@code in}, generates the short-name
         * mappings and passes only the generated rules to the parent parser.
         * Empty lines and lines starting with {@code #} are skipped.
         *
         * @param in source of the synonym rules; closed before the generated
         *           rules are parsed
         * @throws IOException if reading fails
         * @throws ParseException if an {@link IllegalArgumentException} is raised
         *         while a line is processed (or propagated from the parent parser)
         */
        public void add(Reader in) throws IOException, ParseException {
          LineNumberReader br = new LineNumberReader(in);
          StringBuilder newBr = new StringBuilder();
          String line;
          String[] parts;

          try {
            while ((line = br.readLine()) != null) {
              if (line.length() == 0 || line.charAt(0) == '#') {
                continue; // ignore empty lines and comments
              }
              String[] sides = line.split("=>");
              if (sides.length > 1) { // explicit mapping
                String[] names = getNames(sides[1]);
                parts = AuthorUtils.splitName(sides[0]);
                if (isLongForm(parts) && containsLongForm(names) > 0) {
                  newBr.append(escape(makeShortForm(parts))).append("=>")
                      .append(sides[0]).append(",")
                      .append(buildLine(names)).append("\n");
                }
              }
              else { // equivalence group
                String[] names = getNames(sides[0]);
                if (containsLongForm(names) > 1) {
                  String newLine = buildLine(names);
                  for (int i = 0; i < names.length; i++) {
                    // iterate over the names; sides has a single element in this
                    // branch, so indexing it with i overflowed on the second name
                    parts = AuthorUtils.splitName(names[i]);
                    if (isLongForm(parts)) {
                      newBr.append(escape(makeShortForm(parts))).append("=>")
                          .append(newLine).append("\n");
                    }
                  }
                }
              }
            }
          } catch (IllegalArgumentException e) {
            ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
            ex.initCause(e);
            throw ex;
          } finally {
            br.close();
          }

          // pass the modified synonyms to the builder to create a synonym map;
          // encode with the same charset the reader decodes with, not the platform default
          Charset utf8 = Charset.forName("UTF-8");
          super.add(new InputStreamReader(new ByteArrayInputStream(newBr.toString().getBytes(utf8)),
              utf8.newDecoder()));
        }

        /** Forwards to the parent, always with {@code includeOrig} set to true. */
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          super.add(input, output, true);
        }

        /** Splits {@code vals} on the configured separator and unescapes every item. */
        private String[] getNames(String vals) {
          List<String> nn = StrUtils.splitSmart(vals, charSeparator);
          String[] names = new String[nn.size()];
          int j = 0;
          for (String n : nn) {
            names[j++] = unescape(n);
          }
          return names;
        }

        /**
         * Joins the names, plus the fully abbreviated form of every long name,
         * into one comma separated, escaped list without duplicates.
         */
        private String buildLine(String[] names) {
          HashSet<String> set = new HashSet<String>();
          for (String name : names) {
            String[] p = AuthorUtils.splitName(name);
            if (isLongForm(p)) {
              set.add(makeShortForm(p));
            }
            set.add(name);
          }

          StringBuilder out = new StringBuilder();
          boolean notFirst = false;
          for (String name : set) {
            if (notFirst) out.append(",");
            out.append(escape(name));
            notFirst = true;
          }
          return out.toString();
        }

        /** Reverses the escaping of spaces and of the configured separator. */
        private String unescape(String s) {
          return s.replace("\\ ", " ").replace("\\" + charSeparator, charSeparator.toString());
        }

        /** Backslash-escapes spaces and commas (the format of the generated rules). */
        private String escape(String s) {
          return s.replace(" ", "\\ ").replace(",", "\\,");
        }

        /** Returns the first part followed by the initial of every other part. */
        private String makeShortForm(String[] parts) {
          StringBuilder out = new StringBuilder();
          out.append(parts[0]);
          for (int i = 1; i < parts.length; i++) {
            out.append(" ");
            out.append(parts[i].substring(0, 1));
          }
          return out.toString();
        }

        /**
         * A name counts as long form here when any part after the first is
         * longer than one character.
         */
        private boolean isLongForm(String[] parts) {
          for (int i = 1; i < parts.length; i++) {
            if (parts[i].length() > 1) {
              return true;
            }
          }
          return false;
        }

        /** Counts the names for which {@link #isLongForm(String[])} is true. */
        private int containsLongForm(String[] names) {
          int count = 0;
          for (String name : names) {
            if (isLongForm(AuthorUtils.splitName(name))) {
              count++;
            }
          }
          return count;
        }
      };
    }
  }
  
  
  /**
   * Delegates resource loading entirely to the superclass.
   *
   * @param loader resource loader passed on to {@code super.inform}
   */
  @Override
  public void inform(ResourceLoader loader) {
    super.inform(loader);
  }

  /**
   * Returns the given stream untouched; no filter is wrapped around it.
   *
   * @param input the incoming token stream
   * @return {@code input} itself
   */
  @Override
  public TokenStream create(TokenStream input) {
    // this filter factory does nothing on its own, it is used by the
    // NewSynonymFilterFactory
    return input;
  }

}