SolrSynonymParser.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Parser for the Solr synonyms format.
 * <ol>
 *   <li> Blank lines and lines starting with '#' are comments.
 *   <li> Explicit mappings match any token sequence on the LHS of "=>"
 *        and replace with all alternatives on the RHS.  These types of mappings
 *        ignore the expand parameter in the constructor.
 *        Example:
 *        <blockquote>i-pod, i pod => ipod</blockquote>
 *   <li> Equivalent synonyms may be separated with commas and give
 *        no explicit mapping.  In this case the mapping behavior will
 *        be taken from the expand parameter in the constructor.  This allows
 *        the same synonym file to be used in different synonym handling strategies.
 *        Example:
 *        <blockquote>ipod, i-pod, i pod</blockquote>
 * 
 *   <li> Multiple synonym mapping entries are merged.
 *        Example:
 *        <blockquote>
 *         foo => foo bar<br>
 *         foo => baz<br><br>
 *         is equivalent to<br><br>
 *         foo => foo bar, baz
 *        </blockquote>
 *  </ol>
 * @lucene.experimental
 */
public class SolrSynonymParser extends SynonymMap.Parser {
  /** Token that separates the input side from the output side of an explicit mapping. */
  private static final String MAPPING_SEPARATOR = "=>";

  /** Token that separates the alternative terms listed on one side of a rule. */
  private static final String TERM_SEPARATOR = ",";

  /** Selects how rules without an explicit "=&gt;" are turned into mappings; see {@link #addInternal}. */
  private final boolean expand;

  /**
   * Creates a parser for the Solr synonyms format.
   *
   * @param dedup passed unchanged to the {@link SynonymMap.Parser} constructor
   * @param expand if {@code true}, a comma-separated rule without "=&gt;" maps every term to
   *        every other term; if {@code false}, every term of such a rule maps to the first term
   * @param analyzer passed unchanged to the {@link SynonymMap.Parser} constructor
   */
  public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  /**
   * Reads synonym rules from {@code in}, one rule per line, and registers them.
   * The reader is always closed before this method returns or throws.
   *
   * @param in source of the synonym rules
   * @throws IOException if reading or closing the input fails
   * @throws ParseException if a rule is rejected with an {@link IllegalArgumentException};
   *         the message carries the reader's current line number and the original exception
   *         is attached as the cause
   */
  @Override
  public void parse(Reader in) throws IOException, ParseException {
    // try-with-resources: if close() fails while a ParseException is propagating, the close
    // failure is recorded as a suppressed exception instead of replacing the parse error.
    try (LineNumberReader br = new LineNumberReader(in)) {
      // The inner try keeps 'br' in scope so the handler can still ask for the line number.
      try {
        addInternal(br);
      } catch (IllegalArgumentException e) {
        ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
        ex.initCause(e);
        throw ex;
      }
    }
  }

  /**
   * Consumes {@code in} line by line and adds the mappings described by each rule.
   * Empty lines and lines whose first character is '#' are skipped.
   *
   * @param in line source; read until {@code readLine()} returns {@code null}
   * @throws IOException if reading a line or analyzing a term fails
   * @throws IllegalArgumentException if a line contains more than one "=&gt;" section
   */
  private void addInternal(BufferedReader in) throws IOException {
    String line;
    while ((line = in.readLine()) != null) {
      if (line.isEmpty() || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }

      // TODO: we could process this more efficiently.
      String[] sides = split(line, MAPPING_SEPARATOR);
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
        }
        CharsRef[] inputs = analyzeAll(split(sides[0], TERM_SEPARATOR));
        CharsRef[] outputs = analyzeAll(split(sides[1], TERM_SEPARATOR));
        // these mappings are explicit and never preserve original
        for (CharsRef input : inputs) {
          for (CharsRef output : outputs) {
            add(input, output, false);
          }
        }
      } else {
        CharsRef[] inputs = analyzeAll(split(line, TERM_SEPARATOR));
        if (expand) {
          // all pairs
          for (int i = 0; i < inputs.length; i++) {
            for (int j = 0; j < inputs.length; j++) {
              if (i != j) {
                add(inputs[i], inputs[j], true);
              }
            }
          }
        } else {
          // all subsequent inputs map to first one; we also add inputs[0] here
          // so that we "effectively" (because we remove the original input and
          // add back a synonym with the same text) change that token's type to
          // SYNONYM (matching legacy behavior):
          for (CharsRef input : inputs) {
            add(input, inputs[0], false);
          }
        }
      }
    }
  }

  /**
   * Unescapes, trims and analyzes every raw term, preserving order.
   *
   * @param rawTerms terms as produced by {@link #split(String, String)}, still escaped
   * @return one analyzed term per element of {@code rawTerms}
   * @throws IOException if the inherited {@code analyze} call fails
   */
  private CharsRef[] analyzeAll(String[] rawTerms) throws IOException {
    CharsRef[] analyzed = new CharsRef[rawTerms.length];
    for (int i = 0; i < rawTerms.length; i++) {
      // A fresh builder per term: each returned CharsRef must keep its own content.
      analyzed[i] = analyze(unescape(rawTerms[i]).trim(), new CharsRefBuilder());
    }
    return analyzed;
  }

  /**
   * Splits {@code s} at every unescaped occurrence of {@code separator}.
   * A backslash and the character following it are copied to the current piece as-is,
   * so an escaped character can never begin a separator match. Pieces that would be
   * empty are dropped, which means leading, trailing and doubled separators yield nothing.
   *
   * @param s text to split
   * @param separator literal separator to split on
   * @return the non-empty pieces in order of appearance, with escapes still in place
   */
  private static String[] split(String s, String separator) {
    ArrayList<String> pieces = new ArrayList<>(2);
    StringBuilder current = new StringBuilder();
    int pos = 0;
    final int end = s.length();
    while (pos < end) {
      if (s.startsWith(separator, pos)) {
        if (current.length() > 0) {
          pieces.add(current.toString());
          current.setLength(0);
        }
        pos += separator.length();
        continue;
      }

      char ch = s.charAt(pos++);
      if (ch == '\\') {
        current.append(ch);
        if (pos >= end) {
          break; // trailing lone backslash: keep it and stop
        }
        ch = s.charAt(pos++);
      }

      current.append(ch);
    }

    if (current.length() > 0) {
      pieces.add(current.toString());
    }

    return pieces.toArray(new String[0]);
  }

  /**
   * Removes one level of backslash escaping from {@code s}: each backslash that is followed
   * by another character is dropped and that following character is kept verbatim.
   * A backslash that is the very last character is kept.
   *
   * @param s possibly escaped text
   * @return {@code s} itself when it contains no backslash, otherwise the unescaped copy
   */
  private static String unescape(String s) {
    if (s.indexOf('\\') < 0) {
      return s;
    }
    final int lastIndex = s.length() - 1;
    StringBuilder sb = new StringBuilder(s.length());
    for (int i = 0; i <= lastIndex; i++) {
      char ch = s.charAt(i);
      if (ch == '\\' && i < lastIndex) {
        sb.append(s.charAt(++i));
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }
}