package org.apache.solr.analysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A synonym map that can be modified at runtime and persisted back to disk.
 * Subclasses define how rules are parsed into the map and how entries are
 * formatted when the map is written out.
 */
public abstract class WriteableSynonymMap {

  public static final Logger log = LoggerFactory.getLogger(WriteableSynonymMap.class);

  public abstract void add(String key, Set<String> values);

  public abstract void populateMap(List<String> rules);

  public abstract String formatEntry(String key, Set<String> values);

  protected Map<String, Set<String>> map;
  protected Map<String, Set<String>> regexMap;
  protected int numUpdates = 0;
  protected String outFile = null;

  public WriteableSynonymMap() {
    this.map = Collections.synchronizedMap(new HashMap<String, Set<String>>());
    this.regexMap = Collections.synchronizedMap(new HashMap<String, Set<String>>());
  }

  public void clear() {
    // Keep the maps synchronized after clearing; the previous code replaced
    // them with plain HashMaps and silently lost thread safety.
    this.map = Collections.synchronizedMap(new HashMap<String, Set<String>>());
    this.regexMap = Collections.synchronizedMap(new HashMap<String, Set<String>>());
  }

  public void setOutput(String out) {
    this.outFile = out;
  }

  public Set<String> get(String k) {
    return this.map.get(k);
  }

  public void put(String k, Set<String> v) {
    // Updates are only accepted once an output file has been configured,
    // otherwise they could never be persisted.
    if (outFile == null) return;
    numUpdates++;
    this.map.put(k, v);
  }

  /*
   * Tries hard to find all keys that match the given pattern. The first call
   * is expensive; subsequent calls for the same pattern are relatively fast
   * because the matched keys are cached in regexMap. However, avoid this
   * method if you can!
   */
  public Set<String> get(Pattern p) {
    Set<String> synonyms = new HashSet<String>();
    String pKey = p.toString();
    if (regexMap.containsKey(pKey)) {
      if (regexMap.get(pKey) != null) {
        for (String k : regexMap.get(pKey)) {
          synonyms.addAll(this.map.get(k));
          synonyms.add(k);
        }
        return synonyms;
      }
      return null;
    }
    Set<String> matchedKeys = new HashSet<String>();
    for (String mapKey : this.map.keySet()) {
      Matcher m = p.matcher(mapKey);
      if (m.matches()) {
        matchedKeys.add(mapKey);
        synonyms.addAll(this.map.get(mapKey));
        synonyms.add(mapKey);
      }
    }
    if (synonyms.size() > 0) {
      regexMap.put(pKey, matchedKeys);
      return synonyms;
    } else {
      // Cache the miss so the full scan is not repeated for this pattern.
      regexMap.put(pKey, null);
      return null;
    }
  }

  public boolean containsKey(String key) {
    return this.map.containsKey(key);
  }

  public boolean isDirty() {
    return numUpdates > 0;
  }

  public boolean persist() throws IOException {
    return persist(false);
  }

  public boolean persist(boolean append) throws IOException {
    Writer writer = getWriter(append);
    if (writer == null) {
      //log.error("Cannot write synonyms, writer object is null.");
      return false;
    }
    try {
      writeSynonyms(map, writer);
      numUpdates = 0;
    } finally {
      writer.close();
    }
    return true;
  }

  private Writer getWriter(boolean append) {
    if (outFile == null) return null;
    log.info("Creating new Writer for " + outFile);
    Writer w;
    Charset UTF_8 = Charset.forName("UTF-8");
    try {
      w = new OutputStreamWriter(new FileOutputStream(this.outFile, append), UTF_8);
      //w = new BufferedWriter(w);
    } catch (FileNotFoundException e) {
      throw new RuntimeException(e);
    }
    return w;
  }

  public void writeSynonyms(Map<String, Set<String>> map, Writer writer) {
    StringBuffer out = new StringBuffer();
    int max = 1000;
    int i = 0;
    for (Map.Entry<String, Set<String>> entry : map.entrySet()) {
      out.append(formatEntry(entry.getKey(), entry.getValue()));
      i++;
      if (i > max) {
        // Write in chunks so very large maps do not build one huge string.
        i = 0;
        write(writer, out.toString(), false);
        out = new StringBuffer();
      }
    }
    write(writer, out.toString(), true);
  }

  private void write(Writer writer, String out, boolean flush) {
    try {
      synchronized (writer) {
        writer.write(out);
        if (flush) writer.flush();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void finalize() {
    // Best-effort: persist any unsaved updates before the map is garbage collected.
    if (isDirty()) {
      try {
        this.persist();
      } catch (IOException e) {
        log.error(e.getLocalizedMessage());
      }
    }
  }

  public static Set<String> splitValues(String synonyms) {
    Set<String> list = new LinkedHashSet<String>(); // preserve order
    for (String s : StrUtils.splitSmart(synonyms, ",", false)) {
      list.add(s.trim().replace("\\,", ",").replace("\\ ", " "));
    }
    return list;
  }

  /**
   * This is just a helper method; you should be using SolrResourceLoader#getLines()
   * instead.
   *
   * @param inputFile file path
   */
  public List<String> getLines(String inputFile) throws IOException {
    ArrayList<String> lines;
    BufferedReader input = null;
    try {
      input = new BufferedReader(new InputStreamReader(
          new FileInputStream(new File(inputFile)), Charset.forName("UTF-8")));
      lines = new ArrayList<String>();
      for (String word = null; (word = input.readLine()) != null;) {
        // skip initial bom marker
        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
          word = word.substring(1);
        // skip comments
        if (word.startsWith("#")) continue;
        word = word.trim();
        // skip blank lines
        if (word.length() == 0) continue;
        lines.add(word);
      }
    } catch (CharacterCodingException ex) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error loading resource (wrong encoding?): " + inputFile, ex);
    } finally {
      if (input != null) input.close();
    }
    return lines;
  }
}
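
/*
 * Hypothetical usage sketch (not part of the original class): a minimal
 * package-private subclass, assuming a simple "key=>synonym1,synonym2" rule
 * format. It only illustrates how add(), populateMap() and formatEntry() are
 * expected to cooperate with put() and persist(); the class name, the rule
 * format and the output path below are illustrative assumptions, not part of
 * Solr's shipped factories.
 */
class ExampleWriteableSynonymMap extends WriteableSynonymMap {

  @Override
  public void add(String key, Set<String> values) {
    Set<String> existing = get(key);
    if (existing == null) {
      put(key, values);
    } else {
      existing.addAll(values);
      put(key, existing);
    }
  }

  @Override
  public void populateMap(List<String> rules) {
    for (String rule : rules) {
      String[] sides = rule.split("=>", 2);
      if (sides.length != 2) continue; // ignore malformed rules in this sketch
      add(sides[0].trim(), splitValues(sides[1]));
    }
  }

  @Override
  public String formatEntry(String key, Set<String> values) {
    StringBuilder sb = new StringBuilder(key).append("=>");
    boolean first = true;
    for (String v : values) {
      if (!first) sb.append(",");
      sb.append(v);
      first = false;
    }
    return sb.append("\n").toString();
  }

  public static void main(String[] args) throws IOException {
    ExampleWriteableSynonymMap m = new ExampleWriteableSynonymMap();
    m.setOutput("example-synonyms.txt"); // hypothetical output path; put() drops updates until this is set
    m.populateMap(java.util.Arrays.asList("gb=>gib,gigabyte"));
    m.persist(); // writes "gb=>gib,gigabyte" to the output file
  }
}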