package org.apache.solr.analysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A synonym map that can be modified at runtime and persisted back to disk.
 * Subclasses define how rules are parsed into the map and how entries are
 * formatted when the map is written out.
 */
public abstract class WriteableSynonymMap {

  public static final Logger log = LoggerFactory.getLogger(WriteableSynonymMap.class);

  public abstract void add(String key, Set<String> values);

  public abstract void populateMap(List<String> rules);

  public abstract String formatEntry(String key, Set<String> values);

  protected Map<String, Set<String>> map;
  protected Map<String, Set<String>> regexMap;
  protected int numUpdates = 0;
  protected String outFile = null;

  public WriteableSynonymMap() {
    this.map = Collections.synchronizedMap(new HashMap<String, Set<String>>());
    this.regexMap = Collections.synchronizedMap(new HashMap<String, Set<String>>());
  }

  public void clear() {
    // Keep the maps synchronized after clearing; the previous code replaced
    // them with plain HashMaps and silently lost thread safety.
    this.map = Collections.synchronizedMap(new HashMap<String, Set<String>>());
    this.regexMap = Collections.synchronizedMap(new HashMap<String, Set<String>>());
  }

  public void setOutput(String out) {
    this.outFile = out;
  }

  public Set<String> get(String k) {
    return this.map.get(k);
  }

  public void put(String k, Set<String> v) {
    // Updates are only accepted once an output file has been configured,
    // otherwise they could never be persisted.
    if (outFile == null) return;
    numUpdates++;
    this.map.put(k, v);
  }

  /*
   * Tries hard to find all keys that match the given pattern. The first call
   * is expensive; subsequent calls for the same pattern are relatively fast
   * because the matched keys are cached in regexMap. However, avoid this
   * method if you can!
   */
  public Set<String> get(Pattern p) {
    Set<String> synonyms = new HashSet<String>();
    String pKey = p.toString();
    if (regexMap.containsKey(pKey)) {
      if (regexMap.get(pKey) != null) {
        for (String k : regexMap.get(pKey)) {
          synonyms.addAll(this.map.get(k));
          synonyms.add(k);
        }
        return synonyms;
      }
      return null;
    }
    Set<String> matchedKeys = new HashSet<String>();
    for (String mapKey : this.map.keySet()) {
      Matcher m = p.matcher(mapKey);
      if (m.matches()) {
        matchedKeys.add(mapKey);
        synonyms.addAll(this.map.get(mapKey));
        synonyms.add(mapKey);
      }
    }
    if (synonyms.size() > 0) {
      regexMap.put(pKey, matchedKeys);
      return synonyms;
    } else {
      // Cache the miss so the full scan is not repeated for this pattern.
      regexMap.put(pKey, null);
      return null;
    }
  }

  public boolean containsKey(String key) {
    return this.map.containsKey(key);
  }

  public boolean isDirty() {
    return numUpdates > 0;
  }

  public boolean persist() throws IOException {
    return persist(false);
  }

  public boolean persist(boolean append) throws IOException {
    Writer writer = getWriter(append);
    if (writer == null) {
      //log.error("Cannot write synonyms, writer object is null.");
      return false;
    }
    try {
      writeSynonyms(map, writer);
      numUpdates = 0;
    } finally {
      writer.close();
    }
    return true;
  }

  private Writer getWriter(boolean append) {
    if (outFile == null) return null;
    log.info("Creating new Writer for " + outFile);
    Writer w;
    Charset UTF_8 = Charset.forName("UTF-8");
    try {
      w = new OutputStreamWriter(new FileOutputStream(this.outFile, append), UTF_8);
      //w = new BufferedWriter(w);
    } catch (FileNotFoundException e) {
      throw new RuntimeException(e);
    }
    return w;
  }

  public void writeSynonyms(Map<String, Set<String>> map, Writer writer) {
    StringBuffer out = new StringBuffer();
    int max = 1000;
    int i = 0;
    for (Map.Entry<String, Set<String>> entry : map.entrySet()) {
      out.append(formatEntry(entry.getKey(), entry.getValue()));
      i++;
      if (i > max) {
        // Write in chunks so very large maps do not build one huge string.
        i = 0;
        write(writer, out.toString(), false);
        out = new StringBuffer();
      }
    }
    write(writer, out.toString(), true);
  }

  private void write(Writer writer, String out, boolean flush) {
    try {
      synchronized (writer) {
        writer.write(out);
        if (flush) writer.flush();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void finalize() {
    // Best-effort: persist any unsaved updates before the map is garbage collected.
    if (isDirty()) {
      try {
        this.persist();
      } catch (IOException e) {
        log.error(e.getLocalizedMessage());
      }
    }
  }

  public static Set<String> splitValues(String synonyms) {
    Set<String> list = new LinkedHashSet<String>(); // preserve order
    for (String s : StrUtils.splitSmart(synonyms, ",", false)) {
      list.add(s.trim().replace("\\,", ",").replace("\\ ", " "));
    }
    return list;
  }

  /**
   * This is just a helper method; you should be using SolrResourceLoader#getLines()
   * instead.
   *
   * @param inputFile file path
   */
  public List<String> getLines(String inputFile) throws IOException {
    ArrayList<String> lines;
    BufferedReader input = null;
    try {
      input = new BufferedReader(new InputStreamReader(
          new FileInputStream(new File(inputFile)), Charset.forName("UTF-8")));
      lines = new ArrayList<String>();
      for (String word = null; (word = input.readLine()) != null;) {
        // skip initial bom marker
        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
          word = word.substring(1);
        // skip comments
        if (word.startsWith("#")) continue;
        word = word.trim();
        // skip blank lines
        if (word.length() == 0) continue;
        lines.add(word);
      }
    } catch (CharacterCodingException ex) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error loading resource (wrong encoding?): " + inputFile, ex);
    } finally {
      if (input != null) input.close();
    }
    return lines;
  }
}
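
/*
 * Hypothetical usage sketch (not part of the original class): a minimal
 * package-private subclass, assuming a simple "key=>synonym1,synonym2" rule
 * format. It only illustrates how add(), populateMap() and formatEntry() are
 * expected to cooperate with put() and persist(); the class name, the rule
 * format and the output path below are illustrative assumptions, not part of
 * Solr's shipped factories.
 */
class ExampleWriteableSynonymMap extends WriteableSynonymMap {

  @Override
  public void add(String key, Set<String> values) {
    Set<String> existing = get(key);
    if (existing == null) {
      put(key, values);
    } else {
      existing.addAll(values);
      put(key, existing);
    }
  }

  @Override
  public void populateMap(List<String> rules) {
    for (String rule : rules) {
      String[] sides = rule.split("=>", 2);
      if (sides.length != 2) continue; // ignore malformed rules in this sketch
      add(sides[0].trim(), splitValues(sides[1]));
    }
  }

  @Override
  public String formatEntry(String key, Set<String> values) {
    StringBuilder sb = new StringBuilder(key).append("=>");
    boolean first = true;
    for (String v : values) {
      if (!first) sb.append(",");
      sb.append(v);
      first = false;
    }
    return sb.append("\n").toString();
  }

  public static void main(String[] args) throws IOException {
    ExampleWriteableSynonymMap m = new ExampleWriteableSynonymMap();
    m.setOutput("example-synonyms.txt"); // hypothetical output path; put() drops updates until this is set
    m.populateMap(java.util.Arrays.asList("gb=>gib,gigabyte"));
    m.persist(); // writes "gb=>gib,gigabyte" to the output file
  }
}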