package org.wikibrain.phrases;

import com.typesafe.config.Config;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.utils.WpStringUtils;

import java.util.HashMap;
import java.util.Map;

/**
 * A {@link SimplePruner} over strings that merges every key sharing the same
 * normalized form (as computed by {@link WpStringUtils#normalize}) before
 * delegating to the parent pruner.
 *
 * Each group of merged keys is represented in the result by its most
 * frequent original spelling, carrying the summed count of the whole group.
 *
 * @author Shilad Sen
 */
public class NormalizedStringPruner extends SimplePruner<String> {

    /**
     * Creates a pruner; all three arguments are forwarded unchanged to the
     * {@link SimplePruner} constructor.
     *
     * @param minCount forwarded to the parent pruner
     * @param maxRank  forwarded to the parent pruner
     * @param minFrac  forwarded to the parent pruner
     */
    public NormalizedStringPruner(int minCount, int maxRank, double minFrac) {
        super(minCount, maxRank, minFrac);
    }

    /**
     * Collapses keys with identical normalized forms and prunes the result.
     *
     * @param allCounts raw counts keyed by the original (unnormalized) string
     * @return the result of {@code SimplePruner.prune} applied to a map that
     *         holds one entry per normalized form: the highest-count original
     *         key mapped to the total count of its group
     */
    @Override
    public PrunedCounts<String> prune(final Map<String, Integer> allCounts) {
        // Total count accumulated for each normalized form.
        Map<String, Integer> totals = new HashMap<String, Integer>();
        // Normalized form -> original spelling with the highest individual count so far.
        Map<String, String> representative = new HashMap<String, String>();

        for (Map.Entry<String, Integer> entry : allCounts.entrySet()) {
            String surface = entry.getKey();
            String normalized = WpStringUtils.normalize(surface);
            int count = entry.getValue();

            if (totals.containsKey(normalized)) {
                totals.put(normalized, count + totals.get(normalized));
            } else {
                totals.put(normalized, count);
            }

            // Keep the current representative unless this spelling is strictly more frequent.
            if (representative.containsKey(normalized)
                    && allCounts.get(representative.get(normalized)) >= count) {
                continue;
            }
            representative.put(normalized, surface);
        }

        // Re-key the totals by the chosen original spelling.
        Map<String, Integer> merged = new HashMap<String, Integer>();
        for (Map.Entry<String, String> choice : representative.entrySet()) {
            merged.put(choice.getValue(), totals.get(choice.getKey()));
        }
        return super.prune(merged);
    }

    /**
     * Configuration provider that builds a {@link NormalizedStringPruner}
     * from a config block whose {@code type} is {@code "string"}.
     */
    public static class Provider extends org.wikibrain.conf.Provider<PrunedCounts.Pruner> {

        public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
            super(configurator, config);
        }

        @Override
        public Class getType() {
            return PrunedCounts.Pruner.class;
        }

        @Override
        public String getPath() {
            return "phrases.pruners";
        }

        /**
         * Builds the pruner described by {@code config}.
         *
         * @return a new {@link NormalizedStringPruner} configured from the
         *         {@code minCount}, {@code maxRank} and {@code minFraction}
         *         settings, or {@code null} when the configured {@code type}
         *         is not {@code "string"}
         */
        @Override
        public PrunedCounts.Pruner get(String name, Config config, Map<String, String> runtimeParams)
                throws ConfigurationException {
            String type = config.getString("type");
            if (!type.equals("string")) {
                return null;    // not ours; signal that this provider does not apply
            }
            return new NormalizedStringPruner(
                    config.getInt("minCount"),
                    config.getInt("maxRank"),
                    config.getDouble("minFraction"));
        }
    }
}