/*
 * The MIT License (MIT)
 * ------------------
 *
 * Copyright (c) 2012-2014 Philipp Nolte
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/*
 * This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
 * and inserted into the loklak class hierarchy to be enhanced and extended
 * by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
 * package. It shall be used to create custom search navigation filters.
 * The original copyright notice was copied from the README.md
 * at https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
 * The original package domain was de.daslaboratorium.machinelearning.classifier
 */

package net.yacy.cora.bayes;

import java.util.Collection;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Abstract base class extended by any concrete classifier. It implements the basic
 * functionality for storing categories and features and can be used to calculate
 * basic probabilities – both category and feature probabilities. The classify
 * method has to be implemented by the concrete classifier class.
 *
 * @author Philipp Nolte
 *
 * @param <T> The feature class
 * @param <K> The category class
 */
public abstract class Classifier<T, K> {

    /**
     * Initial capacity of category dictionaries.
     */
    private static final int INITIAL_CATEGORY_DICTIONARY_CAPACITY = 16;

    /**
     * Initial capacity of feature dictionaries. It should be quite big, because
     * the features will quickly outnumber the categories.
     */
    private static final int INITIAL_FEATURE_DICTIONARY_CAPACITY = 32;

    /**
     * The memory capacity, i.e. how many classifications are memorized.
     */
    private int memoryCapacity = 1000;

    /**
     * A dictionary mapping features to their number of occurrences in each
     * known category.
     */
    private Map<K, Map<T, Integer>> featureCountPerCategory;

    /**
     * A dictionary mapping features to their number of occurrences.
     */
    private Map<T, Integer> totalFeatureCount;

    /**
     * A dictionary mapping categories to their number of occurrences.
     */
    private Map<K, Integer> totalCategoryCount;

    /**
     * The classifier's memory. It will forget old classifications as soon as
     * they become too old.
     */
    private Queue<Classification<T, K>> memoryQueue;

    /**
     * Constructs a new classifier without any trained knowledge.
     */
    public Classifier() {
        this.reset();
    }
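    /*
     * A minimal usage sketch (not part of the original code): it assumes a
     * hypothetical concrete subclass such as BayesClassifier<String, String>
     * that implements classify(), String features coming from some tokenizer,
     * and java.util.Arrays being imported. Training and querying could then
     * look roughly like this:
     *
     *   Classifier<String, String> classifier = new BayesClassifier<String, String>();
     *   classifier.learn("positive", Arrays.asList("good", "great", "fine"));
     *   classifier.learn("negative", Arrays.asList("bad", "awful", "poor"));
     *   Classification<String, String> result =
     *           classifier.classify(Arrays.asList("good", "fine"));
     *   String category = result.getCategory(); // expected: "positive"
     */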
    /**
     * Resets the <i>learned</i> feature and category counts.
     */
    public void reset() {
        this.featureCountPerCategory = new ConcurrentHashMap<K, Map<T, Integer>>(
                Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.totalFeatureCount = new ConcurrentHashMap<T, Integer>(
                Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
        this.totalCategoryCount = new ConcurrentHashMap<K, Integer>(
                Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.memoryQueue = new LinkedList<Classification<T, K>>();
    }

    /**
     * Returns a <code>Set</code> of features the classifier knows about.
     *
     * @return The <code>Set</code> of features the classifier knows about.
     */
    public Set<T> getFeatures() {
        return this.totalFeatureCount.keySet();
    }

    /**
     * Returns a <code>Set</code> of categories the classifier knows about.
     *
     * @return The <code>Set</code> of categories the classifier knows about.
     */
    public Set<K> getCategories() {
        return this.totalCategoryCount.keySet();
    }

    /**
     * Retrieves the sum of all category counts, i.e. the total number of
     * classifications the classifier has learned.
     *
     * @return The total category count.
     */
    public int getCategoriesTotal() {
        int toReturn = 0;
        for (Integer c : this.totalCategoryCount.values()) {
            toReturn += c;
        }
        return toReturn;
    }

    /**
     * Retrieves the memory's capacity.
     *
     * @return The memory's capacity.
     */
    public int getMemoryCapacity() {
        return this.memoryCapacity;
    }

    /**
     * Sets the memory's capacity. If the new value is less than the old
     * value, the memory will be truncated accordingly.
     *
     * @param memoryCapacity The new memory capacity.
     */
    public void setMemoryCapacity(int memoryCapacity) {
        for (int i = this.memoryCapacity; i > memoryCapacity; i--) {
            this.memoryQueue.poll();
        }
        this.memoryCapacity = memoryCapacity;
    }

    /**
     * Increments the count of a given feature in the given category. This is
     * equal to telling the classifier that this feature has occurred in this
     * category.
     *
     * @param feature The feature whose count to increase.
     * @param category The category the feature occurred in.
     */
    public void incrementFeature(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) {
            this.featureCountPerCategory.put(category,
                    new ConcurrentHashMap<T, Integer>(Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY));
            features = this.featureCountPerCategory.get(category);
        }
        Integer count = features.get(feature);
        if (count == null) {
            features.put(feature, 0);
            count = features.get(feature);
        }
        features.put(feature, ++count);

        Integer totalCount = this.totalFeatureCount.get(feature);
        if (totalCount == null) {
            this.totalFeatureCount.put(feature, 0);
            totalCount = this.totalFeatureCount.get(feature);
        }
        this.totalFeatureCount.put(feature, ++totalCount);
    }

    /**
     * Increments the count of a given category. This is equal to telling the
     * classifier that this category has occurred once more.
     *
     * @param category The category whose count to increase.
     */
    public void incrementCategory(K category) {
        Integer count = this.totalCategoryCount.get(category);
        if (count == null) {
            this.totalCategoryCount.put(category, 0);
            count = this.totalCategoryCount.get(category);
        }
        this.totalCategoryCount.put(category, ++count);
    }
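    /*
     * Illustration of the count bookkeeping above (made-up values, not part
     * of the original code): after the calls
     *
     *   incrementFeature("good", "positive");
     *   incrementFeature("good", "positive");
     *   incrementFeature("good", "negative");
     *
     * the dictionaries hold
     *
     *   featureCountPerCategory = {positive={good=2}, negative={good=1}}
     *   totalFeatureCount       = {good=3}
     *
     * The decrement methods below undo exactly one such step and remove a map
     * entry as soon as its count would drop to zero.
     */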
    /**
     * Decrements the count of a given feature in the given category. This is
     * equal to telling the classifier that this feature has occurred once less
     * in the given category.
     *
     * @param feature The feature whose count to decrease.
     * @param category The category the feature occurred in.
     */
    public void decrementFeature(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) {
            return;
        }
        Integer count = features.get(feature);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            features.remove(feature);
            if (features.size() == 0) {
                this.featureCountPerCategory.remove(category);
            }
        } else {
            features.put(feature, --count);
        }

        Integer totalCount = this.totalFeatureCount.get(feature);
        if (totalCount == null) {
            return;
        }
        if (totalCount.intValue() == 1) {
            this.totalFeatureCount.remove(feature);
        } else {
            this.totalFeatureCount.put(feature, --totalCount);
        }
    }

    /**
     * Decrements the count of a given category. This is equal to telling the
     * classifier that this category has occurred once less.
     *
     * @param category The category whose count to decrease.
     */
    public void decrementCategory(K category) {
        Integer count = this.totalCategoryCount.get(category);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            this.totalCategoryCount.remove(category);
        } else {
            this.totalCategoryCount.put(category, --count);
        }
    }

    /**
     * Retrieves the number of occurrences of the given feature in the given
     * category.
     *
     * @param feature The feature whose count to retrieve.
     * @param category The category the feature occurred in.
     * @return The number of occurrences of the feature in the category.
     */
    public int featureCount(T feature, K category) {
        Map<T, Integer> features = this.featureCountPerCategory.get(category);
        if (features == null) return 0;
        Integer count = features.get(feature);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the number of occurrences of the given category.
     *
     * @param category The category whose count should be retrieved.
     * @return The number of occurrences.
     */
    public int categoryCount(K category) {
        Integer count = this.totalCategoryCount.get(category);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the basic conditional probability <code>P(feature|category)</code>,
     * i.e. the number of occurrences of the feature in the category divided by
     * the total number of occurrences of the category.
     *
     * @param feature The feature whose probability to calculate.
     * @param category The category.
     * @return The probability that the feature occurs in the category.
     */
    public float featureProbability(T feature, K category) {
        if (this.categoryCount(category) == 0) return 0;
        return (float) this.featureCount(feature, category)
                / (float) this.categoryCount(category);
    }
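    /*
     * Worked example for the probability calculation (made-up numbers, not
     * part of the original code): if "good" occurred 3 times in category
     * "positive" and "positive" was learned 10 times, then
     *
     *   featureProbability("good", "positive") = 3 / 10 = 0.3
     *
     * The featureWeighedAverage overloads below smooth this value towards an
     * assumed probability, using the total feature count as evidence weight:
     *
     *   weighedAverage = (weight * assumedProbability + totals * basicProbability)
     *                  / (weight + totals)
     *
     * With the defaults weight = 1.0 and assumedProbability = 0.5 and a total
     * of 4 occurrences of "good" this yields
     *
     *   (1.0 * 0.5 + 4 * 0.3) / (1.0 + 4) = 1.7 / 5 = 0.34
     *
     * so rarely seen features are pulled towards 0.5 instead of producing
     * extreme estimates.
     */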
    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with an
     * overall weight of <code>1.0</code> and an assumed probability of
     * <code>0.5</code>. The probability defaults to the overall feature
     * probability.
     *
     * @see #featureProbability(Object, Object)
     * @see #featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature whose probability to calculate.
     * @param category The category.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category) {
        return this.featureWeighedAverage(feature, category, null, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with an
     * overall weight of <code>1.0</code>, an assumed probability of
     * <code>0.5</code> and the given object to use for probability calculation.
     *
     * @see #featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature whose probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator) {
        return this.featureWeighedAverage(feature, category, calculator, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with the
     * given weight, an assumed probability of <code>0.5</code> and the given
     * object to use for probability calculation.
     *
     * @see #featureWeighedAverage(Object, Object, Classifier, float, float)
     *
     * @param feature The feature whose probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @param weight The feature weight.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight) {
        return this.featureWeighedAverage(feature, category, calculator, weight, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with the
     * given weight, the given assumed probability and the given object to use
     * for probability calculation.
     *
     * @param feature The feature whose probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @param weight The feature weight.
     * @param assumedProbability The assumed probability.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator,
            float weight, float assumedProbability) {

        /*
         * Use the given calculating object or the default method to calculate
         * the probability that the given feature occurred in the given
         * category.
         */
        final float basicProbability = (calculator == null)
                ? this.featureProbability(feature, category)
                : calculator.featureProbability(feature, category);

        Integer totals = this.totalFeatureCount.get(feature);
        if (totals == null) totals = 0;
        return (weight * assumedProbability + totals * basicProbability)
                / (weight + totals);
    }

    /**
     * Trains the classifier by telling it that the given features resulted in
     * the given category.
     *
     * @param category The category the features belong to.
     * @param features The features that resulted in the given category.
     */
    public void learn(K category, Collection<T> features) {
        this.learn(new Classification<T, K>(features, category));
    }

    /**
     * Trains the classifier by telling it that the given features resulted in
     * the given category. As soon as the memory capacity is exceeded, the
     * oldest memorized classification is forgotten again.
     *
     * @param classification The classification to learn.
     */
    public void learn(Classification<T, K> classification) {
        for (T feature : classification.getFeatureset())
            this.incrementFeature(feature, classification.getCategory());
        this.incrementCategory(classification.getCategory());

        this.memoryQueue.offer(classification);
        if (this.memoryQueue.size() > this.memoryCapacity) {
            Classification<T, K> toForget = this.memoryQueue.remove();
            for (T feature : toForget.getFeatureset())
                this.decrementFeature(feature, toForget.getCategory());
            this.decrementCategory(toForget.getCategory());
        }
    }

    /**
     * The classify method. It will retrieve the most likely category for the
     * given features and depends on the concrete classifier implementation.
     *
     * @param features The features to classify.
     * @return The most likely category.
     */
    public abstract Classification<T, K> classify(Collection<T> features);

}
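/*
 * Sketch of a concrete subclass (an assumption for illustration, not part of
 * the original code): a naive Bayes implementation could multiply the weighed
 * feature averages with the category prior for each known category and pick
 * the category with the highest product, roughly along these lines:
 *
 *   public class SimpleBayesClassifier<T, K> extends Classifier<T, K> {
 *       @Override
 *       public Classification<T, K> classify(Collection<T> features) {
 *           K best = null;
 *           float bestProbability = 0f;
 *           for (K category : this.getCategories()) {
 *               // category prior P(category)
 *               float p = (float) this.categoryCount(category) / this.getCategoriesTotal();
 *               // multiply the smoothed per-feature probabilities
 *               for (T feature : features) p *= this.featureWeighedAverage(feature, category);
 *               if (best == null || p > bestProbability) {
 *                   best = category;
 *                   bestProbability = p;
 *               }
 *           }
 *           return new Classification<T, K>(features, best);
 *       }
 *   }
 */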