/**
* Classifier
* Copyright 22.07.2015 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.data;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.loklak.objects.MessageEntry;
import org.loklak.objects.Timeline;
import org.loklak.tools.bayes.BayesClassifier;
import org.loklak.tools.bayes.Classification;
public class Classifier {
/**
 * Category under which every phrase is learned in addition to any category whose
 * keywords it contains; it is the "no specific category" bucket of each context.
 */
private static final Category NEGATIVE_FEATURE = Category.NONE;
/**
 * All categories a phrase can be assigned to. The constants are grouped by the
 * {@code Context} that uses them; {@code NONE} belongs to no group and serves as
 * the negative feature.
 */
public enum Category {

    // categories of the 'emotion' context
    joy,
    trust,
    fear,
    surprise,
    sadness,
    disgust,
    anger,
    anticipation,

    // categories of the 'profanity' context
    swear,
    sex,
    leet,
    troll,

    // categories of the 'language' context
    english,
    german,
    french,
    spanish,
    dutch,

    // fallback: phrase belongs to no specific category
    NONE
}
/**
 * Matches a single non-word character. The pattern is compiled without
 * {@code Pattern.UNICODE_CHARACTER_CLASS}, so "word character" means
 * {@code [a-zA-Z_0-9]} only; any other character (including accented letters)
 * is matched by this pattern.
 */
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W");

/** Matches a single whitespace character; used to split a cleaned phrase into tokens. */
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
/**
 * A classification context: a fixed group of categories together with one bayesian
 * classifier that is trained on, and queried for, exactly these categories.
 *
 * Each category owns a set of keywords which is loaded from the configuration by
 * {@link #init(int)}. Until {@code init} has been called the keyword sets are
 * {@code null}.
 *
 * {@link #learnPhrase(String)} synchronizes on the context instance;
 * {@link #classify(String)} does not.
 */
public enum Context {

    emotion(Category.joy, Category.trust, Category.fear, Category.surprise, Category.sadness, Category.disgust, Category.anger, Category.anticipation),
    profanity(Category.swear, Category.sex, Category.leet, Category.troll),
    language(Category.english, Category.german, Category.french, Category.spanish, Category.dutch);

    /** Keyword set per category of this context; values are {@code null} before {@link #init(int)}. */
    public Map<Category, Set<String>> categories;

    /** The classifier that is trained with the phrases of this context. */
    BayesClassifier<String, Category> bayes;

    /**
     * Registers the categories of the context. The keyword sets are not loaded here,
     * every category is mapped to {@code null} until {@link #init(int)} runs.
     *
     * @param categories the categories which belong to this context
     */
    private Context(Category... categories) {
        this.categories = new HashMap<>();
        this.bayes = new BayesClassifier<>();
        for (Category f: categories) {
            this.categories.put(f, null);
        }
    }

    /**
     * Loads the keywords of all categories from the configuration and removes
     * keywords which are assigned to more than one category of this context.
     *
     * The keywords of a category are read from the configuration key
     * {@code classification.<context name>.<category name>} as a comma-separated
     * list. They are lower-cased, trimmed, and empty entries are discarded.
     *
     * @param capacity value handed to the memory capacity setting of the bayes classifier
     */
    public void init(final int capacity) {
        this.bayes.setMemoryCapacity(capacity);

        // read the categories
        for (Map.Entry<Category, Set<String>> entry: this.categories.entrySet()) {
            String keys = DAO.getConfig("classification." + this.name() + "." + entry.getKey().name(), "");
            Set<String> keyset = new HashSet<>();
            // Locale.ROOT: lower-casing must not depend on the default locale of the host
            for (String key: keys.toLowerCase(Locale.ROOT).split(",")) {
                // tokens produced by normalize() never contain whitespace, so an
                // untrimmed keyword could never match; "" is what split() yields for
                // an empty configuration value
                String keyword = key.trim();
                if (keyword.length() > 0) {
                    keyset.add(keyword);
                }
            }
            // replacing the value of an existing key is safe while iterating
            entry.setValue(keyset);
        }

        // consistency check of categories: identify words appearing not in one category only.
        // Every keyset holds a word at most once, so a word that is seen a second time
        // must come from a different category.
        Set<String> seen = new HashSet<>();
        Set<String> inconsistentWords = new HashSet<>();
        for (Set<String> keyset: this.categories.values()) {
            for (String key: keyset) {
                if (!seen.add(key)) {
                    inconsistentWords.add(key);
                }
            }
        }

        // remove inconsistent words from all categories
        for (String key: inconsistentWords) {
            forgetWord(key);
        }
    }

    /**
     * Collects the keywords of all categories of this context.
     *
     * @return a new set with the union of all keyword sets; empty if {@link #init(int)}
     *         has not been called yet
     */
    public Set<String> vocabulary() {
        Set<String> v = new HashSet<String>();
        for (Set<String> v0: this.categories.values()) {
            if (v0 != null) { // null until init() has loaded the category
                v.addAll(v0);
            }
        }
        return v;
    }

    /**
     * Removes a keyword from every category of this context.
     *
     * @param key the keyword to remove
     */
    public void forgetWord(String key) {
        for (Set<String> keyset: this.categories.values()) {
            if (keyset != null) { // null until init() has loaded the category
                keyset.remove(key);
            }
        }
    }

    /**
     * Trains the classifier of this context with a phrase.
     *
     * The phrase is tokenized; for every token that is a keyword of a category the
     * complete token list is learned for that category (once per matching token).
     * Afterwards the token list is always learned for the negative feature as well.
     * Anything thrown while learning is printed to stderr and not propagated.
     *
     * @param phrase the text to learn from; {@code null} is ignored
     */
    public void learnPhrase(String phrase) {
        if (phrase == null) {
            return; // nothing to learn; avoids a NullPointerException in normalize()
        }
        synchronized (this) {
            try {
                List<String> words = normalize(phrase);
                for (Map.Entry<Category, Set<String>> entry: categories.entrySet()) {
                    Set<String> vs = entry.getValue();
                    if (vs == null) {
                        continue; // keywords of this category have not been loaded by init()
                    }
                    for (String word: words) {
                        if (vs.contains(word)) {
                            bayes.learn(entry.getKey(), words);
                        }
                    }
                }
                bayes.learn(NEGATIVE_FEATURE, words);
            } catch (Throwable t) {
                // learning is best-effort: report the problem but never fail the caller
                t.printStackTrace();
            }
        }
    }

    /**
     * Classifies a phrase with the classifier of this context.
     *
     * @param phrase the text to classify, must not be {@code null}
     * @return whatever the bayes classifier returns for the tokens of the phrase
     */
    public Classification<String, Category> classify(String phrase) {
        List<String> words = normalize(phrase);
        return this.bayes.classify(words);
    }

    /**
     * Turns a phrase into a list of tokens: the phrase is lower-cased, every
     * character matching {@code NON_WORD_PATTERN} is replaced by a space, the result
     * is split at whitespace and only tokens longer than two characters are kept.
     *
     * @param phrase the text to tokenize, must not be {@code null}
     * @return the tokens in the order of their appearance, duplicates included
     */
    private List<String> normalize(String phrase) {
        // Locale.ROOT keeps the mapping stable, e.g. 'I' -> 'i' also under a Turkish default locale
        String cleanphrase = NON_WORD_PATTERN.matcher(phrase.toLowerCase(Locale.ROOT)).replaceAll(" ");
        String[] rawtokens = WHITESPACE_PATTERN.split(cleanphrase, 0);
        List<String> tokens = new ArrayList<>();
        for (String token: rawtokens) {
            if (token.length() > 2) {
                tokens.add(token);
            }
        }
        return tokens;
    }
}
/**
 * Trains every context with the given message text by calling
 * {@code Context.learnPhrase} on each of them in declaration order.
 *
 * @param message the text to learn from
 */
public static synchronized void learnPhrase(String message) {
    final Context[] contexts = Context.values();
    for (int i = 0; i < contexts.length; i++) {
        contexts[i].learnPhrase(message);
    }
}
/**
 * Classifies a phrase in every context.
 *
 * @param phrase the text to classify
 * @return a map with one classification per context, or {@code null} as soon as
 *         one context yields no classification or a classification with
 *         probability {@code 0.0}
 */
public static Map<Context, Classification<String, Category>> classify(String phrase) {
    final Map<Context, Classification<String, Category>> result = new HashMap<>();
    for (final Context context: Context.values()) {
        final Classification<String, Category> best = context.classify(phrase);
        // all-or-nothing: a single unusable context invalidates the whole result
        if (best == null || best.getProbability() == 0.0) {
            return null;
        }
        result.put(context, best);
    }
    return result;
}
/**
 * Initializes all contexts and, if local messages exist, trains every context with
 * a set of messages retrieved through {@code DAO.SearchLocalMessages}.
 *
 * @param maxsize  handed to {@code Context.init} of every context as capacity
 * @param initsize handed to the local message search and logged as the number of
 *                 messages of the test set
 */
public static void init(int maxsize, int initsize) {
    final Context[] contexts = Context.values();

    // load the context keys
    DAO.log("Classifier: initializing " + contexts.length + " contexts...");
    for (int i = 0; i < contexts.length; i++) {
        final Context context = contexts[i];
        DAO.log("Classifier: initializing contexts '" + context.name() + "'...");
        context.init(maxsize);
    }

    /* disabled:
    // ensure consistency throughout the contexts: remove words which could confuse the bayesian filter
    for (Context c: Context.values()) {
        Set<String> voc = c.vocabulary();
        for (Context c0: Context.values()) {
            if (c0.equals(c)) continue;
            for (String key: voc) c0.forget(key);
        }
    }
    */

    // load a test set; skipped when there are no local messages
    if (DAO.countLocalMessages(-1, true) > 0) {
        DAO.log("Classifier: loading test set for " + initsize + " messages...");
        final DAO.SearchLocalMessages testset = new DAO.SearchLocalMessages("", Timeline.Order.CREATED_AT, 0, initsize, 0);
        final Timeline tl = testset.timeline;
        DAO.log("Classifier: awaiting " + tl.size() * contexts.length + " learn steps...");

        // every message is learned once per context; progress is logged every 100 steps
        int count = 0;
        for (final Context context: contexts) {
            for (final MessageEntry message: tl) {
                context.learnPhrase(message.getText());
                count++;
                if (count % 100 == 0) {
                    DAO.log("Classifier: performed " + count + " learn steps");
                }
            }
        }
    }

    /* disabled: debug output of the classification of every message of the test set
    for (MessageEntry m: tl) {
        System.out.println(m.getText());
        System.out.print("  -> ");
        Map<Context, Classification<String, Category>> classification = classify(m.getText());
        for (Map.Entry<Context, Classification<String, Category>> c: classification.entrySet()) {
            System.out.print(c.getKey().name() + " = " + c.getValue().getCategory() + "[" + c.getValue().getProbability() + "]" + " ");
        }
        System.out.println();
    }
    */
}
}