/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package afxdeadcode;
import afxdeadcode.test.ClassifierTool;
import java.io.*;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.knallgrau.utils.textcat.FingerPrint;
/**
*
* @author SeH
*/
public class NGramClassifier implements Serializable {
String name;
String corpus = "";
Category cat;
// transient public Map<String, Double> avgBackground = new HashMap();
// transient public Map<String, List<Double>> avgBackgroundSamples = new HashMap();
static public String getContents(File aFile) {
//...checks on aFile are elided
StringBuilder contents = new StringBuilder();
try {
//use buffering, reading one line at a time
//FileReader always assumes default encoding is OK!
BufferedReader input = new BufferedReader(new FileReader(aFile));
try {
String line = null; //not declared within while loop
/*
* readLine is a bit quirky : it returns the content of a line
* MINUS the newline. it returns null only for the END of the
* stream. it returns an empty String if two newlines appear in
* a row.
*/
while ((line = input.readLine()) != null) {
contents.append(line);
contents.append(System.getProperty("line.separator"));
}
} finally {
input.close();
}
} catch (IOException ex) {
ex.printStackTrace();
}
return contents.toString();
}
public static NGramClassifier load(String name) {
String corp = getContents(new File("media/text/ngram." + name));
NGramClassifier cc = new NGramClassifier(name);
cc.setCorpus(corp);
return cc;
}
public NGramClassifier(String name) {
cat = new Category(name);
this.name = name;
update();
}
public void update() {
cat.clear();
}
// public void calibrateNormal() throws Exception {
// System.out.println("Calibrating normal levels");
//
// final int cycles = 8;
//
// for (String s : TwitterChannel.getPublicTweetStrings(cycles)) {
// //addBackground(Agent.filterTweet(s));
// addBackground(s);
// }
//
// System.out.println("avg background distances: " + avgBackground);
// }
//
// public void addBackground(String p) {
//
// if (avgBackgroundSamples == null) {
// avgBackground = new HashMap();
// avgBackgroundSamples = new HashMap();
// }
//
// for (String c : corpii.keySet()) {
// double dist = getDistance(p, c);
//
//// if (c.equals("happy"))
// //System.out.println(c + " " + p + " " + dist + " " + p.length() + " " + ( ((double)dist) / ((double)p.length()) ));
//
// if (avgBackgroundSamples.get(c) == null)
// avgBackgroundSamples.put(c, new LinkedList());
// avgBackgroundSamples.get(c).add(dist);
// }
//
// //recompute avgBackground
// for (String c : corpii.keySet()) {
// double total = 0;
// for (Double i : avgBackgroundSamples.get(c)) {
// total += i;
// }
// double n = avgBackgroundSamples.size();
// double v = total/n;
// avgBackground.put(c, v);
// }
// }
// public void addCategory(String x) {
// if (!corpii.containsKey(x))
// corpii.put(x, "");
// }
// public Category getCategory(String p) {
// if (cats == null)
// cats = new HashMap();
//
// if (cats.containsKey(p)) {
// return cats.get(p);
// }
// Category c = new Category(p);
// c.create(corpii.get(p));
// cats.put(p, c);
// return c;
// }
public void save(String path) throws Exception {
//System.out.println("saving:\n" + corpii.toString());
ObjectOutputStream ois = new ObjectOutputStream(new FileOutputStream(path));
ois.writeObject(this);
ois.close();
}
public void saveOnExit(final String path) {
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
try {
save(path);
} catch (Exception ex) {
Logger.getLogger(ClassifierTool.class.getName()).log(Level.SEVERE, null, ex);
}
}
}));
}
// public Collection<String> categories() {
// return corpii.keySet();
// }
public void setCorpus(String corpus) {
this.corpus = corpus;
cat.create(corpus);
}
// public Map<String, Double> analyzeC(String t, List<String> catCompared) {
// Map<String, Integer> result = analyze(t, catCompared);
//
// Map<String, Double> d = new HashMap();
// for (String x : result.keySet()) {
// double nv = result.get(x) == 0 ? 1.0 : ((double)corpii.get(x).length()) / ((double)result.get(x));
// d.put(x, nv);
// }
//
// return d;
//
// }
public double getDistance(String t) {
FingerPrint fp = new FingerPrint();
fp.create(t);
//return ((double) fp.categorize(Arrays.asList(new FingerPrint[]{cat})).get(name)) / ((double) t.length());
return ((double) fp.categorize(Arrays.asList(new FingerPrint[]{cat})).get(name)) / ((double) corpus.length());
}
// public double analyzeC(String t, String c) {
// FingerPrint fp = new FingerPrint();
// fp.create(t);
//
// int d = fp.categorize(Arrays.asList(new FingerPrint[] { getCategory(c) }) ).get(c);
//
// return d == 0 ? 1.0 : ((double)corpii.get(c).length()) / ((double)d); // * ((double)t.length());
// }
//
// @Deprecated public Map<String, Double> analyzeNormalized(String t, List<String> catCompared) {
// Map<String, Double> result = analyzeC(t, catCompared);
// double maxDist = 0, minDist = -1;
// for (Double ii : result.values()) {
// if (maxDist < ii) maxDist = ii;
// if (minDist == -1) minDist = ii;
// else if (minDist > ii) minDist = ii;
// }
//
// Map<String, Double> d = new HashMap();
// if ((maxDist!=0) && (maxDist!=minDist)) {
// for (String x : result.keySet()) {
// double nv = 1.0 - ((double)(result.get(x) - minDist)) / ((double)maxDist - minDist);
// d.put(x, nv);
// }
// }
//
// return d;
//
// }
// public Map<String, Integer> analyze(String t, List<String> catCompared) {
// FingerPrint fp = new FingerPrint();
// fp.create(t);
//
// List<FingerPrint> ffp = new LinkedList();
// for (Category cat : cats.values())
// if (catCompared.contains(cat.getCategory()))
// ffp.add(cat);
//
// return fp.categorize(ffp);
// }
//
// public double getAverageBackgroundDistance(String k) {
// return avgBackground.get(k);
// }
public String getName() {
return name;
}
}