package com.yc.nlp.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Good-Turing style smoothing of observed frequencies.
 *
 * <p>The entry point is {@link #main(Map)}: it groups the observed frequencies into
 * distinct values {@code r} and their counts-of-counts {@code nr}, fits a straight line
 * to {@code log(z)} against {@code log(r)} (see {@link #getz(List, List)} and
 * {@link #leastSquare(List, List)}), and uses either the raw ratio estimate or the
 * line-based estimate to derive a normalised value for every key.
 *
 * <p>NOTE(review): the structure appears to follow Gale &amp; Sampson's "Simple
 * Good-Turing" procedure - confirm against the paper before relying on that.
 */
public class GoodTuring {

    /**
     * Averages every count-of-counts over the distance between its neighbouring frequencies.
     *
     * <p>The first entry is {@code 2 * nr[0] / r[1]}, inner entries are
     * {@code 2 * nr[i] / (r[i + 1] - r[i - 1])} and the last entry is
     * {@code nr[last] / (r[last] - r[last - 1])}.
     *
     * @param r  distinct frequencies; expected to be parallel to {@code nr} and to hold
     *           at least two entries
     * @param nr count-of-counts for each entry of {@code r}
     * @return a new list with one smoothed value per entry of {@code nr}
     * @throws IndexOutOfBoundsException if {@code nr} is empty or {@code r} has fewer than
     *                                   two entries
     */
    public static List<Double> getz(List<Double> r, List<Double> nr) {
        List<Double> z = new ArrayList<Double>(nr.size());
        // The first frequency has no left neighbour, so the gap is taken as r[1] / 2.
        z.add(2 * nr.get(0) / r.get(1));
        for (int i = 0; i < nr.size() - 2; i++) {
            z.add(2 * nr.get(i + 1) / (r.get(i + 2) - r.get(i)));
        }
        // The last frequency has no right neighbour; only the left gap is used.
        z.add(nr.get(nr.size() - 1) / (r.get(r.size() - 1) - r.get(r.size() - 2)));
        return z;
    }

    /**
     * Fits the least-squares line {@code y = a + b * x} through the points
     * {@code (rd[i], zd[i])}.
     *
     * @param rd x coordinates
     * @param zd y coordinates; read at every index of {@code rd}
     * @return a two element list: the intercept {@code a} at index 0 and the slope
     *         {@code b} at index 1. The values are non-finite when all x coordinates
     *         are identical (or the input is empty), because the slope is then a
     *         division by zero.
     */
    public static List<Double> leastSquare(List<Double> rd, List<Double> zd) {
        double sumX = 0;
        for (Double value : rd) {
            sumX += value;
        }
        double sumY = 0;
        for (Double value : zd) {
            sumY += value;
        }
        double meanX = sumX / rd.size();
        double meanY = sumY / zd.size();

        double square = 0;
        double sumXY = 0;
        for (int i = 0; i < rd.size(); i++) {
            square += Math.pow(rd.get(i) - meanX, 2);
            sumXY += (rd.get(i) - meanX) * (zd.get(i) - meanY);
        }
        double b = sumXY / square;

        List<Double> result = new ArrayList<Double>(2);
        result.add(meanY - b * meanX);
        result.add(b);
        return result;
    }

    /**
     * Computes smoothed, normalised values for every key of {@code data}.
     *
     * @param data observed frequency per key; must contain at least two distinct
     *             frequency values
     * @return a two element list: index 0 holds a {@code Double} equal to
     *         {@code nr[0] / total / total}, where {@code nr[0]} is the number of keys
     *         having the smallest frequency and {@code total} is the sum of all
     *         frequencies (presumably the value to use for an unseen key - confirm with
     *         callers); index 1 holds a {@code Map<String, Double>} from every key of
     *         {@code data} to its smoothed value. The values in that map sum to
     *         {@code 1 - nr[0] / total}.
     * @throws IndexOutOfBoundsException if {@code data} holds fewer than two distinct
     *                                   frequency values
     */
    public static List<Object> main(Map<String, Double> data) {
        List<Double> values = new ArrayList<Double>(data.values());
        Collections.sort(values);

        // r: distinct frequencies in ascending order; nr: how many keys have each one.
        List<Double> r = new ArrayList<Double>();
        List<Double> nr = new ArrayList<Double>();
        for (double value : values) {
            if (r.isEmpty() || r.get(r.size() - 1) != value) {
                r.add(value);
                nr.add(1d);
            } else {
                int last = nr.size() - 1;
                nr.set(last, nr.get(last) + 1);
            }
        }

        // rr maps a frequency back to its position in r / prob.
        Map<Double, Integer> rr = new HashMap<Double, Integer>();
        List<Double> rd = new ArrayList<Double>(r.size());
        double total = 0d;
        for (int i = 0; i < r.size(); i++) {
            double value = r.get(i);
            total += value * nr.get(i);
            rr.put(value, i);
            rd.add(Math.log(value));
        }

        List<Double> z = getz(r, nr);
        List<Double> zd = new ArrayList<Double>(z.size());
        for (double value : z) {
            zd.add(Math.log(value));
        }
        List<Double> line = leastSquare(rd, zd);
        double intercept = line.get(0);
        double slope = line.get(1);

        // Extrapolate a count-of-counts for (largest frequency + 1) so nr[i + 1] exists
        // for every index of r in the loop below.
        nr.add(Math.exp(intercept + slope * Math.log(r.get(r.size() - 1) + 1)));

        List<Double> prob = new ArrayList<Double>(r.size());
        boolean useGoodTuring = false;
        for (int i = 0; i < r.size(); i++) {
            // Estimate derived from the fitted line.
            double goodTuring = (r.get(i) + 1)
                    * (Math.exp(slope * (Math.log(r.get(i) + 1) - Math.log(r.get(i)))));
            // Estimate derived from the raw count-of-counts ratio; unavailable for the
            // last frequency, where the fitted estimate is used instead.
            double turing = (i + 1 < r.size())
                    ? (r.get(i) + 1) * nr.get(i + 1) / nr.get(i)
                    : goodTuring;
            double diff = Math.pow(Math.pow(r.get(i) + 1, 2) / nr.get(i) * nr.get(i + 1) / nr.get(i)
                    * (1 + nr.get(i + 1) / nr.get(i)), 0.5) * 1.65;
            // Keep the raw estimate only while it differs from the fitted one by more
            // than diff; once the fitted estimate is chosen it is used from there on.
            if (!useGoodTuring && Math.abs(goodTuring - turing) > diff) {
                prob.add(turing);
            } else {
                useGoodTuring = true;
                prob.add(goodTuring);
            }
        }

        // Pair nr and prob by position. Looking positions up by value would pick the
        // wrong entry whenever two frequencies share the same count-of-counts.
        double sump = 0d;
        for (int i = 0; i < prob.size(); i++) {
            sump += nr.get(i) * prob.get(i);
        }
        for (int i = 0; i < prob.size(); i++) {
            prob.set(i, (1 - nr.get(0) / total) * prob.get(i) / sump);
        }

        Map<String, Double> result = new HashMap<String, Double>();
        for (Map.Entry<String, Double> entry : data.entrySet()) {
            result.put(entry.getKey(), prob.get(rr.get(entry.getValue())));
        }
        List<Object> mixResult = new ArrayList<Object>(2);
        mixResult.add(nr.get(0) / total / total);
        mixResult.add(result);
        return mixResult;
    }

    /**
     * Small demo: runs the estimation on a fixed sample and prints both result elements.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Map<String, Double> data = new HashMap<String, Double>();
        data.put("1", 1d);
        data.put("2", 1d);
        data.put("3", 1d);
        data.put("4", 2d);
        data.put("5", 2d);
        data.put("6", 3d);
        data.put("7", 1d);
        data.put("8", 2d);
        data.put("9", 3d);
        // Run the estimation once and print both parts of the result.
        List<Object> output = main(data);
        System.out.println(output.get(0));
        System.out.println(output.get(1));
    }
}