package happy.research.utils;

import happy.coding.io.FileIO;
import happy.coding.io.Lists;
import happy.coding.io.Logs;
import happy.coding.math.Gaussian;
import happy.coding.math.Randoms;
import happy.coding.math.Sims;
import happy.coding.math.Stats;
import happy.coding.system.Debug;
import happy.coding.system.Systems;
import happy.research.cf.ConfigParams;
import happy.research.cf.Dataset;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.math.linear.Array2DRowRealMatrix;
import org.apache.commons.math.linear.RealMatrix;
import org.apache.commons.math.stat.correlation.PearsonsCorrelation;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility methods for computing user-user rating similarity in collaborative
 * filtering: cosine, PCC variants, mean-squared-distance style measures,
 * Spearman rank correlation, PIP, SM (singularities), kappa-based measures,
 * and the Bayesian similarity (IJCAI 2013), plus a set of significance
 * weighting functions.
 *
 * <p>NOTE(review): behavior of several methods is switched at compile time by
 * the {@code Debug.ON}/{@code Debug.OFF} flags, leaving some branches dead in
 * any given build — they are alternative experimental variants, kept on
 * purpose. Rating-scale parameters are read from mutable static state on
 * {@code Dataset} (maxScale, minScale, median, range, scaleSize); callers must
 * configure those before use.
 */
public class SimUtils {
	private final static Logger log = LoggerFactory.getLogger(SimUtils.class);
	// shared Commons-Math Pearson helper; PearsonsCorrelation is stateless here
	public final static PearsonsCorrelation pc = new PearsonsCorrelation();
	// mixing weights used by bsSim: alpha * distance + beta * singularity
	// + (1 - alpha - beta) * semantics
	public static double alpha = 0.2, beta = 0.2;

	/** Identifiers for the similarity methods implemented in this class. */
	public enum SimMethod {
		COS, PCC, MSD, CPC, SRC, BS, PIP, SM, iufCOS, caPCC
	};

	/**
	 * Basic (prior-only) Bayesian similarity used for the IJCAI paper's
	 * worked examples: buckets the absolute rating differences into levels of
	 * width {@code Dataset.minScale}, contrasts the posterior of each level
	 * against a fixed triangular prior, and converts the weighted mean
	 * difference into a similarity in [0, 1].
	 *
	 * @param a              user a's rating vector (co-rated items only)
	 * @param b              user b's rating vector, same length as {@code a}
	 * @param isRemoveChance whether to discount the probability that the
	 *                       observed agreement arose purely by chance
	 * @return similarity in [0, 1], or {@code Double.NaN} for null/empty or
	 *         unequal-length inputs
	 */
	private static double ijcai_bs_examples(List<Double> a, List<Double> b, boolean isRemoveChance) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;
		int N = a.size();
		// R + 1 = number of distinct |difference| levels on the rating scale
		int R = (int) (Dataset.maxScale / Dataset.minScale) - 1;

		// {diff level, count} map
		Map<Integer, Integer> diffMap = new HashMap<>();
		for (int i = 0; i < R + 1; i++)
			diffMap.put(i, 0);

		for (int i = 0; i < N; i++) {
			double ar = a.get(i);
			double br = b.get(i);
			double diff = Math.abs(ar - br);
			int level = (int) (diff / Dataset.minScale);
			int count = diffMap.get(level);
			diffMap.put(level, count + 1);
		}
		// System.out.println(PrintUtils.printMap(diffMap));

		double sum = 0.0, weights = 0.0;
		double chance = 1.0;
		for (int i = 0; i < R + 1; i++) {
			int count = diffMap.get(i);
			// triangular Dirichlet prior: level 0 (exact agreement) gets R+1
			// pseudo-counts, level i gets 2*(R-i+1)
			// NOTE(review): this local alpha intentionally shadows the static
			// field of the same name
			int alpha = i == 0 ? (R + 1) : 2 * (R - i + 1);
			int M2 = (R + 1) * (R + 1);
			double post = (count + alpha + 0.0) / (N + M2);
			double prio = (alpha + 0.0) / M2;
			double byChance = Math.pow(prio, count);
			chance *= byChance;
			double weight = post - prio;
			try {
				if (Debug.ON) {
					// variant: weight by the full posterior
					weights += post;
					sum += post * i * Dataset.minScale;
				} else {
					// variant: only levels with positive evidence (post > prior)
					if (weight > 0) {
						weights += Math.abs(weight);
						sum += weight * i * Dataset.minScale;
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		// weighted mean rating difference, rescaled to a similarity in [0, 1]
		double d = sum / weights;
		double similarity = 1 - d / (Dataset.maxScale - Dataset.minScale);

		if (!isRemoveChance) chance = 0;
		double bias = 0.04; // empirical offset — presumably tuned; see paper
		if (Debug.OFF) bias = 0;
		return Math.max(similarity * (1 - chance) - bias, 0.0);
	}

	/** @return whether {@code rate} is at or above the scale median. */
	public static boolean isPositive(double rate) {
		return rate >= Dataset.median;
	}

	/** @return whether both ratings fall on the same side of the median. */
	public static boolean isConsistent(double r1, double r2) {
		return (r1 - Dataset.median) * (r2 - Dataset.median) >= 0;
	}

	/**
	 * Calculate the Bayesian Similarity, referring to
	 * "A Novel Bayesian Similairty for Recommender Systems" (IJCAI 2013).
	 *
	 * <p>Each co-rated item contributes an evidence weight combining a
	 * distance factor, a singularity factor and a semantic factor (mixed by
	 * the static {@code alpha}/{@code beta} fields); the evidence per
	 * difference level is then contrasted against item-derived priors.
	 * NOTE(review): which singularity/semantic variant runs is fixed by
	 * Debug.ON/Debug.OFF — several branches below are dead in any build.
	 *
	 * @param a      user a's rating vector
	 * @param b      user b's rating vector
	 * @param priors the prior value of each rating scales
	 * @param sd     standard deviation of ratings given on each item
	 * @param mu     mean of ratings given on each item
	 * @param histos per-item histogram {rating difference -> frequency}
	 * @param cf     per-item confidence values (used by one variant only)
	 * @return Bayesian similarity, or {@code Double.NaN} for null/empty or
	 *         unequal-length inputs
	 */
	public static double bsSim(List<Double> a, List<Double> b, List<Double> priors, List<Double> sd, List<Double> mu,
			Map<Integer, Map<Double, Double>> histos, List<Double> cf) throws Exception {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;
		int N = a.size();
		int R = Dataset.scaleSize;

		// {distance level, count} map
		Map<Double, Double> evidences = new HashMap<>();
		for (int i = 0; i < R; i++)
			evidences.put(i * Dataset.minScale, 0.0);

		double numEvidence = 0;
		for (int i = 0; i < N; i++) {
			double ar = a.get(i);
			double br = b.get(i);
			double di = Math.abs(ar - br);
			double sigma = sd.get(i);
			double mean = mu.get(i);
			double conf = cf.get(i);
			double evidence = evidences.get(di);

			double ei = 0; // evidence weight

			// ours: distance factor — the further apart within x*sigma, the
			// weaker the evidence
			double x = ConfigParams.defaultInstance().X_SIGMA;
			if (sigma <= 0 || x <= 0) ei = 1;
			else ei = 1 - di / (x * sigma);

			// new factor
			double singu = 0.0;
			if (Debug.OFF) {
				// Singularity (confidence-based variant)
				double asin = 0, bsin = 0;
				if (ar > Dataset.median) asin = 1 - conf;
				else asin = conf;
				if (br > Dataset.median) bsin = 1 - conf;
				else bsin = conf;
				singu = asin * bsin;
			} else if (Debug.ON) {
				// Gaussian: ratings unlikely under the item's rating
				// distribution are more singular
				double pa = Gaussian.pdf(ar, mean, sigma);
				double pb = Gaussian.pdf(br, mean, sigma);
				singu = (1 - pa) * (1 - pb);
			} else if (Debug.OFF) {
				// Discrete: singularity from the item's difference histogram
				Map<Double, Double> hist = histos.get(i);
				if (hist.containsKey(di)) singu = 1 - hist.get(di);
				else singu = 1;
			}

			double semantic = 0.0;
			if (Debug.ON) {
				// Semantics: PIP-style proximity/impact/popularity product
				// factor 1: proximity
				double pr = 0;
				double range = Dataset.range;
				if (isConsistent(ar, br)) pr = 1 - di / range;
				else pr = -di / range;

				// factor 2: impact
				double im = 0.0;
				double s = 0.5 * (ar + br);
				if (isPositive(ar) && isPositive(br)) im = s / Dataset.maxScale;
				else if (!isPositive(ar) && !isPositive(br)) im = 1 - s / Dataset.maxScale;
				else im = -s / Dataset.maxScale;

				// factor 3: popularity
				double po = 0;
				double dist = Math.abs(s - mean);
				boolean consistent = (ar - mean) * (br - mean) >= 0;
				if (consistent) po = dist / range;
				else po = -dist / range;

				semantic = pr * im * po;
			}

			// linear mix of the three factors; clamp below at -1
			ei = alpha * ei + beta * singu + (1 - alpha - beta) * semantic;
			if (ei < -1) ei = -1;
			numEvidence += ei;
			evidences.put(di, evidence + ei);
		}

		double sum = 0.0, weights = 0.0;
		double chance = 1.0;
		double[] x = Lists.toArray(priors);
		for (int i = 0; i < R; i++) {
			double num = evidences.get(i * Dataset.minScale);
			// NOTE(review): this local alpha intentionally shadows the static
			// field of the same name
			double alpha = prior(x, i);
			int M2 = R * R;
			double post = (num + alpha + 0.0) / (numEvidence + M2);
			double prio = (alpha + 0.0) / M2;
			double byChance = Math.pow(prio, num);
			chance *= byChance;
			double weight = post - prio;
			try {
				if (Debug.OFF) {
					// variant: weight by the full posterior
					weights += post;
					sum += post * i * Dataset.minScale;
				} else {
					// variant: only levels with positive evidence
					if (weight > 0) {
						weights += Math.abs(weight);
						sum += weight * i * Dataset.minScale;
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		double d = sum / weights;
		double similarity = 1 - d / (Dataset.maxScale - Dataset.minScale);

		if (Debug.OFF) chance = 0.0;
		double bias = 0.04; // empirical offset — presumably tuned; see paper
		if (Debug.ON) bias = 0;
		if (Debug.OFF) {
			return Math.max(similarity * (1 - chance) - bias, 0.0);
		} else {
			return Math.max(similarity - chance - bias, 0.0);
		}
	}

	/**
	 * Prior mass of difference level {@code i} derived from the rating-scale
	 * priors: level 0 sums squared priors (both users pick the same scale
	 * value); level i > 0 sums 2 * p_j * p_{j+i} over all pairs i apart.
	 *
	 * @param priors probability of each rating-scale value
	 * @param i      difference level (in units of {@code Dataset.minScale})
	 * @return prior evidence for the given difference level
	 */
	private static double prior(double[] priors, int i) {
		double evidence = 0;
		if (i == 0) {
			for (int j = 0; j < priors.length; j++)
				evidence += priors[j] * priors[j];
		} else {
			for (int j = 0; (i + j) < priors.length; j++)
				evidence += 2 * priors[j] * priors[j + i];
		}
		return evidence;
	}

	/**
	 * List-based convenience wrapper for
	 * {@link #pearsonSim(double[], double[], double[])}.
	 *
	 * @param a  user a's ratings
	 * @param b  user b's ratings
	 * @param ca confidences attached to user a's ratings
	 * @return confidence-aware PCC, or {@code Double.NaN} if either vector is
	 *         null, shorter than 2, or the lengths differ
	 */
	public static double pearsonSim(List<Double> a, List<Double> b, List<Double> ca) {
		if (a == null || b == null || a.size() < 2 || b.size() < 2 || a.size() != b.size()) return Double.NaN;
		double[] as = Lists.toArray(a);
		double[] bs = Lists.toArray(b);
		double[] cs = Lists.toArray(ca);
		return pearsonSim(as, bs, cs);
	}

	/**
	 * This example is used in my merge journal paper: compares
	 * confidence-aware PCC against plain PCC between a merged rating profile
	 * and a set of hand-crafted users u2..u9 over their co-rated items.
	 * Results are only logged — there are no assertions.
	 */
	@SuppressWarnings("unchecked")
	@Test
	public void kbs_examples() {
		Map<String, Double> merged_rates = new HashMap<>();
		Map<String, Double> merged_confs = new HashMap<>();

		// merged profile: item -> (rating, confidence)
		merged_rates.put("i1", 4.33);
		merged_confs.put("i1", 0.19);
		merged_rates.put("i2", 4.0);
		merged_confs.put("i2", 0.38);
		merged_rates.put("i3", 5.0);
		merged_confs.put("i3", 1.0);
		merged_rates.put("i4", 3.0);
		merged_confs.put("i4", 0.25);
		merged_rates.put("i5", 2.73);
		merged_confs.put("i5", 0.47);
		merged_rates.put("i8", 1.72);
		merged_confs.put("i8", 0.47);

		// hand-crafted example users; indices 0 and 1 are unused
		Map<String, Double>[] us = new HashMap[10];
		us[2] = new HashMap<>();
		us[2].put("i1", 5.0);
		us[2].put("i3", 4.0);
		us[2].put("i5", 3.0);
		us[2].put("i8", 2.0);

		us[3] = new HashMap<>();
		us[3].put("i2", 4.0);
		us[3].put("i4", 3.0);
		us[3].put("i8", 1.0);

		us[4] = new HashMap<>();
		us[4].put("i1", 3.0);
		us[4].put("i3", 5.0);
		us[4].put("i5", 2.0);

		us[5] = new HashMap<>();
		us[5].put("i2", 4.0);
		us[5].put("i3", 4.0);
		us[5].put("i5", 3.0);
		us[5].put("i8", 3.0);

		us[6] = new HashMap<>();
		us[6].put("i2", 3.0);
		us[6].put("i3", 3.0);
		us[6].put("i4", 5.0);
		us[6].put("i5", 5.0);

		// u7 rates only items absent from the merged profile (no overlap)
		us[7] = new HashMap<>();
		us[7].put("i7", 5.0);
		us[7].put("i9", 4.0);

		us[8] = new HashMap<>();
		us[8].put("i3", 4.0);
		us[8].put("i5", 2.0);
		us[8].put("i8", 1.0);

		us[9] = new HashMap<>();
		us[9].put("i3", 4.0);
		us[9].put("i5", 5.0);
		us[9].put("i8", 5.0);

		for (int i = 2; i < us.length; i++) {
			List<Double> u = new ArrayList<>();
			List<Double> v = new ArrayList<>();
			List<Double> c = new ArrayList<>();

			Map<String, Double> ux = us[i];
			// align the merged profile and user i over co-rated items
			for (String item : ux.keySet()) {
				if (merged_rates.containsKey(item)) {
					u.add(merged_rates.get(item));
					c.add(merged_confs.get(item));
					v.add(ux.get(item));
				}
			}
			double cpcc = SimUtils.pearsonSim(u, v, c);
			double pcc = Sims.pcc(u, v);
			Logs.debug("u" + i + " cpcc = " + cpcc);
			Logs.debug("u" + i + " pcc = " + pcc);
		}
	}

	/**
	 * Calculate the PIP similarity proposed by Hyung Jun Ahn [2008]:
	 *
	 * <i>A new similarity measure for collaborative filtering to alleviate the
	 * new user cold-starting problem</i>
	 *
	 * @param a     user a's ratings
	 * @param b     user b's ratings
	 * @param means the average ratings of items by all users
	 * @return PIP similarity (sum over items of proximity * impact *
	 *         popularity; unbounded above)
	 */
	public static double PIPSim(List<Double> a, List<Double> b, List<Double> means) {
		double score = 0;
		/*
		 * to compute median rating
		 *
		 * however, this is only useful for the number of rating scales is odd
		 * rather than even. But it is the way used in the paper.
		 */
		double r = (Dataset.maxScale + Dataset.minScale) / 2.0;

		for (int i = 0; i < a.size(); i++) {
			double r1 = a.get(i);
			double r2 = b.get(i);
			double ui = means.get(i);

			// the two ratings "agree" when both fall on the same side of the
			// scale midpoint
			double agreement = (r1 - r) * (r2 - r);
			boolean agree = false;
			if (agreement < 0) agree = false;
			else agree = true;

			/* compute proximity */
			double D = 0;
			if (agree) D = Math.abs(r1 - r2);
			else D = 2 * Math.abs(r1 - r2); // disagreement doubles the distance
			double prox = 2 * (Dataset.maxScale - Dataset.minScale) + 1 - D;
			double proximity = Math.pow(prox, 2);

			/* compute impact */
			double impact = (Math.abs(r1 - r) + 1) * (Math.abs(r2 - r) + 1);
			if (!agree) impact = 1.0 / impact;

			/* compute popularity */
			double val = (r1 - ui) * (r2 - ui);
			double popularity = 1.0;
			double pop = (r1 + r2) / 2.0 - ui;
			if (val > 0) popularity = 1 + Math.pow(pop, 2);

			/* aggregation PIP */
			score += proximity * impact * popularity;
		}
		return score;
	}

	/**
	 * Calculate confidence-aware PCC similarity: user u's deviations from a
	 * confidence-weighted mean are scaled per-item by the confidences before
	 * the usual Pearson normalization.
	 *
	 * @param u  user u's ratings
	 * @param v  user v's ratings
	 * @param uc user u's rating confidences
	 * @return confidence-aware PCC similarity, or {@code Double.NaN} if
	 *         either vector is null, shorter than 2, or the lengths differ
	 */
	public static double pearsonSim(double[] u, double[] v, double[] uc) {
		if (u == null || v == null || u.length < 2 || v.length < 2 || u.length != v.length) return Double.NaN;
		double meanA = Stats.weightedcMean(u, uc);
		double meanB = Stats.mean(v);

		double sumNum = 0.0, sumDen1 = 0.0, sumDen2 = 0.0;
		for (int i = 0; i < u.length; i++) {
			double ai = uc[i] * (u[i] - meanA);
			double bi = v[i] - meanB;

			sumNum += ai * bi;
			sumDen1 += ai * ai;
			// sumDen1 += uc[i] * (u[i] - meanA) * (u[i] - meanA);
			sumDen2 += bi * bi;
		}
		// NaN when either user has zero variance — callers appear to treat
		// NaN as "no similarity"
		return sumNum / (Math.sqrt(sumDen1) * Math.sqrt(sumDen2));
	}

	/**
	 * Calculate Spearman's Rank Correlation (SRC): Pearson correlation applied
	 * to pre-computed ranks.
	 *
	 * @param u user u's rating ranks (tied ratings get the average rank of
	 *          their spot)
	 * @param v user v's rating ranks
	 * @return Spearman's Rank Correlation (SRC); NaN for null inputs or
	 *         zero-variance ranks
	 */
	public static double SRCSim(List<Double> u, List<Double> v) {
		if (u == null || v == null) return Double.NaN;
		double meanU = Stats.mean(u);
		double meanV = Stats.mean(v);
		double sumNum = 0.0, sumDen1 = 0.0, sumDen2 = 0.0;
		for (int i = 0; i < u.size(); i++) {
			double ui = u.get(i) - meanU;
			double vi = v.get(i) - meanV;
			sumNum += ui * vi;
			sumDen1 += Math.pow(ui, 2);
			sumDen2 += Math.pow(vi, 2);
		}
		return sumNum / (Math.sqrt(sumDen1) * Math.sqrt(sumDen2));
	}

	/**
	 * Calculate Spearman's Rank Correlation (SRC), computing tie-averaged
	 * ranks from each user's full rating history first.
	 *
	 * @param a  user u's ratings (co-rated items)
	 * @param b  user v's ratings (co-rated items)
	 * @param ua user u's all ratings
	 * @param va user v's all ratings
	 * @return Spearman's Rank Correlation (SRC)
	 */
	public static double SRCSim(List<Double> a, List<Double> b, List<Double> ua, List<Double> va) {
		if (a == null || b == null) return Double.NaN;
		List<Double> usRatings = new ArrayList<>(ua);
		List<Double> vsRatings = new ArrayList<>(va);
		Collections.sort(usRatings);
		Collections.sort(vsRatings);

		// convert each co-rated rating to its average rank in the sorted
		// history (ties share the mean of their positions)
		List<Double> u = new ArrayList<>();
		List<Double> v = new ArrayList<>();
		for (Double ar : a) {
			double sum = 0;
			int count = 0;
			for (int i = 0; i < usRatings.size(); i++) {
				double usRating = usRatings.get(i);
				if (usRating == ar) {
					sum += (i + 1);
					count++;
				}
			}
			u.add(sum / count);
		}
		for (Double br : b) {
			double sum = 0;
			int count = 0;
			for (int i = 0; i < vsRatings.size(); i++) {
				double vsRating = vsRatings.get(i);
				if (vsRating == br) {
					sum += (i + 1);
					count++;
				}
			}
			v.add(sum / count);
		}

		// Pearson correlation over the rank vectors
		double meanU = Stats.mean(u);
		double meanV = Stats.mean(v);
		double sumNum = 0.0, sumDen1 = 0.0, sumDen2 = 0.0;
		for (int i = 0; i < u.size(); i++) {
			double ui = u.get(i) - meanU;
			double vi = v.get(i) - meanV;
			sumNum += ui * vi;
			sumDen1 += Math.pow(ui, 2);
			sumDen2 += Math.pow(vi, 2);
		}
		return sumNum / (Math.sqrt(sumDen1) * Math.sqrt(sumDen2));
	}

	/**
	 * Distance-based similarity: one minus the mean absolute rating
	 * difference normalized by the rating range.
	 *
	 * @param a user a's ratings
	 * @param b user b's ratings
	 * @return similarity in [0, 1], or NaN for null/empty/unequal-length input
	 */
	public static double distanceSim(List<Double> a, List<Double> b) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;
		double maxRating = Dataset.maxScale, minRating = Dataset.minScale;
		double sumNum = 0.0, sumDen = 0.0;
		for (int i = 0; i < a.size(); i++) {
			sumNum += Math.abs(a.get(i) - b.get(i));
			sumDen += Math.abs(maxRating - minRating);
		}
		return 1 - sumNum / sumDen;
	}

	/**
	 * Confidence-aware distance similarity: per item, averages a normalized
	 * rating distance and a confidence distance (and, in the Debug.ON
	 * variant, an agreement weight based on the scale median).
	 *
	 * @param a  user a's ratings
	 * @param b  user b's ratings
	 * @param ac confidences of user a's ratings
	 * @param bc confidences of user b's ratings
	 * @return similarity, or NaN for null/empty/unequal-length rating vectors
	 */
	public static double distanceSim(List<Double> a, List<Double> b, List<Double> ac, List<Double> bc) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;
		double sumNum = 0.0, sumDen = 0.0;
		for (int i = 0; i < a.size(); i++) {
			double ai = a.get(i), bi = b.get(i);
			double ca = ac.get(i), cb = bc.get(i);

			double dr = Math.abs(ai - bi);
			double dc = Math.abs(ca - cb);
			double dm = Dataset.maxScale - Dataset.minScale;
			if (Debug.ON) {
				// variant with a third term: median-side agreement weight
				double d = (ai - Dataset.median) * (bi - Dataset.median);
				double w = 1.0;
				if (d >= 0) w = 1.0 / (d + 1); // agreement
				else {
					d = -d;
					w = d / (d + 1);
				}
				sumNum += (w + dr / dm + dc / (dc + 1));
				sumDen += 3;
			} else {
				sumNum += dr / dm + dc / (dc + 1);
				sumDen += 2;
			}
		}
		return 1 - sumNum / sumDen;
	}

	/**
	 * Mean per-item distance-value similarity: averages
	 * {@code 1 - |a[i] - b[i]| / maxScale} over all items.
	 *
	 * @param a user a's ratings
	 * @param b user b's ratings
	 * @return similarity, or NaN for null/empty/unequal-length input
	 */
	public static double disValueSim(double[] a, double[] b) {
		if (a == null || b == null || a.length < 1 || b.length < 1 || a.length != b.length) return Double.NaN;
		double maxRating = Dataset.maxScale;
		double sum = 0.0;
		int count = 0;
		for (int i = 0; i < a.length; i++) {
			sum += 1 - Math.abs(a[i] - b[i]) / maxRating;
			count++;
		}
		return sum / count;
	}

	/** List-based convenience wrapper for {@link #disValueSim(double[], double[])}. */
	public static double disValueSim(List<Double> a, List<Double> b) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;
		double[] as = Lists.toArray(a);
		double[] bs = Lists.toArray(b);
		return disValueSim(as, bs);
	}

	/**
	 * Calculate the cosine similarity between two vectors.
	 *
	 * @param a user a's ratings
	 * @param b user b's ratings
	 * @return cosine similarity; NaN for null/empty/unequal-length input or
	 *         when either vector has zero norm
	 */
	public static double cosineSim(double[] a, double[] b) {
		if (a == null || b == null || a.length < 1 || b.length < 1 || a.length != b.length) return Double.NaN;
		double sum = 0.0, sum_a = 0, sum_b = 0;
		for (int i = 0; i < a.length; i++) {
			sum += a[i] * b[i];
			sum_a += a[i] * a[i];
			sum_b += b[i] * b[i];
		}
		double val = Math.sqrt(sum_a) * Math.sqrt(sum_b);
		return sum / val;
	}

	/**
	 * Calculate the SM (singularities measure) similarity proposed by
	 * Bobadilla et al. [2012]: <i>A collaborative filtering similarity
	 * measure based on singularities</i>
	 *
	 * @param a  user a's ratings
	 * @param b  user b's ratings
	 * @param sp per-item positive-singularity values
	 * @param sn per-item negative-singularity values
	 * @return SM similarity (average over the three agreement cases),
	 *         or NaN when there are no co-rated items
	 */
	public static double SMSim(List<Double> a, List<Double> b, List<Double> sp, List<Double> sn) {
		double r = Dataset.median;
		double sumA = 0, sumB = 0, sumC = 0;
		int countA = 0, countB = 0, countC = 0;
		for (int i = 0; i < a.size(); i++) {
			double ai = a.get(i);
			double bi = b.get(i);
			double pi = sp.get(i);
			double ni = sn.get(i);

			// ratings normalized to [0, 1] by the max scale
			double ri = ai / Dataset.maxScale;
			double rj = bi / Dataset.maxScale;
			if (ai > r && bi > r) {
				// A: positive agreement
				countA++;
				sumA += (1 - Math.pow(ri - rj, 2)) * Math.pow(pi, 2);
			} else if (ai <= r && bi <= r) {
				// B: negative agreement
				countB++;
				sumB += (1 - Math.pow(ri - rj, 2)) * Math.pow(ni, 2);
			} else {
				// C: disagreement
				countC++;
				sumC += (1 - Math.pow(ri - rj, 2)) * pi * ni;
			}
		}
		double score = 0;
		if (countA > 0) score += sumA / countA;
		if (countB > 0) score += sumB / countB;
		if (countC > 0) score += sumC / countC;

		if (countA + countB + countC == 0) return Double.NaN;
		else return score / 3.0;
	}

	/**
	 * Implement Weng's approach: a weighted-kappa-style agreement measure
	 * over a fixed 5x5 rating confusion matrix.
	 *
	 * <p>NOTE(review): hard-coded to integer ratings 1..5 — ratings are
	 * truncated via {@code intValue()} and index a 5x5 matrix, so other
	 * scales would throw or misbehave. A column with zero marginal C[j]
	 * yields a division by zero (Infinity/NaN); kept as-is to match the
	 * published formulation — confirm against the source paper.
	 *
	 * @param a user a's ratings (values in 1..5)
	 * @param b user b's ratings (values in 1..5)
	 * @return kappa-style similarity, or NaN for null/empty/unequal-length
	 *         input
	 */
	public static double kappa2Similarity(List<Double> a, List<Double> b) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;

		// linear disagreement weights: 1 on the diagonal, decaying by 0.25
		// per step of rating difference
		double weight[][] = { { 1.00, 0.75, 0.50, 0.25, 0.00 }, { 0.75, 1.00, 0.75, 0.50, 0.25 },
				{ 0.50, 0.75, 1.00, 0.75, 0.50 }, { 0.25, 0.50, 0.75, 1.00, 0.75 },
				{ 0.00, 0.25, 0.50, 0.75, 1.00 } };
		double data[][] = { { 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 },
				{ 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 } };
		RealMatrix weightMatrix = new Array2DRowRealMatrix(weight);
		RealMatrix observationMatrix = new Array2DRowRealMatrix(data);
		int R[] = new int[5];
		int C[] = new int[5];
		int size = a.size();

		// tally the observed (a-rating, b-rating) confusion matrix
		for (int i = 0; i < size; i++) {
			int aRating = a.get(i).intValue();
			int bRating = b.get(i).intValue();
			int idx = aRating - 1;
			int jdx = bRating - 1;
			double count = observationMatrix.getRow(idx)[jdx];
			count += 1;
			observationMatrix.setEntry(idx, jdx, count);
		}

		// row marginals (user a) and column marginals (user b)
		for (int i = 0; i < 5; i++) {
			R[i] = 0;
			for (int j = 0; j < 5; j++)
				R[i] += (int) observationMatrix.getRow(i)[j];
		}
		for (int j = 0; j < 5; j++) {
			C[j] = 0;
			for (int i = 0; i < 5; i++)
				C[j] += (int) observationMatrix.getColumn(j)[i];
		}

		double similarity = 0.0;
		double observation = 0.0, expectation = 0.0;
		for (int i = 0; i < 5; i++) {
			for (int j = 0; j < 5; j++) {
				observation += weightMatrix.getRow(i)[j] * observationMatrix.getRow(i)[j] / C[j];
			}
		}
		observation *= size;
		for (int i = 0; i < 5; i++) {
			expectation += R[i] * R[i];
		}
		similarity = (observation - expectation) / (size * size - expectation);
		return similarity;
	}

	/**
	 * Compare the trends of different similarity methods: for each vector
	 * length 1..items, draws {@code size} random rating-vector pairs,
	 * computes the chosen similarity, and writes the per-length mean and
	 * standard deviation to files on the desktop. Commented-out lines are
	 * the other measures that were compared in earlier runs.
	 *
	 * @throws Exception on file-writing failure
	 */
	@Test
	// @Ignore
	public void ijcai_trends() throws Exception {
		Dataset.minScale = 1;
		Dataset.maxScale = 5;
		Dataset.scaleSize = 5;
		int size = 1_000_000;
		int items = 20;
		// double[] pccs = new double[size];
		// double[] coss = new double[size];
		List<Double> bys = new ArrayList<>();

		String dirPath = Systems.getDesktop();
		// List<Double> ps = new ArrayList<>();
		// List<Double> cs = new ArrayList<>();
		List<Double> bs = new ArrayList<>();

		// List<Double> pstd = new ArrayList<>();
		// List<Double> cstd = new ArrayList<>();
		List<Double> bstd = new ArrayList<>();
		/*
		 * FileUtils.deleteFile(dirPath + "pcc.txt");
		 * FileUtils.deleteFile(dirPath + "cos.txt");
		 * FileUtils.deleteFile(dirPath + "by.txt");
		 * FileUtils.deleteFile(dirPath + "pcc-std.txt");
		 * FileUtils.deleteFile(dirPath + "cos-std.txt");
		 * FileUtils.deleteFile(dirPath + "by-std.txt");
		 */
		for (int i = 1; i < items + 1; i++) {
			System.out.println("Progress = " + i + "/" + items);
			bys.clear();
			for (int j = 0; j < size; j++) {
				// random rating vectors of length i, values in [1, 6)
				double[] a = Randoms.doubles(1, 6, i);
				double[] b = Randoms.doubles(1, 6, i);
				List<Double> u = Lists.toList(a);
				List<Double> v = Lists.toList(b);

				// double pcc = SimUtils.pearsonSim(a, b);
				// double cos = SimUtils.cosineSim(a, b);
				double by = SimUtils.ijcai_bs_examples(u, v, true);
				// double by = SimUtils.MSDSim(MathUtils.array2Col(a),
				// MathUtils.array2Col(b));
				// double by = SimUtils.CPCSim(u, v, 3);

				// pccs[j] = pcc;
				// coss[j] = cos;
				bys.add(by);
			}
			// double mp = MathUtils.mean(pccs);
			// mp = (1 + mp) / 2;
			// double mc = MathUtils.mean(coss);
			double mb = Stats.mean(bys);
			//mb = (1 + mb) / 2;

			// ps.add(mp);
			// cs.add(mc);
			bs.add(mb);

			// replace NaN samples by the mean before computing the sd
			for (int j = 0; j < size; j++) {
				// if (Double.isNaN(pccs[j])) pccs[j] = mp;
				if (Double.isNaN(bys.get(j))) bys.set(j, mb);
			}

			// double dp = MathUtils.sd(pccs);
			// double dc = MathUtils.sd(coss);
			double db = Stats.sd(bys);

			// pstd.add(dp);
			// cstd.add(dc);
			bstd.add(db);
		}
		// FileUtils.writeCollection(dirPath + "pcc.txt", ps, true);
		// FileUtils.writeCollection(dirPath + "cos.txt", cs, true);
		FileIO.writeList(dirPath + "cpc.txt", bs, null, true);

		// FileUtils.writeCollection(dirPath + "pcc-std.txt", pstd, true);
		// FileUtils.writeCollection(dirPath + "cos-std.txt", cstd, true);
		FileIO.writeList(dirPath + "cpc-std.txt", bstd, null, true);
	}

	/**
	 * Logs PCC/COS/BS comparisons for either random vectors (Debug.OFF
	 * branch) or a fixed set of hand-picked example pairs — the worked
	 * examples from the IJCAI paper. Output is logged only; no assertions.
	 *
	 * @throws Exception propagated from the similarity computations
	 */
	@Test
	public void ijcai_examples() throws Exception {
		Dataset.minScale = 1;
		Dataset.maxScale = 5;
		Dataset.scaleSize = 5;

		double[] a, b;
		if (Debug.OFF) {
			// randomized sweep over vector lengths 1..items
			int size = 1_00;
			int items = 10;
			for (int i = 1; i < items + 1; i++) {
				for (int j = 0; j < size; j++) {
					a = Randoms.doubles(1, 6, i);
					b = Randoms.doubles(1, 6, i);
					singleComparison(a, b);
				}
			}
		} else {
			// fixed example pairs
			a = new double[] { 1, 1, 1 };
			b = new double[] { 1, 1, 1 };
			singleComparison(a, b);

			a = new double[] { 1, 1, 1 };
			b = new double[] { 2, 2, 2 };
			singleComparison(a, b);

			a = new double[] { 1, 1, 1 };
			b = new double[] { 5, 5, 5 };
			singleComparison(a, b);

			a = new double[] { 1, 5, 1 };
			b = new double[] { 5, 1, 5 };
			singleComparison(a, b);

			a = new double[] { 2, 4, 4 };
			b = new double[] { 4, 2, 2 };
			singleComparison(a, b);

			a = new double[] { 2, 4, 4, 1 };
			b = new double[] { 4, 2, 2, 5 };
			singleComparison(a, b);

			a = new double[] { 1 };
			b = new double[] { 1 };
			singleComparison(a, b);

			a = new double[] { 1 };
			b = new double[] { 2 };
			singleComparison(a, b);

			a = new double[] { 1 };
			b = new double[] { 5 };
			singleComparison(a, b);

			a = new double[] { 1, 5 };
			b = new double[] { 5, 1 };
			singleComparison(a, b);

			a = new double[] { 1, 3 };
			b = new double[] { 4, 2 };
			singleComparison(a, b);

			a = new double[] { 5, 1 };
			b = new double[] { 5, 4 };
			singleComparison(a, b);

			a = new double[] { 4, 3 };
			b = new double[] { 3, 1 };
			singleComparison(a, b);
		}
	}

	/**
	 * Logs PCC, cosine, and both Bayesian-similarity variants (with and
	 * without chance removal) for one pair of rating vectors.
	 *
	 * @param a1 user a's ratings
	 * @param b1 user b's ratings
	 */
	private static void singleComparison(double[] a1, double[] b1) {
		List<Double> u = Lists.toList(a1);
		List<Double> v = Lists.toList(b1);

		float pcc = (float) Sims.pcc(u, v);
		float cos = (float) Sims.cos(u, v);
		float bs = (float) SimUtils.ijcai_bs_examples(u, v, true);
		float bs2 = (float) SimUtils.ijcai_bs_examples(u, v, false);

		log.info("u = {}, v = {}", u, v);
		log.info("PCC = {}, COS = {}, BS = {}, BS-1 = {}", new Object[] { pcc, cos, bs, bs2 });
	}

	/**
	 * Weighted Cohen's-kappa similarity over a fixed 5x5 rating confusion
	 * matrix: observed weighted agreement against the agreement expected
	 * from the row/column marginals.
	 *
	 * <p>NOTE(review): like {@link #kappa2Similarity(List, List)}, this is
	 * hard-coded to integer ratings 1..5.
	 *
	 * @param a user a's ratings (values in 1..5)
	 * @param b user b's ratings (values in 1..5)
	 * @return weighted kappa, or NaN for null/empty/unequal-length input
	 */
	public static double kappaSim(List<Double> a, List<Double> b) {
		if (a == null || b == null || a.size() < 1 || b.size() < 1 || a.size() != b.size()) return Double.NaN;

		double weight[][] = { { 1.00, 0.75, 0.50, 0.25, 0.00 }, { 0.75, 1.00, 0.75, 0.50, 0.25 },
				{ 0.50, 0.75, 1.00, 0.75, 0.50 }, { 0.25, 0.50, 0.75, 1.00, 0.75 },
				{ 0.00, 0.25, 0.50, 0.75, 1.00 } };
		double data[][] = { { 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 },
				{ 0.0, 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0, 0.0 } };
		RealMatrix weightMatrix = new Array2DRowRealMatrix(weight);
		RealMatrix observationMatrix = new Array2DRowRealMatrix(data);
		RealMatrix expectationMatrix = new Array2DRowRealMatrix(data);
		int R[] = new int[5];
		int C[] = new int[5];
		int size = a.size();

		// tally the observed (a-rating, b-rating) confusion matrix
		for (int i = 0; i < size; i++) {
			int aRating = a.get(i).intValue();
			int bRating = b.get(i).intValue();
			int idx = aRating - 1;
			int jdx = bRating - 1;
			double count = observationMatrix.getRow(idx)[jdx];
			count += 1;
			observationMatrix.setEntry(idx, jdx, count);
		}

		// row and column marginals
		for (int i = 0; i < 5; i++) {
			R[i] = 0;
			for (int j = 0; j < 5; j++)
				R[i] += (int) observationMatrix.getRow(i)[j];
		}
		for (int j = 0; j < 5; j++) {
			C[j] = 0;
			for (int i = 0; i < 5; i++)
				C[j] += (int) observationMatrix.getColumn(j)[i];
		}

		// expected counts under independence of the marginals
		for (int i = 0; i < 5; i++) {
			for (int j = 0; j < 5; j++) {
				double value = R[i] * C[j] / (size + 0.0);
				expectationMatrix.setEntry(i, j, value);
			}
		}

		double similarity = 0.0;
		double observation = 0.0, expectation = 0.0;
		for (int i = 0; i < 5; i++) {
			for (int j = 0; j < 5; j++) {
				observation += weightMatrix.getRow(i)[j] * observationMatrix.getRow(i)[j];
				expectation += weightMatrix.getRow(i)[j] * expectationMatrix.getRow(i)[j];
			}
		}
		similarity = (observation - expectation) / (size - expectation);
		return similarity;
	}

	/*
	 * Below are a list of significance computation methods
	 */

	/** Overlap significance: common ratings over the smaller profile size. */
	public static double overlapSig(int aSize, int uSize, int auCommon) {
		return auCommon / (Math.min(aSize, uSize) + 0.0);
	}

	/** Significance capped at 1 once {@code size} reaches {@code gamma}. */
	public static double aSig(int size, int gamma) {
		return Math.min(size, gamma) / (gamma + 0.0);
	}

	/** Saturating significance: size / (size + gamma), asymptotically 1. */
	public static double bSig(int size, int gamma) {
		return size / (size + gamma + 0.0);
	}

	/**
	 * Harmonic mean of the {@code aSig}-style weight and the similarity.
	 *
	 * @param size       number of co-rated items
	 * @param gamma      saturation threshold
	 * @param similarity raw similarity value
	 * @return significance-weighted similarity
	 */
	public static double cSig(int size, int gamma, double similarity) {
		double SW = (size < gamma) ? size / (0.0 + gamma) : 1.0;
		return 2 * SW * similarity / (SW + similarity);
	}

	/**
	 * Confidence-based significance delegated to
	 * {@code TrustUtils.confidence}.
	 *
	 * @param r presumably the count of positive evidence — confirm against
	 *          TrustUtils
	 * @param s presumably the count of negative evidence — confirm against
	 *          TrustUtils
	 */
	public static double dSig(double r, double s) throws Exception {
		return TrustUtils.confidence(r, s);
	}

	/** Alias for {@link #overlapSig(int, int, int)}. */
	public static double eSig(int aSize, int uSize, int auCommon) {
		return overlapSig(aSize, uSize, auCommon);
	}

	/** Harmonic mean of the overlap significance and the similarity. */
	public static double fSig(int aSize, int uSize, int auCommon, double similarity) {
		double os = overlapSig(aSize, uSize, auCommon);
		return 2 * os * similarity / (os + similarity);
	}
}