package com.yc.nlp.sim; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * tf-idf应用 * @author uohzoaix * */ public class BM25 { private Integer d; private Double avgdl; private List<List<String>> docs; private List<Map<String, Integer>> f = new ArrayList<Map<String, Integer>>(); private Map<String, Integer> df = new HashMap<String, Integer>(); private Map<String, Double> idf = new HashMap<String, Double>(); private Double k1; private Double b; public BM25(List<List<String>> docs) { this.d = docs.size(); double sum = 0; for (List<String> doc : docs) { sum += doc.size(); } this.avgdl = sum / this.d; this.docs = docs; this.k1 = 1.5; this.b = 0.75; init(); } public List<Map<String, Integer>> getF() { return f; } public Map<String, Double> getIdf() { return idf; } private void init() { for (List<String> doc : docs) { Map<String, Integer> tmp = new HashMap<String, Integer>(); for (String ch : doc) { String word = ch.toString(); tmp.put(word, tmp.containsKey(word) ? tmp.get(word) + 1 : 1); } this.f.add(tmp); for (Map.Entry<String, Integer> entry : tmp.entrySet()) { String word = entry.getKey(); this.df.put(word, df.containsKey(word) ? df.get(word) + 1 : 1); } } for (Map.Entry<String, Integer> entry : df.entrySet()) { this.idf.put(entry.getKey(), Math.log(this.d - entry.getValue() + 0.5) - Math.log(entry.getValue() + 0.5)); } } public double sim(List<String> doc, Integer index) { double score = 0; for (String ch : doc) { String word = ch.toString(); if (!this.f.get(index).containsKey(word)) { continue; } score += (this.idf.get(word) * this.f.get(index).get(word) * (this.k1 + 1) / (this.f.get(index).get(word) + this.k1 * (1 - this.b + this.b * this.d / this.avgdl))); } return score; } public List<Double> simall(List<String> doc) { List<Double> scores = new ArrayList<Double>(); for (int i = 0; i < this.d; i++) { scores.add(this.sim(doc, i)); } return scores; } }