package org.wikibrain.core.dao.sql;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import org.wikibrain.core.dao.*;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.*;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Builds a directed graph among categories and pages using daos.
* Also calculates page rank among pages.
*/
public class LocalCategoryGraphBuilder {
private static final Logger LOG = LoggerFactory.getLogger(LocalCategoryGraphBuilder.class);
private final LocalPageDao pageDao;
private final LocalCategoryMemberDao catDao;
public LocalCategoryGraphBuilder(LocalPageDao pageDao, LocalCategoryMemberDao catDao) {
this.pageDao = pageDao;
this.catDao = catDao;
}
/**
*
* @param language
* @return
* @throws DaoException
*/
public CategoryGraph build(Language language) throws DaoException {
CategoryGraph graph = new CategoryGraph(language);
loadCategories(graph);
buildGraph(graph);
computePageRanks(graph);
return graph;
}
private void loadCategories(CategoryGraph graph) throws DaoException {
LOG.info("loading categories...");
graph.catIndexes = new TIntIntHashMap();
List<String> catList = new ArrayList<String>();
Iterable<LocalPage> catIter = pageDao.get(new DaoFilter()
.setNameSpaces(NameSpace.CATEGORY)
.setLanguages(graph.language)
);
TIntList catIds = new TIntArrayList();
for (LocalPage cat : catIter) {
if (cat != null) {
if (graph.catIndexes.containsKey(cat.getLocalId())) {
continue;
}
assert(catList.size() == graph.catIndexes.size());
assert(catIds.size() == graph.catIndexes.size());
int ci = graph.catIndexes.size();
graph.catIndexes.put (cat.getLocalId(), ci);
catList.add(cat.getTitle().getCanonicalTitle());
catIds.add(cat.getLocalId());
}
}
graph.cats = catList.toArray(new String[0]);
graph.catIds = catIds.toArray();
LOG.info("finished loading " + graph.cats.length + " categories");
}
private void buildGraph(CategoryGraph graph) throws DaoException {
LOG.info("building category graph");
graph.catPages = new int[graph.catIndexes.size()][];
graph.catParents = new int[graph.catIndexes.size()][];
graph.catChildren = new int[graph.catIndexes.size()][];
graph.catCosts = new double[graph.catIndexes.size()];
Arrays.fill(graph.catPages, new int[0]);
Arrays.fill(graph.catParents, new int[0]);
Arrays.fill(graph.catChildren, new int[0]);
// count reverse edges
int totalEdges = 0;
int numCatChildren[] = new int[graph.catIndexes.size()];
int numCatParents[] = new int[graph.catIndexes.size()];
int numCatPages[] = new int[graph.catIndexes.size()];
DaoFilter filter = new DaoFilter().setLanguages(graph.language);
for (LocalCategoryMember lcm : catDao.get(filter)) {
int catIndex1 = graph.catIdToIndex(lcm.getArticleId()); // cat index for page (probably -1)
int catIndex2 = graph.catIdToIndex(lcm.getCategoryId()); // cat index for cat
if (catIndex1 >= 0 && catIndex2 >= 0) {
numCatChildren[catIndex2]++;
numCatParents[catIndex1]++;
} else if (catIndex2 >= 0) {
numCatPages[catIndex2]++;
}
totalEdges++;
}
// allocate space
for (int i = 0; i < graph.catIndexes.size(); i++) {
graph.catPages[i] = new int[numCatPages[i]];
graph.catChildren[i] = new int[numCatChildren[i]];
graph.catParents[i] = new int[numCatParents[i]];
}
// fill it
for (LocalCategoryMember lcm : catDao.get(filter)) {
int catIndex1 = graph.catIdToIndex(lcm.getArticleId()); // cat index for page (probably -1)
int catIndex2 = graph.catIdToIndex(lcm.getCategoryId()); // cat index for cat
if (catIndex1 >= 0 && catIndex2 >= 0) {
graph.catChildren[catIndex2][--numCatChildren[catIndex2]] = catIndex1;
graph.catParents[catIndex1][--numCatParents[catIndex1]] = catIndex2;
} else if (catIndex2 >= 0) {
graph.catPages[catIndex2][--numCatPages[catIndex2]] = lcm.getArticleId();
}
}
for (int n : numCatChildren) { assert(n == 0); }
for (int n : numCatPages) { assert(n == 0); }
for (int n : numCatParents) { assert(n == 0); }
LOG.info("loaded " + totalEdges + " edges in category graph");
}
public void computePageRanks(CategoryGraph graph) {
if (graph.catIds.length == 0) {
LOG.info("No categories found. Skipping page rank calculation.");
return;
}
LOG.info("computing category page ranks...");
// initialize page rank
long sumCredits = graph.catPages.length; // each category gets 1 credit to start
for (int i = 0; i < graph.catPages.length; i++) {
sumCredits += graph.catPages[i].length; // one more credit per page that references it.
}
for (int i = 0; i < graph.catPages.length; i++) {
graph.catCosts[i] = (1.0 + graph.catPages[i].length) / sumCredits;
}
for (int i = 0; i < 20; i++) {
LOG.info("performing page ranks iteration {0}.", i);
double error = onePageRankIteration(graph);
LOG.info("Error for iteration is {0}.", error);
if (error == 0) {
break;
}
}
Integer sortedIndexes[] = new Integer[graph.catCosts.length];
for (int i = 0; i < graph.catParents.length; i++) {
graph.catCosts[i] = 1.0/-Math.log(graph.catCosts[i]);
sortedIndexes[i] = i;
}
LOG.info("finished computing page ranks...");
final double[] costs = graph.catCosts;
Arrays.sort(sortedIndexes, new Comparator<Integer>() {
@Override
public int compare(Integer i1, Integer i2) {
Double pr1 = costs[i1];
Double pr2 = costs[i2];
return -1 * pr1.compareTo(pr2);
}
});
StringBuilder b = new StringBuilder();
for (int i = 0; i < 20 && i < sortedIndexes.length; i++) {
int j = sortedIndexes[i];
b.append("" + i + ". " + graph.cats[j] + "=" + graph.catCosts[j]);
b.append(", ");
}
graph.minCost = graph.catCosts[sortedIndexes[sortedIndexes.length - 1]];
LOG.info("Min cat cost: " + graph.minCost);
LOG.info("Top cat costs: " + b.toString());
}
private static final double DAMPING_FACTOR = 0.85;
public double onePageRankIteration(CategoryGraph graph) {
double nextRanks [] = new double[graph.catCosts.length];
Arrays.fill(nextRanks, (1.0 - DAMPING_FACTOR) / graph.catCosts.length);
for (int i = 0; i < graph.catParents.length; i++) {
int d = graph.catParents[i].length; // degree
double pr = graph.catCosts[i]; // current page-rank
for (int j : graph.catParents[i]) {
nextRanks[j] += DAMPING_FACTOR * pr / d;
}
}
double diff = 0.0;
for (int i = 0; i < graph.catParents.length; i++) {
diff += Math.abs(graph.catCosts[i] - nextRanks[i]);
}
graph.catCosts = nextRanks;
return diff;
}
}