package xyz.anduo.crawler;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Hashtable;
/**
* PageRank PageRank算法
*
* @author anduo
*
*/
public class PageRank {
private double[] rank;
Hashtable<String, Integer> hashedPages;
String[] sortedRank;
public PageRank() {}
private void rankFilter(BigMatrix dataMatrix) {
String[] tempRank = new String[sortedRank.length];
Boolean isEqual = true;
// 迭代计算,直到数据收敛或者次数达到50次
for (int i = 0; i < 50; i++) {
rank = dataMatrix.multiply(rank);
// 拷贝当前的数组值到临时数组
for (int j = 0; j < sortedRank.length; j++) {
tempRank[j] = sortedRank[j];
}
// 排序
Arrays.sort(sortedRank, new compareByRank());
// 计算是否收敛
for (int j = 0; j < sortedRank.length; j++) {
if (sortedRank[j].compareTo(tempRank[j]) != 0) {
isEqual = false;
break;
}
}
if (isEqual == true) {
break;
} else {
isEqual = true;
}
}
}
class compareByRank implements Comparator<String> {
public int compare(String a, String b) {
int indexA = hashedPages.get(a);
int indexB = hashedPages.get(b);
if (rank[indexA] == rank[indexB]) {
return (0);
} else if (rank[indexA] > rank[indexB]) {
return (-1);
} else {
return (1);
}
}
}
public java.lang.String[] pageRank(java.lang.String[] s) {
// height of data
int theSize = Math.max(4 * s.length / 3 + 1, 16);
// 初始化
hashedPages = new Hashtable<String, Integer>(theSize);
String[] pages = new String[s.length]; // theSize
int[] nLinks = new int[s.length]; // theSize
rank = new double[s.length];
sortedRank = new String[s.length];
String[] dataEntry = new String[s.length];
// 获取数据
for (int i = 0; i < s.length; i++) {
String[] temp = s[i].split(" ");
pages[i] = temp[0];
nLinks[i] = temp.length - 1;
sortedRank[i] = temp[0];
rank[i] = 1;
dataEntry[i] = "";
hashedPages.put(pages[i], i);
}
int tRow, tCol;
// 初始化矩阵
for (int i = 0; i < s.length; i++) {
String[] temp = s[i].split(" ");
for (int j = 1; j < temp.length; j++) {
tCol = hashedPages.get(temp[0]); // "to" aka row
tRow = hashedPages.get(temp[j]); // "from" aka col
// assumes no pages link to each other. else an if-statement is needed to check for i vs. j
// self-linking
dataEntry[tRow] += "{" + tCol + "," + (1 / (double) nLinks[i]) + "};";
}
}
// 创建矩阵数据
BigMatrix dataMatrix = new BigMatrix(dataEntry);
// 排序
rankFilter(dataMatrix);
// 返回排序后的URL列表
return (sortedRank);
}
}
// 矩阵
class BigMatrix {
public int nCols, nRows;
EntryList[] theRows;
// 构造函数采用String的数组作为输入,例如{"(1,1); (4,3); (5,8)", "(2,5); (3,4)","(3,8);(4,5)"}
// 每个字符串能够初始化一行数据。例如,(2,5)表示在第二行的第二列值为5
public BigMatrix(java.lang.String[] x) {
nRows = x.length;
nCols = 0;
theRows = new EntryList[nRows];
for (int i = 0; i < nRows; i++) {
theRows[i] = new EntryList();
if (x[i] != null) {
String[] tempArr = x[i].split(";");
if (tempArr[0] != null) {
for (int j = 0; j < tempArr.length; j++) {
Entry instance = new Entry(tempArr[j]);
theRows[i].add(instance);
if (nCols <= instance.col) {
nCols = instance.col + 1;
}
}
}
}
}
}
// 乘以1维向量
public double[] multiply(double[] x) {
double[] result = new double[nRows];
for (int i = 0; i < nRows; i++) {
EntryList temp = theRows[i];
while ((temp != null) && (temp.data != null)) {
result[i] += (temp.data.value * x[temp.data.col]);
temp = temp.next;
}
}
return (result);
}
}
// 矩阵的元素元素
class Entry {
int col;// 元素所在列
double value;// 元素值
public Entry(java.lang.String x) {
String[] temp = x.split(",");
if (temp[0].compareTo("") != 0) {
col = Integer.parseInt(temp[0].trim().substring(1));
value = Double.parseDouble(temp[1].trim().substring(0, temp[1].trim().length() - 1));
}
}
}
// 元素列表,对行进行建模
class EntryList {
Entry data;
EntryList next, tail;
public EntryList() {
next = null;
tail = null;
data = null;
}
// 添加数据
void add(Entry x) {
if (tail == null) {
data = x;
tail = this;
} else {
tail.next = new EntryList();
tail.next.data = x;
tail = tail.next;
}
}
}