package xyz.anduo.crawler;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * In-memory Web graph.
 *
 * <p>Every URL added to the graph is assigned a positive integer identifier. Weighted, directed
 * links between identifiers are stored twice: once in the outgoing-link map of the source node
 * and once in the incoming-link map of the target node. All mutating methods keep the two views
 * in sync.
 *
 * <p>This class performs no synchronization.
 *
 * @author anduo
 */
public class WebGraphMemory {

    /** URL prefixes that are skipped before looking for the slash that ends the host part. */
    private static final String[] KNOWN_SCHEMES = {"http://", "https://", "ftp://"};

    /** Maps each node identifier to the URL it was first registered with. */
    private final Map<Integer, String> idToUrl;

    /**
     * Maps a host part (scheme included, e.g. {@code http://example.com}) to a map from the
     * remainder of the URL after the host's slash to the node identifier.
     */
    private final Map<String, Map<String, Integer>> hostToPathIds;

    /**
     * Incoming links: the key is the target node id, the value maps each source node id to the
     * weight of the link.
     */
    private final Map<Integer, Map<Integer, Double>> inLinkMap;

    /**
     * Outgoing links: the key is the source node id, the value maps each target node id to the
     * weight of the link.
     */
    private final Map<Integer, Map<Integer, Double>> outLinkMap;

    /** Number of nodes in the graph; also the identifier most recently handed out. */
    private int nodeCount;

    /**
     * Creates an empty graph with zero nodes.
     */
    public WebGraphMemory() {
        idToUrl = new HashMap<Integer, String>();
        hostToPathIds = new HashMap<String, Map<String, Integer>>();
        inLinkMap = new HashMap<Integer, Map<Integer, Double>>();
        outLinkMap = new HashMap<Integer, Map<Integer, Double>>();
        nodeCount = 0;
    }

    /**
     * Builds a graph from a text file with one entry per line.
     *
     * <p>A line of the form {@code http://url1.com -> http://url2.com 1.0} means that
     * {@code http://url1.com} contains a hyperlink to {@code http://url2.com} with weight 1.0.
     * The weight is optional and defaults to 1.0. A line without {@code ->} registers a single
     * node. Blank lines are skipped.
     *
     * @param file the file to read
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading or closing the file fails
     */
    public WebGraphMemory(File file) throws IOException, FileNotFoundException {
        this();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                addLine(line);
            }
        } finally {
            // Always release the file handle, even when a line fails to load.
            reader.close();
        }
    }

    /**
     * Parses one line of the graph file and adds the node or link it describes.
     */
    private void addLine(String line) {
        int arrow = line.indexOf("->");
        if (arrow == -1) {
            String url = line.trim();
            // A blank line describes nothing; do not register a node for the empty URL.
            if (url.length() > 0) {
                addLink(url);
            }
            return;
        }
        String source = line.substring(0, arrow).trim();
        String target = line.substring(arrow + 2).trim();
        Double strength = Double.valueOf(1.0);
        int space = target.indexOf(" ");
        if (space != -1) {
            try {
                // Parse first: the target is only truncated when the suffix really is a number.
                strength = Double.valueOf(target.substring(space + 1).trim());
                target = target.substring(0, space).trim();
            } catch (NumberFormatException ignored) {
                // The text after the space is not a weight: keep the whole remainder as the
                // target URL and fall back to the default weight.
            }
        }
        addLink(source, target, strength);
    }

    /**
     * Splits a URL into the host part (including a recognised scheme prefix) and the rest.
     *
     * @return a two-element array: index 0 is everything before the first slash that follows
     *         the scheme prefix, index 1 is everything after that slash; when there is no such
     *         slash, index 0 is the whole URL and index 1 is the empty string
     */
    private static String[] splitHostAndPath(String url) {
        int hostStart = 0;
        for (String scheme : KNOWN_SCHEMES) {
            if (url.startsWith(scheme)) {
                hostStart = scheme.length();
                break;
            }
        }
        int slash = url.indexOf('/', hostStart);
        if (slash == -1) {
            return new String[] {url, ""};
        }
        return new String[] {url.substring(0, slash), url.substring(slash + 1)};
    }

    /**
     * Returns the bare host of a URL: the text after the first {@code ://} (if any) and before
     * the next slash (if any).
     */
    private static String hostOf(String url) {
        String host = url;
        int pos = host.indexOf("://");
        if (pos != -1) {
            host = host.substring(pos + 3);
        }
        pos = host.indexOf("/");
        if (pos != -1) {
            host = host.substring(0, pos);
        }
        return host;
    }

    /**
     * Looks up the identifier assigned to a URL.
     *
     * @param URL the URL to look up
     * @return the identifier, or {@code null} if the URL is not in the graph
     */
    public Integer URLToIdentifyer(String URL) {
        String[] parts = splitHostAndPath(URL);
        Map<String, Integer> pathIds = hostToPathIds.get(parts[0]);
        return (pathIds == null) ? null : pathIds.get(parts[1]);
    }

    /**
     * Looks up the URL a node identifier was registered with.
     *
     * @param id the node identifier
     * @return the URL, or {@code null} if no node has this identifier
     */
    public String IdentifyerToURL(Integer id) {
        return idToUrl.get(id);
    }

    /**
     * Adds a node for the given URL unless one already exists.
     *
     * @param link the URL of the node
     * @return the identifier of the (new or existing) node
     */
    public Integer addLink(String link) {
        String[] parts = splitHostAndPath(link);
        Map<String, Integer> pathIds = hostToPathIds.get(parts[0]);
        if (pathIds == null) {
            pathIds = new HashMap<String, Integer>();
            hostToPathIds.put(parts[0], pathIds);
        }
        Integer id = pathIds.get(parts[1]);
        if (id == null) {
            id = Integer.valueOf(++nodeCount);
            pathIds.put(parts[1], id);
            idToUrl.put(id, link);
            inLinkMap.put(id, new HashMap<Integer, Double>());
            outLinkMap.put(id, new HashMap<Integer, Double>());
        }
        return id;
    }

    /**
     * Adds a weighted link between two URLs, creating either node if it does not exist yet.
     * If the link already exists, the larger of the stored and the given weight is kept.
     *
     * @param fromLink URL of the page containing the hyperlink
     * @param toLink URL the hyperlink points to
     * @param weight weight of the link; must not be {@code null}
     * @return the weight stored for the link after the call
     * @throws IllegalArgumentException if {@code weight} is {@code null}
     */
    public Double addLink(String fromLink, String toLink, Double weight) {
        // Validate before touching the graph so a bad call does not leave stray nodes behind.
        if (weight == null) {
            throw new IllegalArgumentException("weight must not be null");
        }
        Integer fromId = addLink(fromLink);
        Integer toId = addLink(toLink);
        return addLink(fromId, toId, weight);
    }

    /**
     * Adds a weighted link between two existing nodes, keeping the larger weight when the link
     * is already present, and writes the result to both the incoming and the outgoing view.
     */
    private Double addLink(Integer fromLink, Integer toLink, Double weight) {
        Map<Integer, Double> incoming = inLinkMap.get(toLink);
        Map<Integer, Double> outgoing = outLinkMap.get(fromLink);

        double strongest = weight.doubleValue();
        // The new weight only wins when the stored one is strictly smaller.
        Double known = incoming.get(fromLink);
        if (known != null && !(known.doubleValue() < strongest)) {
            strongest = known.doubleValue();
        }
        known = outgoing.get(toLink);
        if (known != null && !(known.doubleValue() < strongest)) {
            strongest = known.doubleValue();
        }

        Double stored = Double.valueOf(strongest);
        incoming.put(fromLink, stored);
        outgoing.put(toLink, stored);
        return stored;
    }

    /**
     * Returns the incoming links of a URL.
     *
     * @param URL the target URL
     * @return the internal map from source node id to weight, or {@code null} if the URL is not
     *         in the graph
     */
    public Map<Integer, Double> inLinks(String URL) {
        return inLinks(URLToIdentifyer(URL));
    }

    /**
     * Returns the incoming links of a node.
     *
     * @param link the target node id
     * @return the internal map from source node id to weight, or {@code null} if {@code link}
     *         is {@code null} or unknown
     */
    public Map<Integer, Double> inLinks(Integer link) {
        if (link == null) {
            return null;
        }
        return inLinkMap.get(link);
    }

    /**
     * Returns the outgoing links of a URL.
     *
     * @param URL the source URL
     * @return the internal map from target node id to weight, or {@code null} if the URL is not
     *         in the graph
     */
    public Map<Integer, Double> outLinks(String URL) {
        return outLinks(URLToIdentifyer(URL));
    }

    /**
     * Returns the outgoing links of a node.
     *
     * @param link the source node id
     * @return the internal map from target node id to weight, or {@code null} if {@code link}
     *         is {@code null} or unknown
     */
    public Map<Integer, Double> outLinks(Integer link) {
        if (link == null) {
            return null;
        }
        return outLinkMap.get(link);
    }

    /**
     * Returns the weight of the link between two URLs, read from the incoming-link view.
     *
     * @return the weight, or 0 if the nodes are not connected
     */
    public Double inLink(String fromLink, String toLink) {
        return inLink(URLToIdentifyer(fromLink), URLToIdentifyer(toLink));
    }

    /**
     * Returns the weight of the link between two URLs, read from the outgoing-link view.
     *
     * @return the weight, or 0 if the nodes are not connected
     */
    public Double outLink(String fromLink, String toLink) {
        return outLink(URLToIdentifyer(fromLink), URLToIdentifyer(toLink));
    }

    /**
     * Returns the weight of the link between two nodes, read from the incoming-link view.
     *
     * @return the weight, or 0 if the nodes are not connected
     */
    public Double inLink(Integer fromLink, Integer toLink) {
        Map<Integer, Double> incoming = inLinks(toLink);
        if (incoming == null) {
            return Double.valueOf(0.0);
        }
        Double weight = incoming.get(fromLink);
        return (weight == null) ? Double.valueOf(0.0) : weight;
    }

    /**
     * Returns the weight of the link between two nodes, read from the outgoing-link view.
     *
     * @return the weight, or 0 if the nodes are not connected
     */
    public Double outLink(Integer fromLink, Integer toLink) {
        Map<Integer, Double> outgoing = outLinks(fromLink);
        if (outgoing == null) {
            return Double.valueOf(0.0);
        }
        Double weight = outgoing.get(toLink);
        return (weight == null) ? Double.valueOf(0.0) : weight;
    }

    /**
     * Turns the directed graph into an undirected one: for every link a reverse link with the
     * same weight is added (an existing reverse link keeps the larger weight).
     */
    public void transformUnidirectional() {
        // Work on a copy of the edges so that adding reverse links never touches a map that is
        // being iterated.
        Map<Integer, Map<Integer, Double>> snapshot = new HashMap<Integer, Map<Integer, Double>>();
        for (Map.Entry<Integer, Map<Integer, Double>> node : outLinkMap.entrySet()) {
            snapshot.put(node.getKey(), new HashMap<Integer, Double>(node.getValue()));
        }
        for (Map.Entry<Integer, Map<Integer, Double>> node : snapshot.entrySet()) {
            Integer source = node.getKey();
            for (Map.Entry<Integer, Double> edge : node.getValue().entrySet()) {
                addLink(edge.getKey(), source, edge.getValue());
            }
        }
    }

    /**
     * Removes internal links, i.e. links whose source and target are on the same host.
     */
    public void removeInternalLinks() {
        for (Map.Entry<Integer, Map<Integer, Double>> node : outLinkMap.entrySet()) {
            Integer source = node.getKey();
            String sourceHost = hostOf(idToUrl.get(source));
            Iterator<Integer> targets = node.getValue().keySet().iterator();
            while (targets.hasNext()) {
                Integer target = targets.next();
                if (sourceHost.equals(hostOf(idToUrl.get(target)))) {
                    // Iterator.remove is the only safe way to drop the entry mid-iteration.
                    targets.remove();
                    inLinkMap.get(target).remove(source);
                }
            }
        }
    }

    /**
     * Removes nepotistic (internal navigation) links. Currently identical to
     * {@link #removeInternalLinks()}.
     */
    public void removeNepotistic() {
        removeInternalLinks();
    }

    /**
     * Removes every link from or to a node whose host is one of the given stop hosts.
     *
     * @param stopURLs the stop hosts, without scheme or path (e.g. {@code example.com});
     *        must not be {@code null}
     */
    public void removeStopLinks(String[] stopURLs) {
        HashMap<String, Object> hosts = new HashMap<String, Object>();
        for (int i = 0; i < stopURLs.length; i++) {
            hosts.put(stopURLs[i], null);
        }
        removeStopLinks(hosts);
    }

    /**
     * Removes every link from or to a node whose host is a key of the given map. The nodes
     * themselves stay in the graph.
     *
     * @param stopURLs map whose keys are the stop hosts, without scheme or path; the values
     *        are ignored; must not be {@code null}
     */
    public void removeStopLinks(Map<String, Object> stopURLs) {
        for (Map.Entry<Integer, Map<Integer, Double>> node : outLinkMap.entrySet()) {
            Integer stopped = node.getKey();
            if (!stopURLs.containsKey(hostOf(idToUrl.get(stopped)))) {
                continue;
            }
            // Drop the mirrored entries held by the neighbours so both views stay consistent.
            for (Integer target : node.getValue().keySet()) {
                if (!target.equals(stopped)) {
                    inLinkMap.get(target).remove(stopped);
                }
            }
            for (Integer source : inLinkMap.get(stopped).keySet()) {
                if (!source.equals(stopped)) {
                    outLinkMap.get(source).remove(stopped);
                }
            }
            // Replace rather than clear, so maps handed out earlier are left untouched.
            node.setValue(new HashMap<Integer, Double>());
            inLinkMap.put(stopped, new HashMap<Integer, Double>());
        }
    }

    /**
     * Returns the number of nodes in the graph.
     */
    public int numNodes() {
        return nodeCount;
    }
}