package xyz.anduo.crawler;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.persist.EntityCursor;
import com.sleepycat.persist.EntityIndex;
import com.sleepycat.persist.EntityStore;
import com.sleepycat.persist.PrimaryIndex;
import com.sleepycat.persist.SecondaryIndex;
import com.sleepycat.persist.StoreConfig;
/**
 * Models the link graph of crawled web pages on top of a Berkeley DB Java Edition entity store.
 *
 * <p>Every stored {@link Link} carries a source URL ({@code fromURL}) and the set of URLs that
 * page links to ({@code toURL}). The primary index is looked up by source URL and therefore
 * answers out-link queries; the secondary index on {@code toURL} answers in-link queries.
 *
 * <p>Call {@link #close()} when the graph is no longer needed so the underlying store and
 * environment are released.
 *
 * @author anduo
 */
public class WebGraph {
    /** Environment that hosts the entity store; kept so it can be closed in {@link #close()}. */
    private final Environment env;

    /** Entity store holding the {@link Link} records. */
    private final EntityStore store;

    /** Out-links: primary index from a source URL to its {@link Link} record. */
    private final PrimaryIndex<String, Link> outLinkIndex;

    /** In-links: secondary index from a target URL to the {@link Link} records containing it. */
    private final SecondaryIndex<String, String, Link> inLinkIndex;

    /**
     * Opens (creating if necessary) the non-transactional environment and entity store located
     * in the given directory and obtains the out-link and in-link indexes.
     *
     * @param dbDir path of the Berkeley DB environment home directory
     * @throws DatabaseException if the environment, the store or an index cannot be opened
     */
    public WebGraph(String dbDir) throws DatabaseException {
        File envDir = new File(dbDir);

        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setTransactional(false);
        envConfig.setAllowCreate(true);
        env = new Environment(envDir, envConfig);

        StoreConfig storeConfig = new StoreConfig();
        storeConfig.setAllowCreate(true);
        storeConfig.setTransactional(false);
        store = new EntityStore(env, "classDb", storeConfig);

        outLinkIndex = store.getPrimaryIndex(String.class, Link.class);
        inLinkIndex = store.getSecondaryIndex(outLinkIndex, String.class, "toURL");
    }

    /**
     * Builds the graph from a text file. Every line describes one link, for example
     * {@code http://url1.com -> http://url2.com 1.0}, meaning that the page
     * {@code http://url1.com} contains a hyperlink to {@code http://url2.com}. Anything after
     * the first space following the target URL (such as the weight {@code 1.0}) is ignored.
     * Lines without {@code ->} and lines whose source or target URL is empty are skipped.
     *
     * @param file the link list to read
     * @throws FileNotFoundException if {@code file} cannot be opened for reading
     * @throws IOException if reading the file fails
     * @throws DatabaseException if storing a link fails
     */
    public void load(File file) throws IOException, FileNotFoundException, DatabaseException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                int arrow = line.indexOf("->");
                if (arrow == -1) {
                    continue;
                }
                String source = line.substring(0, arrow).trim();
                String target = line.substring(arrow + 2).trim();

                // Drop the trailing weight column, if any; only the URL itself is stored.
                int space = target.indexOf(" ");
                if (space != -1) {
                    target = target.substring(0, space).trim();
                }

                // A link with a missing endpoint carries no information, so do not store it.
                if (source.length() == 0 || target.length() == 0) {
                    continue;
                }
                addLink(source, target);
            }
        } finally {
            // Always release the file handle, even when a line fails to be stored.
            reader.close();
        }
    }

    /**
     * Records a link from one page to another. If no record exists yet for {@code fromLink} a
     * new one is created; otherwise {@code toLink} is added to the existing record's set of
     * targets.
     *
     * @param fromLink URL of the page containing the hyperlink
     * @param toLink URL the hyperlink points to
     * @throws DatabaseException if reading or writing the store fails
     */
    public void addLink(String fromLink, String toLink) throws DatabaseException {
        Link fresh = new Link();
        fresh.fromURL = fromLink;
        fresh.toURL = new HashSet<String>();
        fresh.toURL.add(toLink);

        // putNoOverwrite returns false when a record for this source URL already exists.
        if (outLinkIndex.putNoOverwrite(fresh)) {
            return;
        }

        Link existing = outLinkIndex.get(fromLink);
        // Only write the record back when the target was not already present.
        if (existing.toURL.add(toLink)) {
            outLinkIndex.put(existing);
        }
    }

    /**
     * Returns the source URLs of all stored links that point to the given URL.
     *
     * @param URL the target URL to look up
     * @return the {@code fromURL} of every matching record; empty if there is none
     * @throws DatabaseException if reading the store fails
     */
    public String[] inLinks(String URL) throws DatabaseException {
        EntityIndex<String, Link> subIndex = inLinkIndex.subIndex(URL);
        // Collect while scanning rather than pre-sizing from count(), whose result is not
        // guaranteed to match the number of entities the cursor actually yields.
        List<String> sources = new ArrayList<String>();
        EntityCursor<Link> cursor = subIndex.entities();
        try {
            for (Link entity : cursor) {
                sources.add(entity.fromURL);
            }
        } finally {
            cursor.close();
        }
        return sources.toArray(new String[sources.size()]);
    }

    /**
     * Closes the entity store and then the environment that hosts it. The graph must not be
     * used afterwards.
     *
     * @throws DatabaseException if closing the store or the environment fails
     */
    public void close() throws DatabaseException {
        try {
            store.close();
        } finally {
            env.close();
        }
    }
}