package focusedCrawler.link;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.StringTokenizer;
import focusedCrawler.util.parser.BackLinkNeighborhood;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.persistence.PersistentHashtable;
import focusedCrawler.util.persistence.Tuple;
/**
 * Repository for a bipartite hub/authority link graph, backed by five
 * {@link PersistentHashtable} instances created under a common data path.
 *
 * <p>Stored record formats (as written by the insert methods of this class):
 * <ul>
 *   <li>{@code url2id}: URL string to numeric id string; the reserved key
 *       {@code "MAX"} holds the last id that was assigned.</li>
 *   <li>{@code hubGraph} / {@code authGraph}: id to a list of ids, each one
 *       followed by {@code "###"}.</li>
 *   <li>{@code authID}: id to {@code url:::anchor:::around}.</li>
 *   <li>{@code hubID}: id to {@code url:::title} (the title may be empty).</li>
 * </ul>
 *
 * <p>The insert methods and {@link #commit()} are {@code synchronized}; the
 * lookup methods and {@link #close()} are not.
 */
public class BipartiteGraphRepository {

    /** Number of insert calls after which pending changes are committed. */
    private static final int PAGES_TO_COMMIT = 100;

    /** Terminates every id inside a graph record. */
    private static final String LINK_SEPARATOR = "###";

    /** Separates the fields of an {@code authID} / {@code hubID} record. */
    private static final String FIELD_SEPARATOR = ":::";

    /** Reserved key in {@code url2id} that stores the last assigned id. */
    private static final String MAX_ID_KEY = "MAX";

    private static final String AUTH_GRAPH_DIRECTORY = "data_backlinks/auth_graph";
    private static final String URL_ID_DIRECTORY = "data_backlinks/url";
    private static final String AUTH_ID_DIRECTORY = "data_backlinks/auth_id";
    private static final String HUB_ID_DIRECTORY = "data_backlinks/hub_id";
    private static final String HUB_GRAPH_DIRECTORY = "data_backlinks/hub_graph";

    /** Insert calls performed since the last automatic commit. */
    private int uncommittedCount = 0;

    private final PersistentHashtable<String> authGraph;
    private final PersistentHashtable<String> authID;
    private final PersistentHashtable<String> hubGraph;
    private final PersistentHashtable<String> hubID;
    private final PersistentHashtable<String> url2id;

    /**
     * Creates the five backing tables in sub-directories of {@code dataPath}.
     *
     * @param dataPath base directory; each table lives in
     *                 {@code dataPath + "/" + <table directory>}
     */
    public BipartiteGraphRepository(String dataPath) {
        int cacheSize = 10000;
        this.authGraph = new PersistentHashtable<>(dataPath + "/" + AUTH_GRAPH_DIRECTORY, cacheSize, String.class);
        this.url2id = new PersistentHashtable<>(dataPath + "/" + URL_ID_DIRECTORY, cacheSize, String.class);
        this.authID = new PersistentHashtable<>(dataPath + "/" + AUTH_ID_DIRECTORY, cacheSize, String.class);
        this.hubID = new PersistentHashtable<>(dataPath + "/" + HUB_ID_DIRECTORY, cacheSize, String.class);
        this.hubGraph = new PersistentHashtable<>(dataPath + "/" + HUB_GRAPH_DIRECTORY, cacheSize, String.class);
    }

    /**
     * @return every entry of the authority graph table
     * @throws Exception if the table cannot be read
     */
    public Tuple<String>[] getAuthGraph() throws Exception {
        return authGraph.getTableAsArray();
    }

    /**
     * @return every entry of the hub graph table
     * @throws Exception if the table cannot be read
     */
    public Tuple<String>[] getHubGraph() throws Exception {
        return hubGraph.getTableAsArray();
    }

    /**
     * Looks up the id stored for a URL without assigning a new one.
     *
     * @param url URL string used as key
     * @return the stored id, or whatever the table returns for a missing key
     */
    public String getID(String url) {
        return url2id.get(url);
    }

    /**
     * @param id id used as key in the hub id table
     * @return the URL part of the stored record, or {@code null} if there is no record
     * @throws IOException declared for callers; not raised by the current implementation
     */
    public String getHubURL(String id) throws IOException {
        return firstField(hubID.get(id));
    }

    /**
     * @param id id used as key in the authority id table
     * @return the URL part of the stored record, or {@code null} if there is no record
     */
    public String getAuthURL(String id) {
        return firstField(authID.get(id));
    }

    /**
     * @param id id used as key in the hub graph
     * @return the ids stored for {@code id}, or {@code null} if there is no record
     */
    public String[] getOutlinks(String id) {
        return splitLinkIds(hubGraph.get(id));
    }

    /**
     * @param id id used as key in the authority graph
     * @return the ids stored for {@code id}, or {@code null} if there is no record
     */
    public String[] getBacklinks(String id) {
        return splitLinkIds(authGraph.get(id));
    }

    /**
     * Parses every record of the authority id table.
     *
     * @return one element per table entry; an element is {@code null} when the
     *         entry has a {@code null} value
     * @throws Exception if the table cannot be read or a stored URL is malformed
     */
    public LinkNeighborhood[] getLNs() throws Exception {
        Tuple<String>[] tuples = authID.getTableAsArray();
        LinkNeighborhood[] lns = new LinkNeighborhood[tuples.length];
        for (int i = 0; i < lns.length; i++) {
            lns[i] = parseString(tuples[i].getValue());
        }
        return lns;
    }

    /**
     * Parses every record of the hub id table; the stored title tokens become
     * the "around" words of each result.
     *
     * @return one element per table entry; an element is {@code null} when the
     *         entry has a {@code null} value
     * @throws Exception if the table cannot be read or a stored URL is malformed
     */
    public LinkNeighborhood[] getBacklinkLN() throws Exception {
        Tuple<String>[] tuples = hubID.getTableAsArray();
        LinkNeighborhood[] lns = new LinkNeighborhood[tuples.length];
        for (int i = 0; i < lns.length; i++) {
            lns[i] = parseHubRecord(tuples[i].getValue());
        }
        return lns;
    }

    /**
     * Looks up the hub id record of a single URL.
     *
     * @param url URL whose string form is the lookup key
     * @return the parsed record, or {@code null} if the URL has no id or no hub record
     * @throws MalformedURLException if the stored URL cannot be parsed
     */
    public LinkNeighborhood getBacklinkLN(URL url) throws MalformedURLException {
        String urlId = url2id.get(url.toString());
        if (urlId == null) {
            return null;
        }
        return parseHubRecord(hubID.get(urlId));
    }

    /**
     * Looks up the authority id record of a single URL.
     *
     * @param url URL whose string form is the lookup key
     * @return the parsed record, or {@code null} if the URL has no id or no authority record
     * @throws MalformedURLException if the stored URL cannot be parsed
     */
    public LinkNeighborhood getLN(URL url) throws MalformedURLException {
        String urlId = url2id.get(url.toString());
        if (urlId == null) {
            return null;
        }
        return parseString(authID.get(urlId));
    }

    /**
     * Resolves the ids stored in the hub graph for a URL into link neighborhoods.
     *
     * @param url URL whose string form is the lookup key
     * @return one element per stored id ({@code null} where the id has no
     *         authority record), or {@code null} if the URL has no id or no
     *         hub graph record
     * @throws IOException if a stored URL cannot be parsed
     */
    public LinkNeighborhood[] getOutlinks(URL url) throws IOException {
        String urlId = url2id.get(url.toString());
        if (urlId == null) {
            return null;
        }
        String strLinks = hubGraph.get(urlId);
        if (strLinks == null) {
            // A URL can have an id without ever having been inserted as a hub;
            // report "nothing stored" instead of failing on the missing record.
            return null;
        }
        String[] linkIds = strLinks.split(LINK_SEPARATOR);
        LinkNeighborhood[] lns = new LinkNeighborhood[linkIds.length];
        for (int i = 0; i < lns.length; i++) {
            lns[i] = parseString(authID.get(linkIds[i]));
        }
        return lns;
    }

    /**
     * Retrieves the backlinks stored for the site root of a URL. The lookup key
     * is rebuilt from the protocol and host of {@code url} with path {@code "/"}.
     * NOTE(review): the other lookups in this class use the full URL string -
     * confirm the root-only lookup is intended.
     *
     * @param url URL whose protocol and host select the record
     * @return the backlinks that have a hub id record, or {@code null} if the
     *         key has no id or no authority graph record
     * @throws IOException if the lookup URL cannot be built
     */
    public BackLinkNeighborhood[] getBacklinks(URL url) throws IOException {
        URL normalizedURL = new URL(url.getProtocol(), url.getHost(), "/");
        String urlId = url2id.get(normalizedURL.toString());
        if (urlId == null) {
            return null;
        }
        String strLinks = authGraph.get(urlId);
        if (strLinks == null) {
            return null;
        }
        List<BackLinkNeighborhood> backlinks = new ArrayList<>();
        for (String backlinkId : strLinks.split(LINK_SEPARATOR)) {
            String urlTitle = hubID.get(backlinkId);
            if (urlTitle == null) {
                continue;
            }
            String[] fields = urlTitle.split(FIELD_SEPARATOR);
            BackLinkNeighborhood bln = new BackLinkNeighborhood();
            bln.setLink(fields[0]);
            if (fields.length > 1) {
                bln.setTitle(fields[1]);
            }
            backlinks.add(bln);
        }
        return backlinks.toArray(new BackLinkNeighborhood[backlinks.size()]);
    }

    /**
     * Resolves the ids stored in the authority graph for a URL through the
     * authority id table.
     * NOTE(review): backlink ids are looked up in {@code authID}, not
     * {@code hubID} - confirm this is intended.
     *
     * @param url URL whose string form is the lookup key
     * @return the records that could be resolved, or {@code null} if the URL
     *         has no id or no authority graph record
     * @throws IOException if a stored URL cannot be parsed
     */
    public LinkNeighborhood[] getBacklinksLN(URL url) throws IOException {
        String urlId = url2id.get(url.toString());
        if (urlId == null) {
            return null;
        }
        String strLinks = authGraph.get(urlId);
        if (strLinks == null) {
            return null;
        }
        List<LinkNeighborhood> resolved = new ArrayList<>();
        for (String linkId : strLinks.split(LINK_SEPARATOR)) {
            LinkNeighborhood ln = parseString(authID.get(linkId));
            if (ln != null) {
                resolved.add(ln);
            }
        }
        return resolved.toArray(new LinkNeighborhood[resolved.size()]);
    }

    /**
     * Inserts the outlinks of a hub page. For every non-null link this adds the
     * link id to the hub graph record of {@code url}, adds the id of
     * {@code url} to the authority graph record of the link, and stores the
     * link's id records when they do not exist yet. Commits automatically
     * after every {@code PAGES_TO_COMMIT} insert calls.
     *
     * @param url page the links were extracted from
     * @param lns outlinks of the page; {@code null} elements are skipped
     */
    public synchronized void insertOutlinks(URL url, LinkNeighborhood[] lns) {
        String urlId = getId(url.toString());
        String strCurrentLinks = hubGraph.get(urlId);
        HashSet<String> currentLinks = parseLinkIds(strCurrentLinks);
        StringBuilder newLinks = new StringBuilder();

        for (LinkNeighborhood ln : lns) {
            if (ln == null) {
                continue;
            }
            String lnURL = ln.getLink().toString();
            String id = getId(lnURL);

            if (!currentLinks.contains(id)) {
                if (authID.get(id) == null) {
                    authID.put(id, lnURL + FIELD_SEPARATOR + ln.getAnchorString()
                            + FIELD_SEPARATOR + ln.getAroundString());
                }
                newLinks.append(id).append(LINK_SEPARATOR);
                currentLinks.add(id);
            }

            String strLinks = authGraph.get(id);
            HashSet<String> knownBacklinks = parseLinkIds(strLinks);
            if (!knownBacklinks.contains(urlId)) {
                // NOTE(review): the hub id record is written for the outlink's
                // id, not for urlId (the page that contains the link) - confirm
                // this is intended. Behavior is kept as it was.
                if (hubID.get(id) == null) {
                    hubID.put(id, lnURL + FIELD_SEPARATOR);
                }
                authGraph.put(id, appendLinkId(strLinks, knownBacklinks, urlId));
            }
        }

        String updatedLinks = (strCurrentLinks == null)
                ? newLinks.toString()
                : strCurrentLinks + newLinks;
        // Nothing is written when the page has no links at all.
        if (!updatedLinks.equals("")) {
            hubGraph.put(urlId, updatedLinks);
        }
        countInsertAndCommitIfDue();
    }

    /**
     * Inserts the backlinks of an authority page. For every non-null backlink
     * this adds the backlink id to the authority graph record of {@code url},
     * adds the id of {@code url} to the hub graph record of the backlink, and
     * stores the backlink's hub id record when it does not exist yet. Commits
     * automatically after every {@code PAGES_TO_COMMIT} insert calls.
     *
     * @param url   page the backlinks point to
     * @param links backlinks of the page; {@code null} elements are skipped
     * @throws IOException declared for callers; not raised by the current implementation
     */
    public synchronized void insertBacklinks(URL url, BackLinkNeighborhood[] links) throws IOException {
        String urlId = getId(url.toString());
        String strCurrentLinks = authGraph.get(urlId);
        HashSet<String> currentLinks = parseLinkIds(strCurrentLinks);
        StringBuilder newLinks = new StringBuilder();

        for (BackLinkNeighborhood link : links) {
            // Skip holes in the array, as insertOutlinks does.
            if (link == null) {
                continue;
            }
            String id = getId(link.getLink());

            if (!currentLinks.contains(id)) {
                if (hubID.get(id) == null) {
                    // Store an empty title rather than the literal text "null".
                    String title = link.getTitle();
                    if (title == null) {
                        title = "";
                    }
                    hubID.put(id, link.getLink() + FIELD_SEPARATOR + title);
                }
                newLinks.append(id).append(LINK_SEPARATOR);
                currentLinks.add(id);
            }

            String strLinks = hubGraph.get(id);
            HashSet<String> knownOutlinks = parseLinkIds(strLinks);
            if (!knownOutlinks.contains(urlId)) {
                hubGraph.put(id, appendLinkId(strLinks, knownOutlinks, urlId));
            }
        }

        String updatedLinks = (strCurrentLinks == null)
                ? newLinks.toString()
                : strCurrentLinks + newLinks;
        authGraph.put(urlId, updatedLinks);
        countInsertAndCommitIfDue();
    }

    /** Commits all five tables. */
    public synchronized void commit() {
        url2id.commit();
        authGraph.commit();
        authID.commit();
        hubID.commit();
        hubGraph.commit();
    }

    /** Commits pending changes and then closes all five tables. */
    public void close() {
        this.commit();
        url2id.close();
        authGraph.close();
        authID.close();
        hubID.close();
        hubGraph.close();
    }

    /** Counts one insert call and commits once PAGES_TO_COMMIT calls have accumulated. */
    private void countInsertAndCommitIfDue() {
        uncommittedCount++;
        if (uncommittedCount >= PAGES_TO_COMMIT) {
            this.commit();
            uncommittedCount = 0;
        }
    }

    /** Returns the id of a URL, assigning the next sequential id when the URL is new. */
    private String getId(String url) {
        String id = url2id.get(url);
        if (id == null) {
            String maxId = url2id.get(MAX_ID_KEY);
            if (maxId == null) {
                maxId = "0";
            }
            id = String.valueOf(Integer.parseInt(maxId) + 1);
            url2id.put(url, id);
            url2id.put(MAX_ID_KEY, id);
        }
        return id;
    }

    /** Returns the part of a record before the first field separator, or null for a null record. */
    private static String firstField(String record) {
        if (record == null) {
            return null;
        }
        return record.split(FIELD_SEPARATOR)[0];
    }

    /** Splits a graph record into its ids, or returns null for a null record. */
    private static String[] splitLinkIds(String record) {
        if (record == null) {
            return null;
        }
        return record.split(LINK_SEPARATOR);
    }

    /** Collects the ids of a graph record into a set; a null record yields an empty set. */
    private static HashSet<String> parseLinkIds(String record) {
        HashSet<String> ids = new HashSet<>();
        if (record != null) {
            for (String id : record.split(LINK_SEPARATOR)) {
                ids.add(id);
            }
        }
        return ids;
    }

    /** Returns the graph record with one more id appended; starts a fresh record when no ids were parsed. */
    private static String appendLinkId(String record, HashSet<String> parsedIds, String id) {
        if (parsedIds.isEmpty()) {
            return id + LINK_SEPARATOR;
        }
        return record + id + LINK_SEPARATOR;
    }

    /** Parses a hub id record (url:::title); the title's space-separated tokens become the "around" words. */
    private static LinkNeighborhood parseHubRecord(String record) throws MalformedURLException {
        if (record == null) {
            return null;
        }
        String[] fields = record.split(FIELD_SEPARATOR);
        LinkNeighborhood ln = new LinkNeighborhood(new URL(fields[0]));
        if (fields.length > 1) {
            // StringTokenizer drops empty tokens, so repeated spaces add no words.
            StringTokenizer tokenizer = new StringTokenizer(fields[1], " ");
            List<String> words = new ArrayList<>();
            while (tokenizer.hasMoreTokens()) {
                words.add(tokenizer.nextToken());
            }
            ln.setAround(words.toArray(new String[words.size()]));
        }
        return ln;
    }

    /** Parses an authority id record (url:::anchor:::around); returns null for a null record. */
    private LinkNeighborhood parseString(String lnStr) throws MalformedURLException {
        if (lnStr == null) {
            return null;
        }
        String[] fields = lnStr.split(FIELD_SEPARATOR);
        LinkNeighborhood ln = new LinkNeighborhood(new URL(fields[0]));
        if (fields.length > 1) {
            ln.setAnchor(fields[1].split(" "));
            if (fields.length > 2) {
                ln.setAround(fields[2].split(" "));
            }
        }
        return ln;
    }
}