package focusedCrawler.link.linkanalysis;
import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Vector;
import focusedCrawler.link.BipartiteGraphRepository;
import focusedCrawler.util.parser.BackLinkNeighborhood;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.persistence.Tuple;
import focusedCrawler.util.vsm.VSMElement;
import focusedCrawler.util.vsm.VSMElementComparator;
/**
 * This class implements the HITS algorithm on top of the hub and authority graphs
 * stored in a {@link BipartiteGraphRepository}.
 *
 * <p>Two entry points are offered:
 * <ul>
 *   <li>{@link #originalHITS()} iteratively recomputes hub and authority scores from
 *       the repository's graphs, normalizing each score set by its maximum;</li>
 *   <li>{@link #firstIteration(HashSet)} counts, for a set of relevant sites, how often
 *       each backlink page and each of its outlinks is seen.</li>
 * </ul>
 * Both store their sorted results so they can be read through
 * {@link #getHubRelevance()} and {@link #getAuthRelevance()}.
 *
 * <p>Progress is reported on {@code System.out}.
 *
 * @author lbarbosa
 *
 */
public class HITS {

    /** Separator between the link ids stored in a tuple value. */
    private static final String LINK_SEPARATOR = "###";

    /** Maximum number of entries printed per ranking by {@link #print()}. */
    private static final int MAX_PRINTED_SCORES = 50;

    /** Maximum number of entries printed per ranking by {@link #firstIteration(HashSet)}. */
    private static final int MAX_PRINTED_COUNTS = 100;

    private final BipartiteGraphRepository graphRep;

    private HashMap<String, VSMElement> authValues;

    private HashMap<String, VSMElement> hubValues;

    /** Number of hub/authority update rounds performed by {@link #originalHITS()}. */
    private int iterations = 1;

    private VSMElement[] hubRelevance;

    private VSMElement[] authRelevance;

    /**
     * Creates an instance without a graph repository. Such an instance cannot compute
     * anything: {@link #originalHITS()} and {@link #firstIteration(HashSet)} will throw
     * {@link IllegalStateException}.
     */
    public HITS() {
        this(null);
    }

    /**
     * Creates an instance that reads its graphs from the given repository.
     *
     * @param graphRep repository providing the hub and authority graphs
     */
    public HITS(BipartiteGraphRepository graphRep) {
        this.graphRep = graphRep;
        this.authValues = new HashMap<String, VSMElement>();
        this.hubValues = new HashMap<String, VSMElement>();
    }

    /**
     * @return the hub ranking produced by the last computation, in the order defined by
     *         {@link VSMElementComparator}; {@code null} if nothing was computed yet
     */
    public VSMElement[] getHubRelevance() {
        return hubRelevance;
    }

    /**
     * @return the authority ranking produced by the last computation, in the order
     *         defined by {@link VSMElementComparator}; {@code null} if nothing was
     *         computed yet
     */
    public VSMElement[] getAuthRelevance() {
        return authRelevance;
    }

    /**
     * Runs the HITS update rounds over the repository's hub and authority graphs and
     * stores the resulting rankings.
     *
     * <p>Every round recomputes the hub scores from the current authority scores,
     * normalizes them by their maximum, then does the same for the authority scores
     * using the freshly normalized hub scores. The top scores are printed after each
     * round.
     *
     * @throws IllegalStateException if this instance has no graph repository
     * @throws Exception if the repository fails while being read
     */
    public void originalHITS() throws Exception {
        requireGraph();
        Tuple<String>[] authTuples = graphRep.getAuthGraph();
        Tuple<String>[] hubTuples = graphRep.getHubGraph();
        inicialization(authTuples, hubTuples);
        for (int round = 0; round < iterations; round++) {
            double maxHub = updateHub(hubTuples);
            normalizeByMax(hubValues, maxHub);
            double maxAuth = updateAuth(authTuples);
            normalizeByMax(authValues, maxAuth);
            print();
        }
        setValues();
    }

    /**
     * Resets both score maps and gives every node of the two graphs an initial score of 1.
     *
     * @param authTuples authority graph: one tuple per authority node
     * @param hubTuples hub graph: one tuple per hub node
     */
    private void inicialization(Tuple<String>[] authTuples, Tuple<String>[] hubTuples) {
        // Drop leftovers of an earlier computation so they cannot leak into this one.
        authValues.clear();
        hubValues.clear();
        for (Tuple<String> tuple : authTuples) {
            authValues.put(tuple.getKey(), new VSMElement(tuple.getKey(), 1));
        }
        for (Tuple<String> tuple : hubTuples) {
            hubValues.put(tuple.getKey(), new VSMElement(tuple.getKey(), 1));
        }
    }

    /**
     * Prints the first {@value #MAX_PRINTED_SCORES} authorities and hubs of the current
     * rankings, resolving each node id to its URL through the repository.
     *
     * @throws IOException if the repository lookup or the URL decoding fails
     */
    private void print() throws IOException {
        Vector<VSMElement> topAuths = sortedValues(authValues);
        System.out.println("-----TOP AUTHS-----");
        for (int i = 0; i < topAuths.size() && i < MAX_PRINTED_SCORES; i++) {
            VSMElement elem = topAuths.elementAt(i);
            String url = graphRep.getAuthURL(elem.getWord());
            System.out.println(i + ":" + url + "=" + elem.getWeight());
        }
        Vector<VSMElement> topHubs = sortedValues(hubValues);
        System.out.println("-----TOP HUBS-----");
        for (int i = 0; i < topHubs.size() && i < MAX_PRINTED_SCORES; i++) {
            VSMElement elem = topHubs.elementAt(i);
            String url = graphRep.getHubURL(elem.getWord()).toString();
            System.out.println(i + ":" + URLDecoder.decode(url, "UTF-8") + "=" + elem.getWeight());
        }
    }

    /**
     * Publishes the final rankings: sorts both score sets, replaces each element's node
     * id by the URL the repository reports for it, and stores the resulting arrays.
     *
     * @throws IOException if the repository lookup fails
     */
    private void setValues() throws IOException {
        Vector<VSMElement> topAuths = sortedValues(authValues);
        authRelevance = new VSMElement[topAuths.size()];
        for (int i = 0; i < topAuths.size(); i++) {
            VSMElement elem = topAuths.elementAt(i);
            elem.setWord(graphRep.getAuthURL(elem.getWord()));
            authRelevance[i] = elem;
        }
        Vector<VSMElement> topHubs = sortedValues(hubValues);
        hubRelevance = new VSMElement[topHubs.size()];
        for (int i = 0; i < topHubs.size(); i++) {
            VSMElement elem = topHubs.elementAt(i);
            elem.setWord(graphRep.getHubURL(elem.getWord()).toString());
            hubRelevance[i] = elem;
        }
    }

    /**
     * Recomputes every authority score as the sum of the current scores of the hubs
     * listed in its tuple.
     *
     * @param authTuples authority graph; each value lists hub ids separated by "###"
     * @return the largest authority score computed, or 0 if there was none above zero
     */
    private double updateAuth(Tuple<String>[] authTuples) {
        return updateScores(authTuples, hubValues, authValues);
    }

    /**
     * Recomputes every hub score as the sum of the current scores of the authorities
     * listed in its tuple.
     *
     * @param hubTuples hub graph; each value lists authority ids separated by "###"
     * @return the largest hub score computed, or 0 if there was none above zero
     */
    private double updateHub(Tuple<String>[] hubTuples) {
        return updateScores(hubTuples, authValues, hubValues);
    }

    /**
     * Shared update step: for every tuple, sums the {@code source} scores of the ids
     * listed in the tuple value and stores the sum in {@code target} under the tuple key.
     * Ids without an entry in {@code source} contribute nothing.
     *
     * @return the largest sum stored, or 0 if no sum was above zero
     */
    private static double updateScores(Tuple<String>[] tuples,
            HashMap<String, VSMElement> source, HashMap<String, VSMElement> target) {
        double max = 0;
        for (Tuple<String> tuple : tuples) {
            String key = tuple.getKey();
            double total = 0;
            for (String linkedId : parseRecord(tuple.getValue())) {
                VSMElement linked = source.get(linkedId);
                if (linked != null) {
                    total = total + linked.getWeight();
                }
            }
            if (total > max) {
                max = total;
            }
            target.put(key, new VSMElement(key, total));
        }
        return max;
    }

    /**
     * Divides every score in the map by {@code max}.
     *
     * <p>Nothing is changed when {@code max} is not positive: in that case no score was
     * above zero, and dividing would turn every score into NaN.
     */
    private static void normalizeByMax(HashMap<String, VSMElement> scores, double max) {
        if (max <= 0) {
            return;
        }
        // Replacing the value of an existing key is not a structural modification,
        // so it is safe while iterating over the key set.
        for (String key : scores.keySet()) {
            VSMElement value = scores.get(key);
            scores.put(key, new VSMElement(key, value.getWeight() / max));
        }
    }

    /**
     * Splits a tuple value into the link ids it contains.
     *
     * @param strLinks ids separated by "###", possibly {@code null}
     * @return the ids; an empty array when {@code strLinks} is {@code null}
     */
    private static String[] parseRecord(String strLinks) {
        if (strLinks == null) {
            return new String[0];
        }
        return strLinks.split(LINK_SEPARATOR);
    }

    /** Returns the values of the map sorted with {@link VSMElementComparator}. */
    private static Vector<VSMElement> sortedValues(HashMap<String, VSMElement> scores) {
        Vector<VSMElement> sorted = new Vector<VSMElement>(scores.values());
        Collections.sort(sorted, new VSMElementComparator());
        return sorted;
    }

    /** Adds one to the counter stored under {@code key}, creating it at zero if absent. */
    private static void increment(HashMap<String, VSMElement> counts, String key) {
        VSMElement count = counts.get(key);
        if (count == null) {
            count = new VSMElement(key, 0);
            counts.put(key, count);
        }
        count.setWeight(count.getWeight() + 1);
    }

    /** Fails fast with a clear message when no repository was supplied. */
    private void requireGraph() {
        if (graphRep == null) {
            throw new IllegalStateException(
                    "HITS was created without a BipartiteGraphRepository");
        }
    }

    /**
     * Counts hub and authority candidates around a set of relevant sites and stores the
     * sorted counts as the hub and authority rankings.
     *
     * <p>For every relevant site, each of its backlink pages gets its hub count
     * increased by one; each outlink of such a backlink page that is not itself a
     * relevant site gets its authority count increased by one. Sites, backlink pages
     * and outlinks for which the repository returns nothing are skipped. The first
     * {@value #MAX_PRINTED_COUNTS} entries of each ranking are printed.
     *
     * @param relSites URLs of the relevant sites
     * @throws IllegalStateException if this instance has no graph repository
     * @throws Exception if a URL is malformed or the repository fails while being read
     */
    public void firstIteration(HashSet<String> relSites) throws Exception {
        requireGraph();
        authValues = new HashMap<String, VSMElement>();
        hubValues = new HashMap<String, VSMElement>();
        for (String site : relSites) {
            BackLinkNeighborhood[] backlinks = graphRep.getBacklinks(new URL(site));
            if (backlinks == null) {
                continue;
            }
            for (BackLinkNeighborhood backlink : backlinks) {
                String hubUrl = backlink.getLink();
                increment(hubValues, hubUrl);
                LinkNeighborhood[] outlinks = graphRep.getOutlinks(new URL(hubUrl));
                if (outlinks == null) {
                    // Same treatment as a site without backlinks: nothing to count.
                    continue;
                }
                for (LinkNeighborhood outlink : outlinks) {
                    if (outlink == null) {
                        continue;
                    }
                    String authUrl = outlink.getLink().toString();
                    if (!relSites.contains(authUrl)) {
                        increment(authValues, authUrl);
                    }
                }
            }
        }

        System.out.println("------");
        System.out.println("HUB:" + hubValues.size());
        Vector<VSMElement> finalHub = sortedValues(hubValues);
        hubRelevance = new VSMElement[finalHub.size()];
        finalHub.toArray(hubRelevance);
        printCounts(finalHub);

        System.out.println("------");
        System.out.println("AUTH:" + authValues.size());
        Vector<VSMElement> finalAuth = sortedValues(authValues);
        authRelevance = new VSMElement[finalAuth.size()];
        finalAuth.toArray(authRelevance);
        printCounts(finalAuth);
    }

    /** Prints the first {@value #MAX_PRINTED_COUNTS} elements as {@code word:weight}. */
    private static void printCounts(Vector<VSMElement> ranking) {
        for (int i = 0; i < MAX_PRINTED_COUNTS && i < ranking.size(); i++) {
            VSMElement elem = ranking.elementAt(i);
            System.out.println(elem.getWord() + ":" + elem.getWeight());
        }
    }
}