package focusedCrawler.link;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import focusedCrawler.link.classifier.LinkClassifier;
import focusedCrawler.link.classifier.builder.LinkClassifierBuilder;
import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.link.linkanalysis.HITS;
import focusedCrawler.link.linkanalysis.SALSA;
import focusedCrawler.target.TargetStorageMonitor;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.vsm.VSMElement;
public class OnlineLearning {
private Frontier frontier;
private BipartiteGraphManager manager;
private BipartiteGraphRepository rep;
private LinkClassifierBuilder classifierBuilder;
private String method;
private String dataPath;
public OnlineLearning(Frontier frontier, BipartiteGraphManager manager,
LinkClassifierBuilder classifierBuilder, String method, String dataPath) {
this.frontier = frontier;
this.manager = manager;
this.classifierBuilder = classifierBuilder;
this.method = method;
this.dataPath = dataPath;
this.rep = manager.getRepository();
}
public synchronized void execute() throws Exception {
frontier.commit();
if (method.equals("SALSA")) {
runSALSA(null, false);
}
if (method.equals("SALSA_SEED")) {
runSALSA(readRelevantUrlsFromFile(), false);
}
if (method.equals("SALSA_CLASSIFIER")) {
runSALSA(readRelevantUrlsFromFile(), true);
}
if (method.equals("HITS")) {
runHITS(null);
}
if (method.equals("HITS_1ST")) {
runHITS(readRelevantUrlsFromFile());
}
if (method.equals("LINK_CLASSIFIERS")) {
createClassifiers(readRelevantUrlsFromFile(), true);
}
if (method.equals("FORWARD_CLASSIFIER_BINARY")) {
forwardClassifier(TargetStorageMonitor.readRelevantUrls(dataPath), true, 0);
}
if (method.equals("FORWARD_CLASSIFIER_LEVELS")) {
forwardClassifier(TargetStorageMonitor.readRelevantUrls(dataPath), true, 3);
}
frontier.commit();
}
private HashSet<String> readRelevantUrlsFromFile() throws IOException, FileNotFoundException {
HashSet<String> relSites = new HashSet<String>();
File file = new File(dataPath + File.separator + "entry_points");
try (BufferedReader input = new BufferedReader(new FileReader(file))) {
for (String line = input.readLine(); line != null; line = input.readLine()) {
if (line.startsWith("------")) {
String host = line.replace("-", "");
String url = "http://" + host + "/";
if (!relSites.contains(url)) {
relSites.add(url);
System.out.println(">>" + url);
}
}
}
}
return relSites;
}
public void runSALSA(HashSet<String> relSites, boolean useClassifier) throws Exception{
SALSA salsa = new SALSA(rep);
if(relSites != null){
HashMap<String,VSMElement> probs = new HashMap<String, VSMElement>();
if(useClassifier){
probs = createClassifiers(relSites,false);
}else{
Iterator<String> iter = relSites.iterator();
while(iter.hasNext()){
String site = iter.next();
System.out.println(">>>>>>>>" + site);
String id = rep.getID(site);
if(id == null){
continue;
}
probs.put(id + "_auth", new VSMElement(id,1));
String[] backlinks = rep.getBacklinks(id);
for (int i = 0; i < backlinks.length; i++) {
VSMElement elem = probs.get(id + "_hub");
if(elem == null){
elem = new VSMElement(id, 0);
probs.put(id + "_hub", elem);
}
elem.setWeight(elem.getWeight()+1);
}
}
}
normalize(probs);
salsa.setNodeRelevance(probs);
}
salsa.execute();
VSMElement[] hubRelevance = salsa.getHubValues();
double rel = 199;
System.out.println(">>>>>>>FRONTIER UPDATE...");
LinkRelevance lr = new LinkRelevance(new URL(hubRelevance[0].getWord()), rel);
frontier.update(lr);
for (int i = 1; i < hubRelevance.length; i++) {
if(i % (hubRelevance.length/99) == 0 ){
rel--;
}
if(hubRelevance[i].getWord() != null){
// double weight = (hubRelevance[i].getWeight()/hubRelevance[0].getWeight())*100 + 100;
lr = new LinkRelevance(new URL(hubRelevance[i].getWord()), rel);
// if(i < 50){
// System.out.println("###" + lr.getURL().toString() + "=" + lr.getRelevance());
// }
frontier.update(lr);
}
}
VSMElement[] authRelevance = salsa.getAuthValues();
rel = 299;
lr = new LinkRelevance(new URL(authRelevance[0].getWord()), rel);
frontier.update(lr);
for (int i = 1; i < authRelevance.length; i++) {
if(i % (authRelevance.length/99) == 0 ){
rel--;
}
if(authRelevance[i].getWord() != null){
// double weight = (authRelevance[i].getWeight()/authRelevance[0].getWeight())*100 + 200;
lr = new LinkRelevance(new URL(authRelevance[i].getWord()), rel);
// if(i < 500){
// System.out.println("###" + i + ":" + lr.getURL().toString() + "=" + lr.getRelevance() + ":" + authRelevance[i].getWeight());
// }
frontier.update(lr);
}
}
salsa = null;
}
private void normalize(HashMap<String,VSMElement> values){
//normalize
double totalAuth = 0;
double totalHub = 0;
Iterator<String> iter = values.keySet().iterator();
while(iter.hasNext()){
String key = iter.next();
VSMElement elem = values.get(key);
if(key.endsWith("_auth")){
totalAuth = totalAuth + elem.getWeight();
}
if(key.endsWith("_hub")){
totalHub = totalHub + elem.getWeight();
}
}
iter = values.keySet().iterator();
while(iter.hasNext()){
String key = iter.next();
VSMElement elem = values.get(key);
if(key.endsWith("_auth")){
elem.setWeight(elem.getWeight()/totalAuth);
}
if(key.endsWith("_hub")){
elem.setWeight(elem.getWeight()/totalHub);
}
}
}
public void runHITS(HashSet<String> relSites) throws Exception{
HITS hits = new HITS(rep);
if(relSites != null){
hits.firstIteration(relSites);
}else{
hits.originalHITS();
}
System.out.println(">>>>>>>FRONTIER UPDATE...");
VSMElement[] hubRelevance = hits.getHubRelevance();
double rel = 199;
LinkRelevance lr = new LinkRelevance(new URL(hubRelevance[0].getWord()), rel);
frontier.update(lr);
for (int i = 1; i < hubRelevance.length; i++) {
if(i % (hubRelevance.length/99) == 0 ){
rel--;
}
if(hubRelevance[i].getWord() != null){
lr = new LinkRelevance(new URL(hubRelevance[i].getWord()), rel);
frontier.update(lr);
}
}
VSMElement[] authRelevance = hits.getAuthRelevance();
rel = 299;
lr = new LinkRelevance(new URL(authRelevance[0].getWord()), rel);
frontier.update(lr);
for (int i = 1; i < authRelevance.length; i++) {
if(i % (authRelevance.length/99) == 0 ){
rel--;
}
if(authRelevance[i].getWord() != null){
lr = new LinkRelevance(new URL(authRelevance[i].getWord()), rel);
// System.out.println(">>>>>AUTH:" + lr.getURL().toString() + "=" + lr.getRelevance());
frontier.update(lr);
}
}
}
private void forwardClassifier(HashSet<String> relSites, boolean updateFrontier, int levels) throws Exception{
System.out.println(">>>BUILDING OUTLINK CLASSIFIER...:");
LinkClassifier outlinkClassifier = classifierBuilder.forwardlinkTraining(relSites,levels, "LinkClassifierImpl");
if(updateFrontier){
manager.setOutlinkClassifier(outlinkClassifier);
}
LinkNeighborhood[] outLNs = rep.getLNs();
for (int i = 0; i < outLNs.length; i++) {
if(outLNs[i] != null){
LinkRelevance lr = outlinkClassifier.classify(outLNs[i]);
if(updateFrontier){
frontier.update(lr);
}
}
}
}
private HashMap<String,VSMElement> createClassifiers(HashSet<String> relSites, boolean updateFrontier) throws Exception{
HashMap<String,VSMElement> elems = new HashMap<String,VSMElement>();
System.out.println(">>>BUILDING OUTLINK CLASSIFIER...:");
LinkClassifier outlinkClassifier = classifierBuilder.forwardlinkTraining(relSites,0,"LinkClassifierAuthority");
if(updateFrontier){
manager.setOutlinkClassifier(outlinkClassifier);
}
LinkNeighborhood[] outLNs = rep.getLNs();
HashSet<String> visitedAuths = frontier.visitedAuths();
HashSet<String> usedLinks = new HashSet<String>();
// Vector<VSMElement> temp = new Vector<VSMElement> ();
for (int i = 0; i < outLNs.length; i++) {
if(outLNs[i] != null){
LinkRelevance lr = outlinkClassifier.classify(outLNs[i]);
if(updateFrontier){
frontier.update(lr);
usedLinks.add(lr.getURL().toString());
}
String id = rep.getID(outLNs[i].getLink().toString());
if(id != null){
VSMElement elem = new VSMElement(id, (lr.getRelevance()-200)/100);
if(visitedAuths.contains(outLNs[i].getLink().toString())){
if(relSites.contains(outLNs[i].getLink().toString())){
elem.setWeight(1);
}else{
elem.setWeight(0.0000001);
}
}
elems.put(id + "_auth",elem);
}
}
}
System.out.println(">>>BUILDING BACKLINK CLASSIFIER...");
LinkClassifier backlinkClassifier = classifierBuilder.backlinkTraining(elems);
if(updateFrontier){
manager.setBacklinkClassifier(backlinkClassifier);
}
LinkNeighborhood[] backLNs = rep.getBacklinkLN();
for (int i = 0; i < backLNs.length; i++) {
if(backLNs[i] != null){
LinkRelevance lr = backlinkClassifier.classify(backLNs[i]);
if(updateFrontier && lr != null && !usedLinks.contains(lr.getURL().toString())){
frontier.update(lr);
}
String id = rep.getID(backLNs[i].getLink().toString());
if(id != null && lr != null){
VSMElement elem = new VSMElement(id, (lr.getRelevance()-100)/100);
elems.put(id + "_hub",elem);
}
}
}
return elems;
}
// public static void main(String[] args) {
// try {
// ParameterFile config = new ParameterFile(args[0]);
// PersistentHashtable url2id = new PersistentHashtable(config.getParam("URL_ID_DIRECTORY"),100000);
// PersistentHashtable authID = new PersistentHashtable(config.getParam("AUTH_ID_DIRECTORY"),100000);
// PersistentHashtable authGraph = new PersistentHashtable(config.getParam("AUTH_GRAPH_DIRECTORY"),100000);
// PersistentHashtable hubID = new PersistentHashtable(config.getParam("HUB_ID_DIRECTORY"),100000);
// PersistentHashtable hubGraph = new PersistentHashtable(config.getParam("HUB_GRAPH_DIRECTORY"),100000);
// BipartiteGraphRep rep = new BipartiteGraphRep(authGraph,url2id,authID,hubID,hubGraph);
// PersistentHashtable persistentHash = new PersistentHashtable(args[5],100000);
// FrontierTargetRepositoryBaseline frontier = new FrontierTargetRepositoryBaseline(persistentHash,10000);
// StopList stoplist = new StopListArquivo(args[1]);
// WrapperNeighborhoodLinks wrapper = new WrapperNeighborhoodLinks(stoplist);
// ClassifierBuilder cb = new ClassifierBuilder(rep,stoplist,wrapper,frontier);
// BipartiteGraphManager manager = new BipartiteGraphManager(frontier,rep,null,null);
// OnlineLearning onlineLearning = new OnlineLearning(frontier, manager, cb,"LINK_CLASSIFIERS",args[2]);
// BufferedReader input1 = new BufferedReader(new FileReader(new File(args[7])));
// HashSet<String> relSites = new HashSet<String>();
// for (String line = input1.readLine(); line != null; line = input1.readLine()) {
// String[] links = line.split(" ");
//// URL url = new URL(links[1]);
// if(!relSites.contains(links[1])){
// relSites.add(links[1]);
// }
// }
// onlineLearning.execute();
//// onlineLearning.runSALSA(relSites);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
}