package focusedCrawler.link.classifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import focusedCrawler.link.classifier.builder.Instance;
import focusedCrawler.link.classifier.builder.LinkNeighborhoodWrapper;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.parser.LinkNeighborhood;
public class LinkClassifierBacklink implements LinkClassifier {
private LinkNeighborhoodWrapper wrapper;
private String[] attributes;
public LinkClassifierBacklink(LinkNeighborhoodWrapper wrapper, String[] attribute) {
this.wrapper = wrapper;
this.attributes = attribute;
}
public LinkRelevance[] classify(Page page) throws LinkClassifierException {
try {
LinkNeighborhood[] lns = page.getParsedData().getLinkNeighborhood();
HashMap<String, Instance> urlWords = wrapper.extractLinks(lns, attributes);
List<LinkRelevance> linkRelevance = new ArrayList<>();
for(String urlStr : urlWords.keySet()) {
URL url = new URL(urlStr);
double relevance = -1;
double pageRelevance = page.getTargetRelevance().getRelevance();
if (pageRelevance > 100 && pageRelevance < 200) {
if (isInitialPage(urlStr) && !page.getURL().getHost().equals(url.getHost())) {
relevance = 201;
url = new URL(url.getProtocol(), url.getHost(), "/");
}
}
linkRelevance.add(new LinkRelevance(url, relevance));
}
return (LinkRelevance[]) linkRelevance.toArray(new LinkRelevance[linkRelevance.size()]);
} catch (MalformedURLException ex) {
ex.printStackTrace();
throw new LinkClassifierException(ex.getMessage());
}
}
@Override
public LinkRelevance classify(LinkNeighborhood ln) throws LinkClassifierException {
// TODO Auto-generated method stub
return null;
}
private boolean isInitialPage(String urlStr) throws MalformedURLException {
boolean result = false;
URL url = new URL(urlStr);
String file = url.getFile();
if (file.equals("/") || file.equals("") || file.equals("index.htm") || file.equals("index.html")) {
result = true;
}
return result;
}
}