package focusedCrawler.link.classifier;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

import focusedCrawler.link.classifier.builder.Instance;
import focusedCrawler.link.classifier.builder.LinkNeighborhoodWrapper;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.parser.LinkNeighborhood;

import weka.classifiers.Classifier;
import weka.core.Instances;

/**
 * Link classifier that assigns authority-based relevance scores to outlinks.
 * Links pointing to a host different from the current page's host are scored
 * with a WEKA classifier when one is available; all other links receive a
 * relevance of -1 so they are not prioritized as authority candidates.
 */
public class LinkClassifierAuthority implements LinkClassifier {

    private LinkNeighborhoodWrapper wrapper;
    private String[] attributes;
    private Classifier classifier;
    private Instances instances;

    public LinkClassifierAuthority(Classifier classifier, Instances instances,
                                   LinkNeighborhoodWrapper wrapper, String[] attributes) {
        this.wrapper = wrapper;
        this.attributes = attributes;
        this.classifier = classifier;
        this.instances = instances;
    }

    public LinkClassifierAuthority() {
    }

    public LinkClassifierAuthority(LinkNeighborhoodWrapper wrapper, String[] attributes) {
        this.wrapper = wrapper;
        this.attributes = attributes;
    }

    public LinkRelevance[] classify(Page page) throws LinkClassifierException {
        try {
            LinkNeighborhood[] lns = page.getParsedData().getLinkNeighborhood();
            LinkRelevance[] linkRelevance;
            if (classifier != null) {
                // Extract a feature vector for each link and score it with the WEKA classifier.
                HashMap<String, Instance> urlWords = wrapper.extractLinks(lns, attributes);
                linkRelevance = new LinkRelevance[urlWords.size()];
                int count = 0;
                for (Map.Entry<String, Instance> entry : urlWords.entrySet()) {
                    URL url = new URL(entry.getKey());
                    double relevance = -1;
                    // Only links pointing to a different host are authority candidates.
                    if (!page.getURL().getHost().equals(url.getHost())) {
                        Instance instance = entry.getValue();
                        double[] values = instance.getValues();
                        weka.core.Instance instanceWeka = new weka.core.Instance(1, values);
                        instanceWeka.setDataset(instances);
                        double[] prob = classifier.distributionForInstance(instanceWeka);
                        relevance = LinkRelevance.DEFAULT_AUTH_RELEVANCE + (prob[0] * 100);
                    }
                    linkRelevance[count] = new LinkRelevance(url, relevance);
                    count++;
                }
            } else {
                // No classifier available: give every external link a fixed authority relevance.
                linkRelevance = new LinkRelevance[lns.length];
                for (int i = 0; i < lns.length; i++) {
                    double relevance = -1;
                    if (!page.getURL().getHost().equals(lns[i].getLink().getHost())) {
                        relevance = LinkRelevance.DEFAULT_AUTH_RELEVANCE + 1;
                    }
                    linkRelevance[i] = new LinkRelevance(lns[i].getLink(), relevance);
                }
            }
            return linkRelevance;
        } catch (Exception e) {
            throw new LinkClassifierException(e.getMessage(), e);
        }
    }

    @Override
    public LinkRelevance classify(LinkNeighborhood ln) throws LinkClassifierException {
        LinkRelevance linkRel = null;
        try {
            HashMap<String, Instance> urlWords = wrapper.extractLinks(ln, attributes);
            for (Map.Entry<String, Instance> entry : urlWords.entrySet()) {
                double relevance = -1;
                // Only root (or index) pages are considered authority candidates.
                if (isRootPage(entry.getKey())) {
                    if (classifier != null) {
                        Instance instance = entry.getValue();
                        double[] values = instance.getValues();
                        weka.core.Instance instanceWeka = new weka.core.Instance(1, values);
                        instanceWeka.setDataset(instances);
                        double[] prob = classifier.distributionForInstance(instanceWeka);
                        // Cap the probability so the score stays within the authority relevance band.
                        if (prob[0] == 1) {
                            prob[0] = 0.99;
                        }
                        relevance = LinkRelevance.DEFAULT_AUTH_RELEVANCE + (prob[0] * 100);
                    } else {
                        relevance = LinkRelevance.DEFAULT_AUTH_RELEVANCE + 1;
                    }
                }
                linkRel = new LinkRelevance(new URL(entry.getKey()), relevance);
            }
        } catch (Exception e) {
            throw new LinkClassifierException("Failed to classify link", e);
        }
        return linkRel;
    }

    /**
     * Returns true if the URL points to the root (or index) page of its host.
     * Note that URL.getFile() includes the leading slash of the path.
     */
    private boolean isRootPage(String urlStr) throws MalformedURLException {
        URL url = new URL(urlStr);
        String file = url.getFile();
        return file.isEmpty()
                || file.equals("/")
                || file.equals("/index.htm")
                || file.equals("/index.html");
    }

}