package focusedCrawler.link.classifier;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import focusedCrawler.link.classifier.builder.Instance;
import focusedCrawler.link.classifier.builder.LinkNeighborhoodWrapper;
import focusedCrawler.link.classifier.builder.WordField;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.ParameterFile;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
import weka.classifiers.Classifier;
import weka.core.Instances;

/**
 * {@link LinkClassifier} that scores links with a Weka {@link Classifier}.
 *
 * <p>Feature vectors are produced by a {@link LinkNeighborhoodWrapper} for the
 * configured attribute names, wrapped into Weka instances bound to the supplied
 * {@link Instances} header, and passed to the classifier. The classifier output
 * is then mapped to a {@link LinkRelevance} score.
 */
public class LinkClassifierRegression implements LinkClassifier {

    private final Classifier classifier;
    private final Instances instances;
    private final LinkNeighborhoodWrapper wrapper;
    private final String[] attributes;

    /**
     * @param classifier the Weka model used to score each link
     * @param instances  dataset header every generated Weka instance is attached to
     * @param wrapper    extracts per-URL feature vectors from pages / link neighborhoods
     * @param attributes attribute names handed to {@code wrapper.extractLinks}
     */
    public LinkClassifierRegression(Classifier classifier, Instances instances,
                                    LinkNeighborhoodWrapper wrapper, String[] attributes) {
        this.classifier = classifier;
        this.instances = instances;
        this.wrapper = wrapper;
        this.attributes = attributes;
    }

    /**
     * Scores every link extracted from {@code page}.
     *
     * <p>A link whose URL has an empty or {@code "/"} file part (see
     * {@link #isInitialPage(String)}) gets {@code classificationResult * 100 + 99};
     * every other link keeps the default relevance of {@code -1}.
     *
     * @param page the page whose links are to be scored
     * @return one {@link LinkRelevance} per extracted URL, in the iteration order
     *         of the map returned by the wrapper
     * @throws LinkClassifierException if extraction, classification or URL parsing
     *         fails; the message is that of the underlying exception
     */
    public LinkRelevance[] classify(Page page) throws LinkClassifierException {
        try {
            Map<String, Instance> urlWords = wrapper.extractLinks(page, attributes);
            LinkRelevance[] linkRelevance = new LinkRelevance[urlWords.size()];
            int count = 0;
            // Iterate entries directly instead of keySet() + get() per key.
            for (Map.Entry<String, Instance> entry : urlWords.entrySet()) {
                String url = entry.getKey();
                double classificationResult =
                        classifier.classifyInstance(toWekaInstance(entry.getValue()));

                // Only site-root URLs receive a classifier-derived score here;
                // all other URLs deliberately keep the -1 default.
                double relevance = -1;
                if (isInitialPage(url)) {
                    relevance = classificationResult * 100 + 99;
                }
                linkRelevance[count] = new LinkRelevance(new URL(url), relevance);
                count++;
            }
            return linkRelevance;
        } catch (Exception ex) {
            // MalformedURLException was previously handled by an identical,
            // separate catch; a single handler covers both cases.
            ex.printStackTrace();
            throw new LinkClassifierException(ex.getMessage());
        }
    }

    /**
     * Tells whether {@code urlStr} points at a site root, i.e. whether
     * {@link URL#getFile()} is empty or exactly {@code "/"}. Because
     * {@code getFile()} includes the query string, a root URL with a query
     * is not considered an initial page.
     *
     * @throws MalformedURLException if {@code urlStr} cannot be parsed
     */
    private boolean isInitialPage(String urlStr) throws MalformedURLException {
        String file = new URL(urlStr).getFile();
        return file.equals("/") || file.equals("");
    }

    /**
     * Scores the link(s) extracted from a single link neighborhood.
     *
     * <p>Relevance is {@code classificationResult * 100} plus
     * {@code prob[(int) classificationResult] * 100}, where {@code prob} is the
     * classifier's distribution for the instance. If the wrapper yields several
     * URLs, the one iterated last wins.
     *
     * @param ln the link neighborhood to score
     * @return the relevance of the (last) extracted link, or {@code null} when the
     *         wrapper extracts nothing
     * @throws LinkClassifierException if extraction, classification or URL parsing
     *         fails; the message is that of the underlying exception
     */
    public LinkRelevance classify(LinkNeighborhood ln) throws LinkClassifierException {
        LinkRelevance linkRel = null;
        try {
            Map<String, Instance> urlWords = wrapper.extractLinks(ln, attributes);
            for (Map.Entry<String, Instance> entry : urlWords.entrySet()) {
                weka.core.Instance instanceWeka = toWekaInstance(entry.getValue());
                double classificationResult = classifier.classifyInstance(instanceWeka);
                double[] prob = classifier.distributionForInstance(instanceWeka);
                // NOTE(review): indexing the distribution with the prediction assumes
                // the prediction is a valid index into prob. With a numeric class
                // attribute (as built in main) that may not hold; an out-of-range
                // index surfaces as LinkClassifierException below - confirm intent.
                double relevance = classificationResult * 100
                        + prob[(int) classificationResult] * 100;
                linkRel = new LinkRelevance(new URL(entry.getKey()), relevance);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            throw new LinkClassifierException(ex.getMessage());
        }
        return linkRel;
    }

    /** Wraps a feature vector into a weight-1 Weka instance bound to {@link #instances}. */
    private weka.core.Instance toWekaInstance(Instance instance) {
        weka.core.Instance instanceWeka = new weka.core.Instance(1, instance.getValues());
        instanceWeka.setDataset(instances);
        return instanceWeka;
    }

    /** Returns, in order, the attribute names that contain {@code marker}. */
    private static String[] attributesContaining(String[] attributes, String marker) {
        List<String> matches = new ArrayList<String>();
        for (int i = 0; i < attributes.length; i++) {
            if (attributes[i].contains(marker)) {
                matches.add(attributes[i]);
            }
        }
        return matches.toArray(new String[matches.size()]);
    }

    /**
     * Deserializes a Weka classifier from {@code path}, always releasing the file
     * handle, even when deserialization fails.
     *
     * <p>This uses native Java deserialization, so the file must come from a
     * trusted source.
     */
    private static Classifier loadClassifier(String path)
            throws IOException, ClassNotFoundException {
        InputStream is = new FileInputStream(path);
        try {
            ObjectInputStream objectInputStream = new ObjectInputStream(is);
            return (Classifier) objectInputStream.readObject();
        } finally {
            // Closing the underlying stream also covers the case where the
            // ObjectInputStream constructor itself throws.
            is.close();
        }
    }

    /**
     * Manual smoke test: builds a classifier from the parameter file given as
     * {@code args[0]} (keys {@code STOPLIST_FILES}, {@code ATTRIBUTES},
     * {@code FILE_CLASSIFIER}), classifies one hard-coded link neighborhood and
     * prints its relevance. Any failure is printed as a stack trace.
     */
    public static void main(String[] args) {
        try {
            ParameterFile config = new ParameterFile(args[0]);
            StopList stoplist = new StopListFile(config.getParam("STOPLIST_FILES"));
            LinkNeighborhoodWrapper wrapper = new LinkNeighborhoodWrapper(stoplist);
            String[] attributes = config.getParam("ATTRIBUTES", " ");

            // Group attribute names by the field they belong to.
            String[][] fieldWords = new String[WordField.FIELD_NAMES.length][];
            fieldWords[WordField.URLFIELD] = attributesContaining(attributes, "url_");
            fieldWords[WordField.ANCHOR] = attributesContaining(attributes, "anchor_");
            fieldWords[WordField.AROUND] = attributesContaining(attributes, "around_");
            wrapper.setFeatures(fieldWords);

            Classifier classifier = loadClassifier(config.getParam("FILE_CLASSIFIER"));

            // Dataset header: one attribute per feature name, then the class
            // attribute as the last column.
            weka.core.FastVector vectorAtt = new weka.core.FastVector();
            for (int i = 0; i < attributes.length; i++) {
                vectorAtt.addElement(new weka.core.Attribute(attributes[i]));
            }
            vectorAtt.addElement(new weka.core.Attribute("class", attributes.length));
            System.out.println("SIZE:" + vectorAtt.size());
            Instances insts = new Instances("link_classification", vectorAtt, 1);
            System.out.println("SIZE" + attributes.length);
            insts.setClassIndex(attributes.length);

            LinkClassifier linkClassifier =
                    new LinkClassifierRegression(classifier, insts, wrapper, attributes);

            LinkNeighborhood ln = new LinkNeighborhood(new URL("http://www.new.com/sport"));
            ln.setAnchor(new String[]{"advertis", "subscrib", "opinion", "site", "obituari"});
            ln.setAround(new String[]{"advertis", "subscrib", "opinion", "site", "obituari"});
            LinkRelevance lr = linkClassifier.classify(ln);
            System.out.println(lr.getRelevance());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}