/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.target.classifier;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.nio.file.Path;

import org.xml.sax.SAXException;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import focusedCrawler.target.model.Page;
import focusedCrawler.util.ParameterFile;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
import focusedCrawler.util.vsm.VSMElement;
import focusedCrawler.util.vsm.VSMVector;
import weka.classifiers.Classifier;
import weka.core.Instances;

/**
 * A {@link TargetClassifier} backed by a serialized Weka {@link Classifier}.
 *
 * <p>A page is turned into a numeric feature vector by looking up the weight of each
 * configured attribute (term) in a {@link VSMVector} built from the page content. The
 * Weka classifier then produces a class distribution for that vector, and the page is
 * considered relevant when the first entry of the distribution is strictly greater than
 * the configured relevance threshold.</p>
 *
 * <p>Copyright: Copyright (c) 2004</p>
 *
 * @author Luciano Barbosa
 * @version 1.0
 */
public class WekaTargetClassifier implements TargetClassifier {

    /** Default file name of the serialized Weka model inside a model directory. */
    private static final String DEFAULT_MODEL_FILE = "pageclassifier.model";

    /** Default file name of the feature configuration inside a model directory. */
    private static final String DEFAULT_FEATURES_FILE = "pageclassifier.features";

    private final Classifier classifier;
    private final Instances instances;
    private final String[] attributes;
    private final StopList stoplist;
    private final double relevanceThreshold;

    /**
     * Creates a classifier from already-loaded components.
     *
     * @param classifier         the Weka classifier used to compute the class distribution
     * @param relevanceThreshold a page is relevant when its relevance probability is strictly
     *                           greater than this value
     * @param instances          the Weka dataset header each classified instance is attached to
     * @param attributes         the attribute names whose weights form the feature vector, in order
     * @param stoplist           the stop list handed to {@link VSMVector} when parsing a page
     */
    public WekaTargetClassifier(Classifier classifier,
                                double relevanceThreshold,
                                Instances instances,
                                String[] attributes,
                                StopList stoplist) {
        this.classifier = classifier;
        this.relevanceThreshold = relevanceThreshold;
        this.instances = instances;
        this.attributes = attributes;
        this.stoplist = stoplist;
    }

    /**
     * Classifies a page as relevant or not.
     *
     * @param page the page to classify
     * @return the relevance decision together with the relevance probability
     * @throws TargetClassifierException if the page cannot be parsed or classified
     */
    public TargetRelevance classify(Page page) throws TargetClassifierException {
        try {
            double[] distribution = distributionForInstance(page);
            // Index 0 of the distribution is used as the relevance probability
            // (presumably the first CLASS_VALUES entry is the "relevant" class).
            final double relevanceProbability = distribution[0];
            boolean relevant = relevanceProbability > relevanceThreshold;
            return new TargetRelevance(relevant, relevanceProbability);
        } catch (TargetClassifierException ex) {
            // Already the right type: rethrow instead of wrapping it a second time.
            throw ex;
        } catch (Exception ex) {
            throw new TargetClassifierException(ex.getMessage(), ex);
        }
    }

    /**
     * Computes the class distribution the Weka classifier assigns to a page.
     *
     * @param page the page to classify
     * @return the distribution returned by the classifier
     * @throws TargetClassifierException if feature extraction or classification fails
     */
    private double[] distributionForInstance(Page page) throws TargetClassifierException {
        try {
            double[] values = getValues(page);
            // Calls into the shared classifier are serialized on the classifier object
            // (presumably because the Weka model is not guaranteed to be thread-safe).
            synchronized (classifier) {
                weka.core.Instance instanceWeka = new weka.core.Instance(1, values);
                instanceWeka.setDataset(instances);
                return classifier.distributionForInstance(instanceWeka);
            }
        } catch (Exception ex) {
            throw new TargetClassifierException(ex.getMessage(), ex);
        }
    }

    /**
     * Builds the feature vector of a page: one weight per configured attribute, in the
     * order of {@link #attributes}; attributes absent from the page get weight 0.
     *
     * @param page the page whose content is vectorized
     * @return the feature values, one per attribute
     */
    private double[] getValues(Page page) throws IOException, SAXException {
        VSMVector vsm = new VSMVector(page.getContentAsString(), stoplist, true);
        double[] values = new double[attributes.length];
        for (int i = 0; i < attributes.length; i++) {
            VSMElement elem = vsm.getElement(attributes[i]);
            values[i] = (elem == null) ? 0 : elem.getWeight();
        }
        return values;
    }

    /**
     * Creates a classifier from a model directory containing {@code pageclassifier.model}
     * and {@code pageclassifier.features}.
     *
     * @param modelPath          directory holding the model and features files
     * @param relevanceThreshold threshold above which a page is considered relevant
     * @param stopwordsFile      the stop list to use
     * @return the configured classifier
     * @throws IllegalArgumentException if the model cannot be found or loaded
     */
    public static TargetClassifier create(String modelPath,
                                          double relevanceThreshold,
                                          StopListFile stopwordsFile) throws IOException {
        return create(modelPath + "/" + DEFAULT_MODEL_FILE,
                      modelPath + "/" + DEFAULT_FEATURES_FILE,
                      relevanceThreshold,
                      stopwordsFile);
    }

    /**
     * Creates a classifier from explicit model and features files, loading the stop list
     * from a file, or falling back to {@link StopListFile#DEFAULT} when
     * {@code stopwordsFile} is {@code null} or empty.
     *
     * @param modelFile          path of the serialized Weka classifier
     * @param featureFile        path of the feature configuration file
     * @param relevanceThreshold threshold above which a page is considered relevant
     * @param stopwordsFile      path of the stop words file; may be {@code null} or empty
     * @return the configured classifier
     * @throws IllegalArgumentException if the model cannot be found or loaded
     */
    public static TargetClassifier create(String modelFile,
                                          String featureFile,
                                          double relevanceThreshold,
                                          String stopwordsFile) throws IOException {
        StopListFile stoplist;
        if (stopwordsFile != null && !stopwordsFile.isEmpty()) {
            stoplist = new StopListFile(stopwordsFile);
        } else {
            stoplist = StopListFile.DEFAULT;
        }
        return create(modelFile, featureFile, relevanceThreshold, stoplist);
    }

    /**
     * Creates a classifier from explicit model and features files and a ready stop list.
     * The features file must define the {@code ATTRIBUTES} and {@code CLASS_VALUES}
     * parameters as space-separated lists.
     *
     * @param modelFile          path of the serialized Weka classifier
     * @param featureFile        path of the feature configuration file
     * @param relevanceThreshold threshold above which a page is considered relevant
     * @param stoplist           the stop list to use
     * @return the configured classifier
     * @throws IllegalArgumentException if the model cannot be found, read or deserialized
     */
    public static TargetClassifier create(String modelFile,
                                          String featureFile,
                                          double relevanceThreshold,
                                          StopList stoplist) throws IOException {
        try {
            ParameterFile featureConfig = new ParameterFile(featureFile);
            Classifier classifier = loadClassifier(modelFile);
            String[] attributes = featureConfig.getParam("ATTRIBUTES", " ");
            String[] classValues = featureConfig.getParam("CLASS_VALUES", " ");
            Instances insts = createWekaInstances(attributes, classValues);
            return new WekaTargetClassifier(classifier, relevanceThreshold, insts, attributes, stoplist);
        } catch (FileNotFoundException e) {
            throw new IllegalArgumentException("Could not find file: " + modelFile, e);
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Could not deserialize classifier from file:" + modelFile, e);
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not load classifier.", e);
        }
    }

    /**
     * Deserializes a Weka classifier from a file, closing the streams even when
     * deserialization fails.
     *
     * NOTE(review): this uses Java native deserialization; the model file must come from
     * a trusted source.
     */
    private static Classifier loadClassifier(String modelFile)
            throws IOException, ClassNotFoundException {
        try (InputStream is = new FileInputStream(modelFile);
             ObjectInputStream objectInputStream = new ObjectInputStream(is)) {
            return (Classifier) objectInputStream.readObject();
        }
    }

    /**
     * Builds the Weka dataset header: one numeric attribute per feature name followed by
     * a nominal {@code class} attribute, which is set as the class index.
     */
    private static Instances createWekaInstances(String[] attributes, String[] classValues) {
        weka.core.FastVector vectorAtt = new weka.core.FastVector();
        for (int i = 0; i < attributes.length; i++) {
            vectorAtt.addElement(new weka.core.Attribute(attributes[i]));
        }
        weka.core.FastVector classAtt = new weka.core.FastVector();
        for (int i = 0; i < classValues.length; i++) {
            classAtt.addElement(classValues[i]);
        }
        vectorAtt.addElement(new weka.core.Attribute("class", classAtt));
        Instances instances = new Instances("target_classification", vectorAtt, 1);
        // The class attribute was appended last, right after the feature attributes.
        instances.setClassIndex(attributes.length);
        return instances;
    }

    /**
     * Configuration of a Weka classifier as read from the classifier parameters.
     * Field names mirror the configuration keys and must not be renamed.
     */
    static class WekaClassifierConfig {
        public String features_file = DEFAULT_FEATURES_FILE;
        // Previously defaulted to the features file name, which made the loader try to
        // deserialize the features file as a model when model_file was not configured.
        public String model_file = DEFAULT_MODEL_FILE;
        public String stopwords_file = null;
        public double relevanceThreshold = 0.5;
    }

    /**
     * Builds a {@link WekaTargetClassifier} from a parameter tree, resolving all
     * configured file names against a base path.
     */
    public static class Builder {

        /**
         * @param basePath   directory against which configured file names are resolved
         * @param yaml       mapper used to bind {@code parameters} to the configuration class
         * @param parameters the classifier parameters
         * @return the configured classifier
         * @throws IOException if the parameters cannot be bound
         * @throws IllegalArgumentException if the model cannot be found or loaded
         */
        public TargetClassifier build(Path basePath, ObjectMapper yaml, JsonNode parameters)
                throws IOException {

            WekaClassifierConfig params = yaml.treeToValue(parameters, WekaClassifierConfig.class);

            String modelFile = toAbsolutePath(basePath, params.model_file);
            String featuresFile = toAbsolutePath(basePath, params.features_file);

            // The stop words file is optional: resolving a null path would throw a
            // NullPointerException, so leave it untouched and let create() fall back
            // to the default stop list.
            String stopwordsFile = params.stopwords_file;
            if (stopwordsFile != null && !stopwordsFile.isEmpty()) {
                stopwordsFile = toAbsolutePath(basePath, stopwordsFile);
            }

            return WekaTargetClassifier.create(modelFile, featuresFile,
                                               params.relevanceThreshold, stopwordsFile);
        }

        /** Resolves a file name against the base path and returns its absolute path. */
        private static String toAbsolutePath(Path basePath, String file) {
            return basePath.resolve(file).toFile().getAbsolutePath();
        }
    }

}