package focusedCrawler.target.classifier; import java.io.File; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import focusedCrawler.util.string.StopListFile; public class TargetClassifierFactory { private static final Logger logger = LoggerFactory.getLogger(TargetClassifierFactory.class); public static TargetClassifier create(String modelPath) throws IOException { logger.info("Loading TargetClassifier..."); Path basePath = Paths.get(modelPath); Path configPath = Paths.get(modelPath, "/pageclassifier.yml"); File configFile = Paths.get(modelPath, "pageclassifier.yml").toFile(); if(configFile.exists() && configFile.canRead()) { ObjectMapper yaml = new ObjectMapper(new YAMLFactory()); JsonNode tree = yaml.readTree(configFile); String classifierType = tree.get("type").asText(); JsonNode parameters = tree.get("parameters"); logger.info("TARGET_CLASSIFIER: "+classifierType); TargetClassifier classifier = null; if("url_regex".equals(classifierType)) { classifier = new UrlRegexTargetClassifier.Builder().build(basePath, yaml, parameters); } if("title_regex".equals(classifierType)) { classifier = new TitleRegexTargetClassifier.Builder().build(basePath, yaml, parameters); } if("body_regex".equals(classifierType)) { classifier = new BodyRegexTargetClassifier.Builder().build(basePath, yaml, parameters); } if("regex".equals(classifierType)) { classifier = new RegexTargetClassifier.Builder().build(basePath , yaml, parameters); } if("keep_link_relevance".equals(classifierType)) { classifier = new KeepLinkRelevanceTargetClassifier.Builder().build(basePath, yaml, parameters); } if("weka".equals(classifierType)) { classifier = new WekaTargetClassifier.Builder().build(basePath, yaml, parameters); } if(classifier != null) { return classifier; } else { String errorMsg = "Could not instantiate classifier using config: " + configPath; throw new IllegalArgumentException(errorMsg); } } // create classic weka classifer to maintain compatibility with older versions return WekaTargetClassifier.create(modelPath, 0.5, StopListFile.DEFAULT); } }