package focusedCrawler.target.classifier; import static org.hamcrest.Matchers.is; import static org.junit.Assert.assertThat; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; import org.apache.commons.io.IOUtils; import org.junit.Test; import focusedCrawler.target.model.Page; import focusedCrawler.target.model.ParsedData; import focusedCrawler.util.parser.PaginaURL; public class RegexTargetClassifierTest { @Test public void testRegexClassifierMatcheConfig1() throws TargetClassifierException, IOException { // given String path = ClassifierFactoryTest.class.getResource("regex_classifier_config/").getPath(); String url1 = "http://example.com/foo"; String con1 = "<html><div><a href=\"http://j6im4v42ur6dpic3.onion/\">Page 1, foo</a></div></html>"; Page page1 = createPage(url1, con1); String url2 = "http://example.com/?category=1"; String con2 = "<html><div><a href=\"http://example.com/\">Page 2, foo</a></div></html>"; Page page2 = createPage(url2, con2); String url3 = "http://example.com/?post=1"; String con3 = "<html><div><a href=\"http://example.com/\">Page 2, bar</a></div></html>"; Page page3 = createPage(url3, con3); String url4 = "http://example.com/?post=1"; String con4 = "<html><div><a href=\"http://example.com/\">Page 2, asdf</a></div></html>"; Page page4 = createPage(url4, con4); RegexTargetClassifier classifier = (RegexTargetClassifier) TargetClassifierFactory.create(path); // then assertThat(classifier.classify(page1).isRelevant(), is(false)); assertThat(classifier.classify(page2).isRelevant(), is(true)); assertThat(classifier.classify(page3).isRelevant(), is(true)); assertThat(classifier.classify(page4).isRelevant(), is(false)); } @Test public void testRegexClassifierMatcheConfig2() throws TargetClassifierException, IOException { // given String config = ClassifierFactoryTest.class.getResource("regex_classifier/config_jobs/").getPath(); String pageFile = "https%3A%2F%2Fmarkettrack.com%2Fcareers%2Fjob-openings"; InputStream fileInput = ClassifierFactoryTest.class.getResourceAsStream("regex_classifier/"+pageFile); String url = URLDecoder.decode(pageFile, "UTF-8"); String content = IOUtils.toString(fileInput, "UTF-8"); Page page = createPage(url, content); RegexTargetClassifier classifier = (RegexTargetClassifier) TargetClassifierFactory.create(config); // then assertThat(classifier.classify(page).isRelevant(), is(true)); } private Page createPage(String urlStr, String cont) throws MalformedURLException { URL url = new URL(urlStr); Page page1 = new Page(url, cont); page1.setParsedData(new ParsedData(new PaginaURL(page1))); return page1; } }