package focusedCrawler.target.classifier;
import static java.util.Arrays.asList;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertThat;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.junit.Test;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;
/**
 * Tests for {@link BodyRegexTargetClassifier}: each test builds a {@link Page},
 * attaches parsed data to it, classifies it against a list of regex patterns and
 * asserts the resulting {@link TargetRelevance}.
 */
public class BodyRegexTargetClassifierTest {

    /**
     * Classifies two pages with the same URL but different bodies: one whose HTML
     * contains a link to an {@code .onion} host and one whose HTML does not contain
     * the text "onion" at all. Only the first is expected to be relevant.
     */
    @Test
    public void test() throws MalformedURLException, TargetClassifierException {
        // given
        URL url = new URL("http://example.com");
        String cont = "<html><div><a href=\"http://j6im4v42ur6dpic3.onion/\">TorProject Archive</a></div></html>";
        Page page1 = new Page(url, cont);
        page1.setParsedData(new ParsedData(new PaginaURL(page1)));

        URL url2 = new URL("http://example.com");
        String cont2 = "<html><div><a href=\"http://example.com/\">Garlic Project Archive</a></div></html>";
        Page page2 = new Page(url2, cont2);
        page2.setParsedData(new ParsedData(new PaginaURL(page2)));

        // NOTE(review): the '.' before "onion" is unescaped, so as a regex it matches
        // any character, not only a literal dot -- confirm this is intended.
        List<String> patterns = asList(".*[a-zA-Z0-9]*.onion.*");
        BodyRegexTargetClassifier classifier = new BodyRegexTargetClassifier(patterns);

        // when
        TargetRelevance relevance1 = classifier.classify(page1);
        TargetRelevance relevance2 = classifier.classify(page2);

        // then
        assertThat(relevance1.isRelevant(), is(true));
        assertThat(relevance2.isRelevant(), is(false));
    }

    /**
     * Classifies a page whose content is loaded from the classpath fixture
     * {@code body_regex_classifier/test-file.html} and expects the pattern
     * {@code .*ebola.*} to mark it as relevant. Going by the test name, the fixture
     * is expected to span multiple lines.
     */
    @Test
    public void shouldMatchHtmlFileWithMultipleLines() throws Exception {
        // given
        String resourceName = "body_regex_classifier/test-file.html";
        // Class.getResource returns null for a missing resource; fail with a message
        // that names the fixture instead of a bare NullPointerException on toURI().
        URL resource = getClass().getResource(resourceName);
        if (resource == null) {
            throw new IllegalStateException("Test resource not found: " + resourceName);
        }
        Path file = Paths.get(resource.toURI());
        URL url = new URL("https://en.wikipedia.org/wiki/Ebola_virus_disease");
        // Decode explicitly as UTF-8 so the result does not depend on the platform
        // default charset of the machine running the test.
        String content = new String(Files.readAllBytes(file), StandardCharsets.UTF_8);
        Page page1 = new Page(url, content);
        page1.setParsedData(new ParsedData(new PaginaURL(page1)));

        List<String> patterns = asList(".*ebola.*");
        BodyRegexTargetClassifier classifier = new BodyRegexTargetClassifier(patterns);

        // when
        TargetRelevance relevance1 = classifier.classify(page1);

        // then
        assertThat(relevance1.isRelevant(), is(true));
    }
}