package uk.bl.wa.analyser.payload;
/*
* #%L
* warc-indexer
* %%
* Copyright (C) 2015 State and University Library, Denmark
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecordMetaData;
import org.junit.Test;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
* @author Toke Eskildsen <te@statsbiblioteket.dk>
*
*/
/**
 * Unit test for {@code HTMLAnalyser}: feeds a sample HTML resource through the
 * analyser and verifies that link, host, domain and public-suffix fields are
 * extracted and canonicalized into the {@code SolrRecord} as expected.
 */
public class HTMLAnalyserTest {
    // NOTE: The number of extract links is 4. This is correct as the empty
    // String should be discarded.
    @Test
    public void testLinksExtraction() throws IOException {
        // Resolve the sample HTML document and analyser config from the test classpath.
        final URL SAMPLE_RESOURCE = Thread.currentThread().getContextClassLoader().getResource("links_extract.html");
        assertNotNull("The sample file should be resolved", SAMPLE_RESOURCE);
        final File SAMPLE = new File(SAMPLE_RESOURCE.getFile());
        final URL CONF_RESOURCE = Thread.currentThread().getContextClassLoader().getResource("links_extract.conf");
        assertNotNull("The config file should be resolved", CONF_RESOURCE);
        final File CONF = new File(CONF_RESOURCE.getFile());
        Config config = ConfigFactory.parseFile(CONF);
        HTMLAnalyser ha = new HTMLAnalyser(config);

        // Minimal ARC record metadata; several values are deliberately
        // invalid/dummy — only content-type and length matter to the analyser.
        Map<String, Object> core = new HashMap<String, Object>();
        core.put("subject-uri", "NotPresent");
        core.put("ip-address", "192.168.1.10");
        core.put("creation-date", "Invalid");
        core.put("content-type", "text/html");
        core.put("length", Long.toString(SAMPLE.length()));
        core.put("version", "InvalidVersion");
        core.put("absolute-offset", "0");
        ArchiveRecordHeader header = new ARCRecordMetaData("invalid", core);
        SolrRecord solr = new SolrRecord();

        // try-with-resources ensures the stream is closed even if analyse() throws.
        try (InputStream in = new BufferedInputStream(
                new FileInputStream(SAMPLE), (int) SAMPLE.length())) {
            // Mark the whole file so the analyser may reset and re-read it.
            in.mark((int) SAMPLE.length());
            ha.analyse(header, in, solr);
        }

        // Check number of links:
        assertEquals("The number of links should be correct", 4,
                solr.getField(SolrFields.SOLR_LINKS).getValueCount());
        // Check hosts are canonicalized:
        assertEquals("The number of hosts should be correct", 1,
                solr.getField(SolrFields.SOLR_LINKS_HOSTS).getValueCount());
        String host = (String) solr.getField(SolrFields.SOLR_LINKS_HOSTS)
                .getFirstValue();
        assertEquals("The host should be formatted correctly", "example.org",
                host);
        // The domains and suffixes too:
        String domain = (String) solr.getField(SolrFields.SOLR_LINKS_DOMAINS)
                .getFirstValue();
        assertEquals("The domain should be formatted correctly", "example.org",
                domain);
        String suffix = (String) solr.getField(
                SolrFields.SOLR_LINKS_PUBLIC_SUFFIXES).getFirstValue();
        assertEquals("The suffix should be formatted correctly", "org",
                suffix);
    }
}