package uk.bl.wa.indexer; /* * #%L * warc-indexer * $Id:$ * $HeadURL:$ * %% * Copyright (C) 2013 - 2014 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import static org.junit.Assert.assertEquals; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; import org.archive.io.ArchiveRecord; import org.archive.util.ArchiveUtils; import org.junit.Test; import uk.bl.wa.solr.SolrRecord; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigValue; import com.typesafe.config.ConfigValueFactory; public class WARCIndexerTest { /** * Check timestamp parsing is working correctly, as various forms exist in the ARCs and WARCs. */ private static final String TIMESTAMP_12 = "200009200005"; private static final String TIMESTAMP_14 = "20000920000545"; private static final String TIMESTAMP_16 = "2000092000054543"; private static final String TIMESTAMP_17 = "20000920000545439"; @Test public void testExtractYear() { assertEquals("2000", WARCIndexer.extractYear(TIMESTAMP_16)); } @Test public void testParseCrawlDate() { assertEquals("2000-09-20T00:05:00Z", WARCIndexer.parseCrawlDate(TIMESTAMP_12)); assertEquals("2000-09-20T00:05:45Z", WARCIndexer.parseCrawlDate(TIMESTAMP_14)); assertEquals("2000-09-20T00:05:45Z", WARCIndexer.parseCrawlDate(TIMESTAMP_16)); assertEquals("2000-09-20T00:05:45Z", WARCIndexer.parseCrawlDate(TIMESTAMP_17)); } /** * Check URL and extension parsing is robust enough. */ @Test public void testParseExtension() { assertEquals("png", WARCIndexer.parseExtension("http://host/image.png")); assertEquals("png", WARCIndexer.parseExtension("http://host/this/that/image.parseExtension.png")); // FIXME Get some bad extensions from the current Solr server and check we can deal with them //fail("Not implemented yet!"); } /** * Test protocol filtering is working ok. * * @throws NoSuchAlgorithmException * @throws IOException * @throws MalformedURLException */ @Test public void testProtocolFilters() throws NoSuchAlgorithmException, MalformedURLException, IOException { // Check protocol excludes: String path = "warc.index.extract.protocol_include"; List<String> protocols = new ArrayList<String>(); protocols.add("http"); protocols.add("https"); this.testFilterBehaviour(path, protocols, 29); protocols.remove("http"); this.testFilterBehaviour(path, protocols, 34); } /** * Test URL filtering is working ok. * * @throws NoSuchAlgorithmException * @throws IOException * @throws MalformedURLException */ @Test public void testUrlFilters() throws NoSuchAlgorithmException, MalformedURLException, IOException { // Now URL excludes: String path = "warc.index.extract.url_exclude"; List<String> url_excludes = new ArrayList<String>(); this.testFilterBehaviour(path, url_excludes, 29); url_excludes.add("robots.txt"); this.testFilterBehaviour(path, url_excludes, 30); } /** * Test reponse code filtering is working ok. * * @throws NoSuchAlgorithmException * @throws IOException * @throws MalformedURLException */ @Test public void testResponseCodeFilters() throws NoSuchAlgorithmException, MalformedURLException, IOException { // Now URL excludes: String path = "warc.index.extract.response_include"; List<String> response_includes = new ArrayList<String>(); this.testFilterBehaviour(path, response_includes, 36); response_includes.add("2"); this.testFilterBehaviour(path, response_includes, 29); response_includes.add("3"); this.testFilterBehaviour(path, response_includes, 20); } /** * * @throws MalformedURLException * @throws NoSuchAlgorithmException * @throws IOException */ @Test public void testExclusionFilter() throws MalformedURLException, NoSuchAlgorithmException, IOException { Config config = ConfigFactory.load(); // Enable excusion: config = this.modifyValueAt(config, "warc.index.exclusions.enabled", true); // config exclusion file: File exclusions_file = new File("src/test/resources/exclusion_test.txt"); assertEquals( true, exclusions_file.exists()); config = this.modifyValueAt(config, "warc.index.exclusions.file", exclusions_file.getAbsolutePath() ); // config check interval: config = this.modifyValueAt(config, "warc.index.exclusions.check_interval", 600); // And run the trial: this.testFilterBehaviourWithConfig(config, 32); } /* ------------------------------------------------------------ */ /* * Internal implementations of filter test core methods. */ /* ------------------------------------------------------------ */ private void testFilterBehaviour(String path, Object newValue, int expectedNullCount ) throws MalformedURLException, IOException, NoSuchAlgorithmException { // Override the config: Config config = ConfigFactory.load(); Config config2 = this.modifyValueAt(config, path, newValue); // And invoke: this.testFilterBehaviourWithConfig(config2, expectedNullCount); } private Config modifyValueAt(Config config, String path, Object newValue ) { ConfigValue value = ConfigValueFactory.fromAnyRef( newValue ); return config.withValue(path, value); } private void testFilterBehaviourWithConfig(Config config2, int expectedNullCount ) throws MalformedURLException, IOException, NoSuchAlgorithmException { // Instanciate the indexer: WARCIndexer windex = new WARCIndexer(config2); windex.setCheckSolrForDuplicates(false); String inputFile = "src/test/resources/IAH-urls-wget.warc.gz"; System.out.println("ArchiveUtils.isGZipped: "+ArchiveUtils.isGzipped( new FileInputStream(inputFile))); ArchiveReader reader = ArchiveReaderFactory.get(inputFile); Iterator<ArchiveRecord> ir = reader.iterator(); int recordCount = 0; int nullCount = 0; // Iterate though each record in the WARC file while( ir.hasNext() ) { ArchiveRecord rec = ir.next(); SolrRecord doc = windex.extract("",rec); recordCount++; if( doc == null ) { nullCount++; } else { // System.out.println("DOC: " + doc.toXml()); } } System.out.println("recordCount: "+recordCount); assertEquals(36, recordCount); System.out.println("nullCount: "+nullCount); assertEquals(expectedNullCount, nullCount); } }