package uk.bl.wa.analyser.payload; /* * #%L * warc-indexer * %% * Copyright (C) 2015 State and University Library, Denmark * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import junit.framework.TestCase; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.Log; import org.archive.io.ArchiveRecordHeader; import uk.bl.wa.solr.SolrRecord; import java.util.Map; import java.util.Set; public class WARCPayloadAnalysersTest extends TestCase { private static Log log = LogFactory.getLog(WARCPayloadAnalysersTest.class); public void testConfig() { ARCNameAnalyser ana = getAnalyser(); assertEquals("The expected number of rules should be created", 7, ana.getRules().size()); assertEquals("The number of templates for the first rule should be correct", 2, ana.getRules().get(0).templates.size()); } public void testSampleRule() { ARCNameAnalyser ana = getAnalyser(); ArchiveRecordHeader header = new FakeHeader("whatever/localrun-job87-20150219-133227.warc"); SolrRecord solr = new SolrRecord(); ana.analyse(header, null, solr); assertEquals("The solr documents should have the right content for field harvest_job", "job87", (solr.getFieldValue("harvest_job").toString())); assertEquals("The solr documents should have the right content for field harvest_year", "2015", solr.getFieldValue("harvest_year").toString()); } public void testSBRules() { // Local rules used at Statsbiblioteket ARCNameAnalyser ana = getAnalyser(); for (String test[]: new String[][]{ { "arc_orig:sb, arc_harvesttime:2008-02-21T00:35:33.000Z, arc_job:25666, arc_harvest:33, " + "arc_name:25666-33-20080221003533-00046-sb-prod-har-004.arc, " + "arc_full:25666-33-20080221003533-00046-sb-prod-har-004.arc", "25666-33-20080221003533-00046-sb-prod-har-004.arc" }, { "arc_orig:sb, arc_harvesttime:2007-04-18T02:46:37.000Z, arc_job:15626, arc_harvest:38, " + "arc_name:15626-38-20070418024637-00385-sb-prod-har-001.statsbiblioteket.dk.arc, " + "arc_full:/netarkiv/0101/filedir/15626-38-20070418024637-00385-sb-prod-har-001.statsbiblioteket.dk.arc", "/netarkiv/0101/filedir/15626-38-20070418024637-00385-sb-prod-har-001.statsbiblioteket.dk.arc" }, { "arc_orig:kb, arc_harvesttime:2007-04-18T16:37:59.000Z, arc_job:15638, arc_harvest:38, " + "arc_name:15638-38-20070418163759-00235-kb-prod-har-002.kb.dk.arc, " + "arc_full:somepath/15638-38-20070418163759-00235-kb-prod-har-002.kb.dk.arc", "somepath/15638-38-20070418163759-00235-kb-prod-har-002.kb.dk.arc" }, { "arc_orig:kb, arc_harvesttime:2013-11-11T17:55:47.000Z, arc_job:193305, arc_harvest:197, " + "arc_name:193305-197-20131111175547-00001-kb228081.kb.dk.warc, " + "arc_full:193305-197-20131111175547-00001-kb228081.kb.dk.warc", "193305-197-20131111175547-00001-kb228081.kb.dk.warc" }, { "arc_orig:kb, arc_harvesttime:2012-10-18T21:02:45.000Z, arc_job:36861, " + "arc_name:kb-pligtsystem-36861-20121018210245-00000.warc, " + "arc_full:kb-pligtsystem-36861-20121018210245-00000.warc", "kb-pligtsystem-36861-20121018210245-00000.warc" }, { "arc_orig:metadata, " + "arc_name:1298-metadata-2.arc, " + "arc_full:anotherpath/1298-metadata-2.arc", "anotherpath/1298-metadata-2.arc" }, { "arc_orig:unknown, " + "arc_name:ksjvksjfvsk, " + "arc_full:ksjvksjfvsk", "ksjvksjfvsk" } }) { SolrRecord solr = new SolrRecord(); ana.analyse(new FakeHeader(test[1]), null, solr); for (String expectedPair:test[0].split(" *, *")) { String[] tokens = expectedPair.split(":", 2); Object valO = solr.getFieldValue(tokens[0]); String value = valO == null ? "N/A" : valO.toString(); assertEquals("The parsing of " + test[1] + " should have the right content for field " + tokens[0], tokens[1], value); } } } private ARCNameAnalyser getAnalyser() { Config conf = ConfigFactory.parseURL( Thread.currentThread().getContextClassLoader().getResource("arcnameanalyser.conf")); return new ARCNameAnalyser(conf); } private class FakeHeader implements ArchiveRecordHeader { private final String arcPath; public FakeHeader(String arcPath) { this.arcPath = arcPath; } @Override public String getDate() { return null; } @Override public long getLength() { return 0; } @Override public long getContentLength() { return 0; } @Override public String getUrl() { return null; } @Override public String getMimetype() { return null; } @Override public String getVersion() { return null; } @Override public long getOffset() { return 0; } @Override public Object getHeaderValue(String key) { return null; } @Override public Set<String> getHeaderFieldKeys() { return null; } @Override public Map<String, Object> getHeaderFields() { return null; } @Override public String getReaderIdentifier() { return arcPath; } @Override public String getRecordIdentifier() { return null; } @Override public String getDigest() { return null; } @Override public int getContentBegin() { return 0; } } }