/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Basic injector test:
 * 1. Creates a text file with URLs
 * 2. Injects them into crawldb
 * 3. Reads crawldb entries and verifies contents
 * 4. Injects more URLs into crawldb
 * 5. Reads crawldb entries and verifies contents
 */
public class TestInjector {

  private Configuration conf;
  private FileSystem fs;
  final static Path testdir = new Path("build/test/inject-test");
  Path crawldbPath;
  Path urlPath;

  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createConfiguration();
    urlPath = new Path(testdir, "urls");
    crawldbPath = new Path(testdir, "crawldb");
    fs = FileSystem.get(conf);
    if (fs.exists(urlPath))
      fs.delete(urlPath, false);
    if (fs.exists(crawldbPath))
      fs.delete(crawldbPath, true);
  }

  @After
  public void tearDown() throws IOException {
    fs.delete(testdir, true);
  }

  @Test
  public void testInject() throws IOException, ClassNotFoundException,
      InterruptedException {
    ArrayList<String> urls = new ArrayList<String>();
    // Keep the metadata in a separate list so the url list can still be
    // compared with containsAll
    ArrayList<String> metadata = new ArrayList<String>();
    for (int i = 0; i < 100; i++) {
      urls.add("http://zzz.com/" + i + ".html");
      metadata.add("\tnutch.score=2." + i
          + "\tnutch.fetchInterval=171717\tkey=value");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);

    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);

    // verify results
    List<String> read = readCrawldb();

    Collections.sort(read);
    Collections.sort(urls);

    Assert.assertEquals(urls.size(), read.size());

    Assert.assertTrue(read.containsAll(urls));
    Assert.assertTrue(urls.containsAll(read));

    // inject more urls
    ArrayList<String> urls2 = new ArrayList<String>();
    for (int i = 0; i < 100; i++) {
      urls2.add("http://xxx.com/" + i + ".html");
      // These overwrite the previously injected records but must preserve
      // their original metadata
      urls2.add("http://zzz.com/" + i + ".html");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
    injector = new Injector(conf);
    conf.setBoolean("db.injector.update", true);
    injector.inject(crawldbPath, urlPath);
    urls.addAll(urls2);

    // verify results
    read = readCrawldb();

    Collections.sort(read);
    Collections.sort(urls);

    // The crawldb should hold 100 records fewer than the combined url list
    // because the zzz.com urls were injected twice but stored only once
    Assert.assertEquals(urls.size() - 100, read.size());

    Assert.assertTrue(read.containsAll(urls));
    Assert.assertTrue(urls.containsAll(read));

    // Check that the metadata was correctly preserved
    Map<String, CrawlDatum> records = readCrawldbRecords();

    // Iterate over the urls; the http://zzz.com/ prefixed URLs carry the
    // injected metadata, score and fetch interval
    Text writableKey = new Text("key");
    Text writableValue = new Text("value");
    for (String url : urls) {
      if (url.startsWith("http://zzz")) {
        // Check the injected fetch interval
        Assert.assertTrue(records.get(url).getFetchInterval() == 171717);

        // Check that the injected score (nutch.score=2.x) overrode the
        // default score of 1.0
        Assert.assertTrue(records.get(url).getScore() != 1.0);

        // Check the metadata key=value pair
        Assert.assertEquals(writableValue,
            records.get(url).getMetaData().get(writableKey));
      }
    }
  }

  private List<String> readCrawldb() throws IOException {
    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
        + "/part-r-00000/data");
    System.out.println("reading:" + dbfile);
    Option rFile = SequenceFile.Reader.file(dbfile);
    ArrayList<String> read = new ArrayList<String>();
    // try-with-resources closes the reader even if an IOException is thrown
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile)) {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      while (reader.next(key, value)) {
        read.add(key.toString());
      }
    }
    return read;
  }

  private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
        + "/part-r-00000/data");
    System.out.println("reading:" + dbfile);
    Option rFile = SequenceFile.Reader.file(dbfile);
    HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile)) {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      while (reader.next(key, value)) {
        read.put(key.toString(), value);
        // Allocate a fresh CrawlDatum so the value stored in the map is not
        // overwritten by the next call to reader.next()
        value = new CrawlDatum();
      }
    }
    return read;
  }

}