/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import junit.framework.TestCase;
/**
* Basic injector test:
* 1. Creates a text file with urls
* 2. Injects them into crawldb
* 3. Reads crawldb entries and verifies contents
* 4. Injects more urls into crawldb
* 5. Reads crawldb entries and verifies contents
*
* @author nutch-dev <nutch-dev at lucene.apache.org>
*/
public class TestInjector extends TestCase {
private Configuration conf;
private FileSystem fs;
final static Path testdir=new Path("build/test/inject-test");
Path crawldbPath;
Path urlPath;
protected void setUp() throws Exception {
conf = CrawlDBTestUtil.createConfiguration();
urlPath=new Path(testdir,"urls");
crawldbPath=new Path(testdir,"crawldb");
fs=FileSystem.get(conf);
if (fs.exists(urlPath)) fs.delete(urlPath, false);
if (fs.exists(crawldbPath)) fs.delete(crawldbPath, true);
}
protected void tearDown() throws IOException{
fs.delete(testdir, true);
}
public void testInject() throws IOException {
ArrayList<String> urls=new ArrayList<String>();
for(int i=0;i<100;i++) {
urls.add("http://zzz.com/" + i + ".html");
}
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
Injector injector=new Injector(conf);
injector.inject(crawldbPath, urlPath);
// verify results
List<String>read=readCrawldb();
Collections.sort(read);
Collections.sort(urls);
assertEquals(urls.size(), read.size());
assertTrue(read.containsAll(urls));
assertTrue(urls.containsAll(read));
//inject more urls
ArrayList<String> urls2=new ArrayList<String>();
for(int i=0;i<100;i++) {
urls2.add("http://xxx.com/" + i + ".html");
}
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
injector.inject(crawldbPath, urlPath);
urls.addAll(urls2);
// verify results
read=readCrawldb();
Collections.sort(read);
Collections.sort(urls);
assertEquals(urls.size(), read.size());
assertTrue(read.containsAll(urls));
assertTrue(urls.containsAll(read));
}
private List<String> readCrawldb() throws IOException{
Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
System.out.println("reading:" + dbfile);
SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
ArrayList<String> read=new ArrayList<String>();
READ:
do {
Text key=new Text();
CrawlDatum value=new CrawlDatum();
if(!reader.next(key, value)) break READ;
read.add(key.toString());
} while(true);
return read;
}
}