/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import junit.framework.TestCase;
/**
* Basic injector test:
* 1. Creates a text file with urls
* 2. Injects them into crawldb
* 3. Reads crawldb entries and verifies contents
 * 4. Injects more urls into crawldb
* 5. Reads crawldb entries and verifies contents
*
* @author nutch-dev <nutch-dev at lucene.apache.org>
*/
public class TestInjector extends TestCase {
  private Configuration conf;
  private FileSystem fs;
  final static Path testdir=new Path("build/test/inject-test");
  Path crawldbPath;
  Path urlPath;

  /**
   * Creates a fresh configuration and filesystem handle, and removes any
   * seed-url file or crawldb left over from a previous run.
   */
  protected void setUp() throws Exception {
    conf = CrawlDBTestUtil.createConfiguration();
    urlPath=new Path(testdir,"urls");
    crawldbPath=new Path(testdir,"crawldb");
    fs=FileSystem.get(conf);
    if (fs.exists(urlPath)) fs.delete(urlPath, false);
    if (fs.exists(crawldbPath)) fs.delete(crawldbPath, true);
  }

  /** Removes the whole test directory after each test. */
  protected void tearDown() throws IOException{
    fs.delete(testdir, true);
  }

  /**
   * Injects 100 urls (with per-url score, fetch interval and custom metadata),
   * verifies they all land in the crawldb, then injects 200 more urls (100 new
   * plus the 100 originals, without metadata) in update mode and verifies that
   * the overwritten records preserved their original metadata, score and
   * fetch interval.
   */
  public void testInject() throws IOException {
    ArrayList<String> urls=new ArrayList<String>();
    // Metadata is kept in a parallel list so the url list alone can still be
    // compared against the crawldb keys with containsAll below.
    ArrayList<String> metadata=new ArrayList<String>();
    for(int i=0;i<100;i++) {
      urls.add("http://zzz.com/" + i + ".html");
      // Per-url score (2.0 .. 2.99), fetch interval, and a custom key=value pair.
      metadata.add("\tnutch.score=2." + i + "\tnutch.fetchInterval=171717\tkey=value");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);

    Injector injector=new Injector(conf);
    injector.inject(crawldbPath, urlPath);

    // verify results: the crawldb must contain exactly the injected urls
    List<String>read=readCrawldb();
    Collections.sort(read);
    Collections.sort(urls);
    assertEquals(urls.size(), read.size());
    assertTrue(read.containsAll(urls));
    assertTrue(urls.containsAll(read));

    // inject more urls: 100 new ones plus the 100 already-injected ones
    ArrayList<String> urls2=new ArrayList<String>();
    for(int i=0;i<100;i++) {
      urls2.add("http://xxx.com/" + i + ".html");
      // Re-injecting existing urls overwrites the records but, with
      // db.injector.update enabled, preserves their original metadata.
      urls2.add("http://zzz.com/" + i + ".html");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
    // Enable update mode before the second inject; the Injector reads this
    // Configuration instance, so the flag is visible at inject time.
    conf.setBoolean("db.injector.update", true);
    injector=new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    urls.addAll(urls2);

    // verify results
    read=readCrawldb();
    Collections.sort(read);
    Collections.sort(urls);
    // 100 fewer records than list entries because the zzz urls were re-injected
    // (overwritten), not duplicated.
    assertEquals(urls.size() - 100, read.size());
    assertTrue(read.containsAll(urls));
    assertTrue(urls.containsAll(read));

    // Check that metadata was correctly preserved on the overwritten records.
    Map<String, CrawlDatum> records = readCrawldbRecords();
    // Only http://zzz.com/ urls carried metadata, score and interval.
    Text writableKey = new Text("key");
    Text writableValue = new Text("value");
    for (String url : urls) {
      if (url.indexOf("http://zzz") == 0) {
        // Original fetch interval must survive the update.
        assertTrue(records.get(url).getFetchInterval() == 171717);
        // Score must not have been reset to the default (1.0).
        assertTrue(records.get(url).getScore() != 1.0);
        // Custom metadata key=value must survive the update.
        assertEquals(writableValue, records.get(url).getMetaData().get(writableKey));
      }
    }
  }

  /**
   * Reads the crawldb and returns its urls (keys) in unspecified order.
   * Delegates to {@link #readCrawldbRecords()} to avoid duplicating the
   * SequenceFile reading logic; callers sort the result before comparing.
   *
   * @return list of all urls currently stored in the crawldb
   * @throws IOException if the crawldb part file cannot be read
   */
  private List<String> readCrawldb() throws IOException{
    return new ArrayList<String>(readCrawldbRecords().keySet());
  }

  /**
   * Reads the crawldb part file and returns a url-to-datum map.
   *
   * @return map from url to its {@link CrawlDatum} record
   * @throws IOException if the crawldb part file cannot be read
   */
  private HashMap<String,CrawlDatum> readCrawldbRecords() throws IOException{
    Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
    System.out.println("reading:" + dbfile);
    SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
    HashMap<String,CrawlDatum> read=new HashMap<String,CrawlDatum>();
    try {
      while (true) {
        // Fresh instances each iteration: the datum is stored in the map, so
        // reusing a single value object would make every entry alias it.
        Text key=new Text();
        CrawlDatum value=new CrawlDatum();
        if(!reader.next(key, value)) break;
        read.put(key.toString(), value);
      }
    } finally {
      // Close the reader even on failure; the original leaked it.
      reader.close();
    }
    return read;
  }
}