/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.util.HashMap;
import java.util.Random;
import java.util.TreeSet;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.util.NutchConfiguration;

import junit.framework.TestCase;

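/**
 * Unit test for {@link CrawlDbMerger}. Two crawldbs that overlap on a single
 * URL are merged into one; the merged db must contain each input URL exactly
 * once, and the entry for the shared URL must carry the union of both
 * metadata maps, with values from the more recently fetched datum winning
 * on conflicts.
 */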
public class TestCrawlDbMerger extends TestCase {
  private static final Logger LOG = Logger.getLogger(TestCrawlDbMerger.class.getName());

  String url10 = "http://example.com/";
  String url11 = "http://example.com/foo";
  String url20 = "http://example.com/";
  String url21 = "http://example.com/bar";
  String[] urlsExpected = new String[] { url10, url11, url21 };
  TreeSet<String> init1 = new TreeSet<String>();
  TreeSet<String> init2 = new TreeSet<String>();
  HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
  CrawlDatum cd1, cd2, cd3;
  Configuration conf;
  FileSystem fs;
  Path testDir;
  CrawlDbReader reader;
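
  /**
   * Builds the fixture: two URL sets that overlap on one URL (url10 equals
   * url20) and three datums. cd1 and cd2 populate the two input crawldbs,
   * while cd3 models the expected merge result for the shared URL: the union
   * of both metadata maps, where cd2 (fetched 10 seconds later) overwrites
   * cd1 on the conflicting "name" key.
   */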
  public void setUp() throws Exception {
    init1.add(url10);
    init1.add(url11);
    init2.add(url20);
    init2.add(url21);
    long time = System.currentTimeMillis();
    cd1 = new CrawlDatum();
    cd1.setFetchInterval(1.0f);
    cd1.setFetchTime(time);
    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
    cd2 = new CrawlDatum();
    cd2.setFetchInterval(1.0f);
    cd2.setFetchTime(time + 10000);
    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
    cd3 = new CrawlDatum();
    cd3.setFetchInterval(1.0f);
    cd3.setFetchTime(time + 10000);
    cd3.getMetaData().putAll(cd1.getMetaData());
    cd3.getMetaData().putAll(cd2.getMetaData());
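    // url10 and url20 are the same URL, so the merged db must map it to the
    // combined datum cd3; the URLs unique to each input keep their datums.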
    expected.put(url10, cd3);
    expected.put(url11, cd1);
    expected.put(url21, cd2);
    conf = NutchConfiguration.create();
    fs = FileSystem.get(conf);
    // random suffix so that repeated or concurrent runs do not collide;
    // nextInt(Integer.MAX_VALUE) keeps the name free of a second '-'
    testDir = new Path("test-crawldb-" + new Random().nextInt(Integer.MAX_VALUE));
    fs.mkdirs(testDir);
  }

  public void tearDown() {
    try {
      if (fs.exists(testDir))
        fs.delete(testDir, true); // recursive delete of the temporary dir
    } catch (Exception e) {
      LOG.warning("Could not delete " + testDir + ": " + e);
    }
    try {
      if (reader != null)
        reader.close();
    } catch (Exception e) {
      LOG.warning("Could not close reader: " + e);
    }
  }
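
  /**
   * Creates the two single-part crawldbs, merges them without URL
   * normalization or filtering, and verifies that each expected URL is
   * present in the merged db with exactly the expected datum.
   */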
  public void testMerge() throws Exception {
    Path crawldb1 = new Path(testDir, "crawldb1");
    Path crawldb2 = new Path(testDir, "crawldb2");
    Path output = new Path(testDir, "output");
    createCrawlDb(conf, fs, crawldb1, init1, cd1);
    createCrawlDb(conf, fs, crawldb2, init2, cd2);
    CrawlDbMerger merger = new CrawlDbMerger(conf);
    LOG.fine("* merging crawldbs to " + output);
    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
    LOG.fine("* reading crawldb: " + output);
    reader = new CrawlDbReader();
    String crawlDb = output.toString();
    for (String url : expected.keySet()) {
      CrawlDatum cd = expected.get(url);
      LOG.fine("url=" + url);
      LOG.fine("  expected: " + cd);
      CrawlDatum res = reader.get(crawlDb, url, conf);
      LOG.fine("  actual:   " + res);
      // the merged db must contain every expected URL ...
      assertNotNull(res);
      // ... mapped to exactly the expected datum
      assertEquals(cd, res);
    }
    reader.close();
    fs.delete(testDir, true);
  }
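
  /**
   * Writes a minimal single-part crawldb under crawldb/current, mapping
   * every URL in init to the same datum cd.
   */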
  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
      TreeSet<String> init, CrawlDatum cd) throws Exception {
    LOG.fine("* creating crawldb: " + crawldb);
    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
    MapFile.Writer writer = new MapFile.Writer(config, fs,
        new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class);
    // MapFile requires its keys in sorted order, which the TreeSet guarantees
    for (String key : init) {
      writer.append(new Text(key), cd);
    }
    writer.close();
  }
}