/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;
public class TestLinkDbMerger extends TestCase {
private static final Logger LOG = Logger.getLogger(TestLinkDbMerger.class.getName());
String url10 = "http://example.com/foo";
String[] urls10 = new String[] {
"http://example.com/100",
"http://example.com/101"
};
String url11 = "http://example.com/";
String[] urls11 = new String[] {
"http://example.com/110",
"http://example.com/111"
};
String url20 = "http://example.com/";
String[] urls20 = new String[] {
"http://foo.com/200",
"http://foo.com/201"
};
String url21 = "http://example.com/bar";
String[] urls21 = new String[] {
"http://foo.com/210",
"http://foo.com/211"
};
String[] urls10_expected = urls10;
String[] urls11_expected = new String[] {
urls11[0],
urls11[1],
urls20[0],
urls20[1]
};
String[] urls20_expected = urls11_expected;
String[] urls21_expected = urls21;
TreeMap init1 = new TreeMap();
TreeMap init2 = new TreeMap();
HashMap expected = new HashMap();
Configuration conf;
Path testDir;
FileSystem fs;
LinkDbReader reader;
public void setUp() throws Exception {
init1.put(url10, urls10);
init1.put(url11, urls11);
init2.put(url20, urls20);
init2.put(url21, urls21);
expected.put(url10, urls10_expected);
expected.put(url11, urls11_expected);
expected.put(url20, urls20_expected);
expected.put(url21, urls21_expected);
conf = NutchConfiguration.create();
fs = FileSystem.get(conf);
testDir = new Path("build/test/test-linkdb-" +
new java.util.Random().nextInt());
fs.mkdirs(testDir);
}
public void tearDown() {
try {
if (fs.exists(testDir))
fs.delete(testDir, true);
} catch (Exception e) { }
try {
reader.close();
} catch (Exception e) { }
}
public void testMerge() throws Exception {
Configuration conf = NutchConfiguration.create();
FileSystem fs = FileSystem.get(conf);
fs.mkdirs(testDir);
Path linkdb1 = new Path(testDir, "linkdb1");
Path linkdb2 = new Path(testDir, "linkdb2");
Path output = new Path(testDir, "output");
createLinkDb(conf, fs, linkdb1, init1);
createLinkDb(conf, fs, linkdb2, init2);
LinkDbMerger merger = new LinkDbMerger(conf);
LOG.fine("* merging linkdbs to " + output);
merger.merge(output, new Path[]{linkdb1, linkdb2}, false, false);
LOG.fine("* reading linkdb: " + output);
reader = new LinkDbReader(conf, output);
Iterator it = expected.keySet().iterator();
while (it.hasNext()) {
String url = (String)it.next();
LOG.fine("url=" + url);
String[] vals = (String[])expected.get(url);
Inlinks inlinks = reader.getInlinks(new Text(url));
// may not be null
assertNotNull(inlinks);
ArrayList links = new ArrayList();
Iterator it2 = inlinks.iterator();
while (it2.hasNext()) {
Inlink in = (Inlink)it2.next();
links.add(in.getFromUrl());
}
for (int i = 0; i < vals.length; i++) {
LOG.fine(" -> " + vals[i]);
assertTrue(links.contains(vals[i]));
}
}
reader.close();
fs.delete(testDir, true);
}
private void createLinkDb(Configuration config, FileSystem fs, Path linkdb, TreeMap init) throws Exception {
LOG.fine("* creating linkdb: " + linkdb);
Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(), Text.class, Inlinks.class);
Iterator it = init.keySet().iterator();
while (it.hasNext()) {
String key = (String)it.next();
Inlinks inlinks = new Inlinks();
String[] vals = (String[])init.get(key);
for (int i = 0; i < vals.length; i++) {
Inlink in = new Inlink(vals[i], vals[i]);
inlinks.add(in);
}
writer.append(new Text(key), inlinks);
}
writer.close();
}
}