/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.segment; import java.text.DecimalFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.nutch.parse.ParseText; import org.apache.nutch.util.NutchConfiguration; import junit.framework.TestCase; public class TestSegmentMerger extends TestCase { Configuration conf; FileSystem fs; Path testDir; Path seg1; Path seg2; Path out; int countSeg1, countSeg2; public void setUp() throws Exception { conf = NutchConfiguration.create(); fs = FileSystem.get(conf); long blkSize = fs.getDefaultBlockSize(); testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis()); seg1 = new Path(testDir, "seg1"); seg2 = new Path(testDir, "seg2"); out = new Path(testDir, "out"); // create large parse-text segments System.err.println("Creating large segment 1..."); DecimalFormat df = new DecimalFormat("0000000"); Text k = new Text(); Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000"); MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class); long curSize = 0; countSeg1 = 0; while (curSize < blkSize * 2) { k.set("seg1-" + df.format(countSeg1)); w.append(k, new ParseText("seg1 text " + countSeg1)); countSeg1++; curSize += 40; // roughly ... } w.close(); System.err.println(" - done: " + countSeg1 + " records."); System.err.println("Creating large segment 2..."); ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000"); w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class); curSize = 0; countSeg2 = 0; while (curSize < blkSize * 2) { k.set("seg2-" + df.format(countSeg2)); w.append(k, new ParseText("seg2 text " + countSeg2)); countSeg2++; curSize += 40; // roughly ... } w.close(); System.err.println(" - done: " + countSeg2 + " records."); } public void tearDown() throws Exception { fs.delete(testDir, true); } public void testLargeMerge() throws Exception { SegmentMerger merger = new SegmentMerger(conf); merger.merge(out, new Path[]{seg1, seg2}, false, false, -1); // verify output FileStatus[] stats = fs.listStatus(out); // there should be just one path assertEquals(1, stats.length); Path outSeg = stats[0].getPath(); Text k = new Text(); ParseText v = new ParseText(); MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf); int cnt1 = 0, cnt2 = 0; for (MapFile.Reader r : readers) { while (r.next(k, v)) { String ks = k.toString(); String vs = v.getText(); if (ks.startsWith("seg1-")) { cnt1++; assertTrue(vs.startsWith("seg1 ")); } else if (ks.startsWith("seg2-")) { cnt2++; assertTrue(vs.startsWith("seg2 ")); } } r.close(); } assertEquals(countSeg1, cnt1); assertEquals(countSeg2, cnt2); } }