/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test;

import static org.junit.Assert.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.apache.pig.ExecType;
import org.apache.pig.PigRunner;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.InterStorage;
import org.apache.pig.impl.io.TFileRecordReader;
import org.apache.pig.impl.io.TFileRecordWriter;
import org.apache.pig.impl.io.TFileStorage;
import org.apache.pig.tools.pigstats.OutputStats;
import org.apache.pig.tools.pigstats.PigStats;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;

public class TestTmpFileCompression {
    private PigServer pigServer;
    static MiniCluster cluster = MiniCluster.buildCluster();
    File logFile;

    @Before
    public void setUp() throws Exception {
        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    }

    private void resetLog(Class<?> clazz) throws Exception {
        if (logFile != null)
            logFile.delete();
        Logger logger = Logger.getLogger(clazz);
        logger.removeAllAppenders();
        logger.setLevel(Level.DEBUG);
        SimpleLayout layout = new SimpleLayout();
        logFile = File.createTempFile("log", "");
        FileAppender appender = new FileAppender(layout, logFile.toString(),
                        false, false, 0);
        logger.addAppender(appender);
    }

    public boolean checkLogFileMessage(String[] messages) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(logFile));
            String logMessage = "";
            String line;
            while ((line = reader.readLine()) != null) {
                logMessage = logMessage + line + "\n";
            }
            for (int i = 0; i < messages.length; i++) {
                if (!logMessage.contains(messages[i]))
                    return false;
            }
            return true;
        } catch (IOException e) {
            return false;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // ignore close failure; the check result is already determined
                }
            }
        }
    }

    @After
    public void tearDown() throws Exception {
        if (logFile != null)
            logFile.delete();
    }

    @AfterClass
    public static void oneTimeTearDown() throws Exception {
        cluster.shutDown();
    }

    @Test
    public void testImplicitSplitUncompressed() throws Exception {
        resetLog(InterStorage.class);
        int LOOP_SIZE = 20;
        String[] input = new String[LOOP_SIZE];
        for (int i = 1; i <= LOOP_SIZE; i++) {
            input[i - 1] = Integer.toString(i);
        }
        String inputFileName = "testImplicitSplit-input.txt";
        Util.createInputFile(cluster, inputFileName, input);
        pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
        pigServer.registerQuery("B = filter A by $0<=10;");
        pigServer.registerQuery("C = filter A by $0>10;");
        pigServer.registerQuery("D = union B,C;");
        Iterator<Tuple> iter = pigServer.openIterator("D");
        if (!iter.hasNext())
            fail("No Output received");
        int cnt = 0;
        while (iter.hasNext()) {
            Tuple t = iter.next();
            ++cnt;
        }
        assertEquals(20, cnt);
        Util.deleteFile(cluster, inputFileName);
        assertTrue(checkLogFileMessage(new String[] { "Pig Internal storage in use" }));
    }

    @Test
    public void testImplicitSplitInCoGroupUncompressed() throws Exception {
        // this query is similar to the one reported in JIRA - PIG-537
        // Create input file
        resetLog(InterStorage.class);
        String input1 = "testImplicitSplitInCoGroup-input1.txt";
        String input2 = "testImplicitSplitInCoGroup-input2.txt";
        Util.createInputFile(cluster, input1, new String[] { "a:1", "b:2",
                        "b:20", "c:3", "c:30" });
        Util.createInputFile(cluster, input2, new String[] { "a:first",
                        "b:second", "c:third" });
        pigServer.registerQuery("a = load '" + input1
                        + "' using PigStorage(':') as (name:chararray, marks:int);");
        pigServer.registerQuery("b = load '" + input2
                        + "' using PigStorage(':') as (name:chararray, rank:chararray);");
        pigServer.registerQuery("c = cogroup a by name, b by name;");
        pigServer.registerQuery("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
        pigServer.registerQuery("e = cogroup a by marks, d by newmarks;");
        pigServer.registerQuery("f = foreach e generate group, flatten(a), flatten(d);");
        HashMap<Integer, Object[]> results = new HashMap<Integer, Object[]>();
        results.put(1, new Object[] { "a", 1, "a", 1 });
        results.put(2, new Object[] { "b", 2, "b", 2 });
        results.put(3, new Object[] { "c", 3, "c", 3 });
        results.put(20, new Object[] { "b", 20, "b", 20 });
        results.put(30, new Object[] { "c", 30, "c", 30 });
        Iterator<Tuple> it = pigServer.openIterator("f");
        while (it.hasNext()) {
            Tuple t = it.next();
            System.err.println("Tuple:" + t);
            Integer group = (Integer) t.get(0);
            Object[] groupValues = results.get(group);
            for (int i = 0; i < 4; i++) {
                assertEquals(groupValues[i], t.get(i + 1));
            }
        }
        Util.deleteFile(cluster, input1);
        Util.deleteFile(cluster, input2);
        assertTrue(checkLogFileMessage(new String[] { "Pig Internal storage in use" }));
    }

    @Test
    public void testImplicitSplit() throws Exception {
        resetLog(TFileStorage.class);
        int LOOP_SIZE = 20;
        String[] input = new String[LOOP_SIZE];
        for (int i = 1; i <= LOOP_SIZE; i++) {
            input[i - 1] = Integer.toString(i);
        }
        String inputFileName = "testImplicitSplit-input.txt";
        Util.createInputFile(cluster, inputFileName, input);
        pigServer.getPigContext().getProperties().setProperty(
                        "pig.tmpfilecompression", "true");
        pigServer.getPigContext().getProperties().setProperty(
                        "pig.tmpfilecompression.codec", "gz");
        pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
        pigServer.registerQuery("B = filter A by $0<=10;");
        pigServer.registerQuery("C = filter A by $0>10;");
        pigServer.registerQuery("D = union B,C;");
        Iterator<Tuple> iter = pigServer.openIterator("D");
        if (!iter.hasNext())
            fail("No Output received");
        int cnt = 0;
        while (iter.hasNext()) {
            Tuple t = iter.next();
            ++cnt;
        }
        assertEquals(20, cnt);
        Util.deleteFile(cluster, inputFileName);
        assertTrue(checkLogFileMessage(new String[] { "TFile storage in use" }));
    }

    @Test
    public void testImplicitSplitInCoGroup() throws Exception {
        // this query is similar to the one reported in JIRA - PIG-537
        // Create input file
        resetLog(TFileStorage.class);
        String input1 = "testImplicitSplitInCoGroup-input1.txt";
        String input2 = "testImplicitSplitInCoGroup-input2.txt";
        Util.createInputFile(cluster, input1, new String[] { "a:1", "b:2",
                        "b:20", "c:3", "c:30" });
        Util.createInputFile(cluster, input2, new String[] { "a:first",
                        "b:second", "c:third" });
        pigServer.getPigContext().getProperties().setProperty(
                        "pig.tmpfilecompression", "true");
        pigServer.getPigContext().getProperties().setProperty(
                        "pig.tmpfilecompression.codec", "gz");
        pigServer.registerQuery("a = load '" + input1
                        + "' using PigStorage(':') as (name:chararray, marks:int);");
        pigServer.registerQuery("b = load '" + input2
                        + "' using PigStorage(':') as (name:chararray, rank:chararray);");
        pigServer.registerQuery("c = cogroup a by name, b by name;");
        pigServer.registerQuery("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
        pigServer.registerQuery("e = cogroup a by marks, d by newmarks;");
        pigServer.registerQuery("f = foreach e generate group, flatten(a), flatten(d);");
        HashMap<Integer, Object[]> results = new HashMap<Integer, Object[]>();
        results.put(1, new Object[] { "a", 1, "a", 1 });
        results.put(2, new Object[] { "b", 2, "b", 2 });
        results.put(3, new Object[] { "c", 3, "c", 3 });
        results.put(20, new Object[] { "b", 20, "b", 20 });
        results.put(30, new Object[] { "c", 30, "c", 30 });
        Iterator<Tuple> it = pigServer.openIterator("f");
        while (it.hasNext()) {
            Tuple t = it.next();
            System.err.println("Tuple:" + t);
            Integer group = (Integer) t.get(0);
            Object[] groupValues = results.get(group);
            for (int i = 0; i < 4; i++) {
                assertEquals(groupValues[i], t.get(i + 1));
            }
        }
        Util.deleteFile(cluster, input1);
        Util.deleteFile(cluster, input2);
        assertTrue(checkLogFileMessage(new String[] { "TFile storage in use" }));
    }

    // PIG-1977
    @Test
    public void testTFileRecordReader() throws Exception {
        PrintWriter w = new PrintWriter(new FileWriter("1.txt"));
        for (int i = 0; i < 30; i++) {
            w.println("1\tthis is a test for compression of temp files");
        }
        w.close();
        Util.copyFromLocalToCluster(cluster, "1.txt", "1.txt");

        PrintWriter w1 = new PrintWriter(new FileWriter("tfile.pig"));
        w1.println("A = load '1.txt' as (a0:int, a1:chararray);");
        w1.println("B = group A by a0;");
        w1.println("store B into 'tfile' using org.apache.pig.impl.io.TFileStorage();");
        w1.close();

        PrintWriter w2 = new PrintWriter(new FileWriter("tfile2.pig"));
        w2.println("A = load 'tfile' using org.apache.pig.impl.io.TFileStorage() as (a:int, b:bag{(b0:int, b1:chararray)});");
        w2.println("B = foreach A generate flatten($1);");
        w2.println("store B into '2.txt';");
        w2.close();

        try {
            String[] args = { "-Dpig.tmpfilecompression.codec=gz",
                            "-Dtfile.io.chunk.size=100", "tfile.pig" };
            PigStats stats = PigRunner.run(args, null);
            assertTrue(stats.isSuccessful());

            String[] args2 = { "-Dpig.tmpfilecompression.codec=gz",
                            "-Dtfile.io.chunk.size=100", "tfile2.pig" };
            PigStats stats2 = PigRunner.run(args2, null);
            assertTrue(stats2.isSuccessful());

            OutputStats os = stats2.result("B");
            Iterator<Tuple> iter = os.iterator();
            int count = 0;
            String expected = "(1,this is a test for compression of temp files)";
            while (iter.hasNext()) {
                count++;
                assertEquals(expected, iter.next().toString());
            }
            assertEquals(30, count);
        } finally {
            new File("tfile.pig").delete();
            new File("tfile2.pig").delete();
            new File("1.txt").delete();
        }
    }

    @Test
    public void testTFileRecordWriterReaderAndProgress() throws Exception {
        // Create a small tFile by pig tfilewriter, read by tfilereader and
        // make sure that data matches,
        // progress is above zero and increasing
        File tFile = File.createTempFile("test", "tfile");
        Path basicTFile = new Path(tFile.getAbsolutePath());
        // delete the empty file and let TFileRecordWriter create it again.
        tFile.delete();
        Configuration conf = new Configuration();
        conf.set("tfile.io.chunk.size", "100");
        conf.set("fs.default.name", "file:///");
        for (String codec : new String[] { "none", "gz" }) {
            System.err.println("Testing RecordWriter/Reader with codec: " + codec);
            try {
                TFileRecordWriter writer = new TFileRecordWriter(basicTFile, codec, conf);

                Tuple tuple = TupleFactory.getInstance().newTuple(1);
                int LOOP_SIZE = 25000;
                for (int i = 0; i <= LOOP_SIZE; i++) {
                    String key = String.format("%010d", i);
                    tuple.set(0, key);
                    writer.write(null, tuple);
                }
                writer.close(null);

                int size = (int) tFile.length();
                FileSplit split = new FileSplit(basicTFile, 0, size, null);
                TFileRecordReader reader = new TFileRecordReader();
                reader.initialize(split, HadoopShims.createTaskAttemptContext(
                                conf, HadoopShims.createTaskAttemptID("jt", 1, true, 1, 1)));

                float progress = 0, lastprogress = 0;
                int curval = 0, prevval = -1;
                while (reader.nextKeyValue()) {
                    Tuple t = (Tuple) reader.getCurrentValue();
                    curval = Integer.valueOf((String) t.get(0));
                    assertEquals("Unexpected Value", curval, prevval + 1);
                    prevval = curval;

                    progress = reader.getProgress();
                    if (progress != lastprogress) {
                        System.err.println("progress: " + progress);
                    }
                    assertTrue("Progress is not positive", progress > 0);
                    assertTrue("Progress is not increasing", progress >= lastprogress);
                    lastprogress = progress;
                }
                assertEquals("Last value does not match", curval, LOOP_SIZE);
            } finally {
                tFile.delete();
            }
        }
    }
}