TestMultipleOutputs1.java example

Explorer
spork-streaming-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.zebra.pig;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.zebra.BaseTestCase;
import org.apache.hadoop.zebra.mapreduce.ZebraOutputPartition;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.data.Tuple;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.util.Iterator;
import junit.framework.Assert;

/**
 * Assume the input files contain rows of word and count, separated by a tab
 * (rendered below as a space):
 * 
 * <pre>
 * us 2
 * japan 2
 * india 4
 * us 2
 * japan 1
 * india 3
 * nouse 5
 * nowhere 4
 * </pre>
 */
public class TestMultipleOutputs1 extends BaseTestCase implements Tool {
  // Full path of the generated input file; assigned in setUp() from inputFileName.
  static String inputPath;
  // File name of the generated input data; resolved through getTableFullPath() in setUp().
  static String inputFileName = "multi-input.txt";
  // NOTE(review): not referenced anywhere in this file; presumably read by other code — confirm before removing.
  public static String sortKey = null;

  /**
   * Prepares the fixture before each test: runs the base-class init(),
   * resolves the full path of the input file and writes the sample rows to it.
   *
   * @throws Exception if initialization or writing the input file fails
   */
  @Before
  public void setUp() throws Exception {
    init();

    // Resolve once, publish to the shared static, then generate the file there.
    final String resolvedInput = getTableFullPath(inputFileName).toString();
    inputPath = resolvedInput;
    writeToFile(resolvedInput);
  }
  
  /**
   * Shuts the Pig server down after each test, but only when running in
   * local mode; in any other mode this method does nothing.
   *
   * @throws Exception declared for callers; nothing is thrown explicitly here
   */
  @After
  public void tearDown() throws Exception {
    if (mode != TestMode.local) {
      return;
    }
    pigServer.shutdown();
  }
  
  /**
   * Writes the eight tab-separated (word, count) sample rows to the given file.
   *
   * In local mode the file is written through java.io; in cluster mode it is
   * created through the shared {@code fs} file system handle. In either case
   * the output stream is closed even when a write fails.
   *
   * @param inputFile path of the file to create
   * @throws IOException if the file cannot be created or written
   */
  public static void writeToFile (String inputFile) throws IOException{
    // Single source of truth for the fixture so both modes write identical data.
    final String[] rows = {
      "us\t2\n",
      "japan\t2\n",
      "india\t4\n",
      "us\t2\n",
      "japan\t1\n",
      "india\t3\n",
      "nouse\t5\n",
      "nowhere\t4\n",
    };

    if (mode == TestMode.local) {
      BufferedWriter out = new BufferedWriter(new FileWriter(inputFile));
      try {
        for (String row : rows) {
          out.write(row);
        }
      } finally {
        // Close in finally so a failed write does not leak the file handle.
        out.close();
      }
    }

    if (mode == TestMode.cluster) {
      FSDataOutputStream fout = fs.create(new Path (inputFile));
      try {
        for (String row : rows) {
          fout.writeBytes(row);
        }
      } finally {
        fout.close();
      }
    }
  }
  
  // Test no sort key;
  /**
   * Loads the input unsorted, stores it through TableStorer with the custom
   * output partitioner, and checks the rows that ended up in each table.
   * Rows are expected in the order they appear in the input file.
   */
  @Test
  public void test1() throws Exception {
    // Load data;
    registerAndEcho("records = LOAD '" + inputPath + "' as (word:chararray, count:int);");
    dumpRecords();

    // Store using multiple outputs;
    String outputPaths = "us,india,japan";
    clearOutputTables();
    registerAndEcho("store records into '" + outputPaths + "' using org.apache.hadoop.zebra.pig.TableStorer('[word,count]'," +
        "'org.apache.hadoop.zebra.pig.TestMultipleOutputs1$OutputPartitionerClass');");

    // Validate results; "nouse"/"nowhere" fall into the partitioner's default table.
    assertTableRows("us", new Object[][] {{"us", 2}, {"us", 2}, {"nouse", 5}, {"nowhere", 4}});
    assertTableRows("india", new Object[][] {{"india", 4}, {"india", 3}});
    assertTableRows("japan", new Object[][] {{"japan", 2}, {"japan", 1}});
  }

  //Test sort key on word;
  /**
   * Same flow as test1, but the records are ordered by word before storing.
   */
  @Test
  public void test2() throws Exception {
    // Load data;
    registerAndEcho("a = LOAD '" + inputPath + "' as (word:chararray, count:int);");
    registerAndEcho("records = order a by word;");
    dumpRecords();

    // Store using multiple outputs;
    storeSortedRecords();

    // Validate results;
    // NOTE(review): the relative order of rows sharing a word (e.g. india 4 before 3)
    // is kept exactly as the original assertions had it — confirm it is guaranteed.
    assertTableRows("us", new Object[][] {{"nouse", 5}, {"nowhere", 4}, {"us", 2}, {"us", 2}});
    assertTableRows("india", new Object[][] {{"india", 4}, {"india", 3}});
    assertTableRows("japan", new Object[][] {{"japan", 2}, {"japan", 1}});
  }

  //Test sort key on word and count;
  /**
   * Same flow as test2, but the records are ordered by word and then count.
   */
  @Test
  public void test3() throws Exception {
    // Load data;
    registerAndEcho("a = LOAD '" + inputPath + "' as (word:chararray, count:int);");
    registerAndEcho("records = order a by word, count;");
    dumpRecords();

    // Store using multiple outputs;
    storeSortedRecords();

    // Validate results;
    assertTableRows("us", new Object[][] {{"nouse", 5}, {"nowhere", 4}, {"us", 2}, {"us", 2}});
    assertTableRows("india", new Object[][] {{"india", 3}, {"india", 4}});
    assertTableRows("japan", new Object[][] {{"japan", 1}, {"japan", 2}});
  }

  /** Prints a query with the "query = " prefix and registers it with the Pig server. */
  private void registerAndEcho(String query) throws Exception {
    System.out.println("query = " + query);
    pigServer.registerQuery(query);
  }

  /** Prints every tuple currently produced by the "records" alias. */
  private void dumpRecords() throws Exception {
    Iterator<Tuple> it = pigServer.openIterator("records");
    while (it.hasNext()) {
      System.out.println(it.next());
    }
  }

  /** Removes the three output tables so that a store starts from a clean state. */
  private void clearOutputTables() throws Exception {
    removeDir(getTableFullPath("us"));
    removeDir(getTableFullPath("india"));
    removeDir(getTableFullPath("japan"));
  }

  /** Clears the output tables, stores "records" via PigServer.store and asserts the job reported no exception. */
  private void storeSortedRecords() throws Exception {
    String outputPaths = "us,india,japan";
    clearOutputTables();
    ExecJob pigJob = pigServer
      .store(
        "records",
        outputPaths,
        TableStorer.class.getCanonicalName() +
             "('[word,count]', 'org.apache.hadoop.zebra.pig.TestMultipleOutputs1$OutputPartitionerClass')");

    Assert.assertNull(pigJob.getException());
  }

  /**
   * Loads a table with TableLoader and asserts that it contains exactly the
   * given (word, count) rows, in order.
   *
   * @param tableName table path passed to the LOAD statement
   * @param expectedRows expected rows, each a two-element {word, count} array
   */
  private void assertTableRows(String tableName, Object[][] expectedRows) throws Exception {
    String query = "records = LOAD '" + tableName
        + "' USING org.apache.hadoop.zebra.pig.TableLoader();";
    System.out.println(query);
    pigServer.registerQuery(query);

    int count = 0;
    Iterator<Tuple> it = pigServer.openIterator("records");
    while (it.hasNext()) {
      Tuple row = it.next();
      System.out.println(row);
      // Surplus rows are only counted here; the size assertion below reports them.
      if (count < expectedRows.length) {
        Assert.assertEquals(expectedRows[count][0], row.get(0));
        Assert.assertEquals(expectedRows[count][1], row.get(1));
      }
      count++;
    }
    // Expected value first, so a failure message reads the right way round.
    Assert.assertEquals(expectedRows.length, count);
  }
  
  //Negative test case: invalid partition class;
  /**
   * Orders the input by word and count, then attempts a store that names a
   * partitioner class which does not exist. JUnit expects the attempt to end
   * in an IOException.
   */
  @Test (expected = IOException.class)
  public void testNegative1() throws ParseException, IOException,
      org.apache.hadoop.zebra.parser.ParseException, Exception {
    // Load data;
    final String loadQuery = "a = LOAD '" + inputPath + "' as (word:chararray, count:int);";
    System.out.println("query = " + loadQuery);
    pigServer.registerQuery(loadQuery);

    final String orderQuery = "records = order a by word, count;";
    System.out.println("query = " + orderQuery);
    pigServer.registerQuery(orderQuery);

    for (Iterator<Tuple> rows = pigServer.openIterator("records"); rows.hasNext();) {
      System.out.println(rows.next());
    }

    // Store using multiple outputs;
    final String targets = "us,india,japan";
    removeDir(getTableFullPath("us"));
    removeDir(getTableFullPath("india"));
    removeDir(getTableFullPath("japan"));

    // The second storer argument deliberately names a class that does not exist.
    final String storerSpec = TableStorer.class.getCanonicalName() +
        "('[word,count]', 'org.apache.hadoop.zebra.pig.notexistingclass')";
    pigServer.store("records", targets, storerSpec);
  }
  
  /**
   * Output partitioner used by the tests in this file: picks the output table
   * for a row from the word in its first column.
   *
   * The returned indices line up with the "us,india,japan" path list the tests
   * store into: 0 for "us", 1 for "india", 2 for "japan". Any other word
   * (including a null or non-String value) goes to index 0.
   */
  public static class OutputPartitionerClass extends ZebraOutputPartition {

    /**
     * @param key row key; not used by this partitioner
     * @param value row whose first field holds the word
     * @return index of the output table for this row
     * @throws IllegalStateException if the first field cannot be read
     */
    @Override
    public int getOutputPartition(BytesWritable key, Tuple value) {
      final Object word;
      try {
        word = value.get(0);
      } catch (Exception e) {
        // Previously swallowed, which surfaced later as an unrelated
        // NullPointerException; fail here with the real cause attached.
        throw new IllegalStateException(
            "Unable to read the partition column from tuple: " + value, e);
      }

      // Constant-first equals keeps the comparison safe for null words.
      if ("us".equals(word)) {
        return 0;
      }
      if ("india".equals(word)) {
        return 1;
      }
      if ("japan".equals(word)) {
        return 2;
      }

      return 0;
    }
  }

  /**
   * Runs all tests of this class in sequence outside of JUnit, on a fresh
   * instance, wrapping each one in setUp()/tearDown().
   *
   * The negative test is expected to throw an IOException (see its
   * {@code @Test(expected = ...)} annotation). Because JUnit is not driving
   * the call here, that expectation is checked explicitly; otherwise the
   * expected exception would abort the whole run.
   *
   * @param args command-line arguments; not used
   * @return 0 when every test completed as expected
   * @throws Exception if a positive test fails, or setUp/tearDown fails
   */
  @Override
  public int run(String[] args) throws Exception {
    TestMultipleOutputs1 test = new TestMultipleOutputs1();
    
    test.setUp();
    test.test1();
    test.tearDown();
    
    test.setUp();
    test.test2();
    test.tearDown();    

    test.setUp();
    test.test3();
    test.tearDown();    

    test.setUp();
    try {
      test.testNegative1();
      // Reaching this line means the invalid partition class was accepted.
      Assert.fail("testNegative1 was expected to throw an IOException");
    } catch (IOException expected) {
      // This is the outcome the negative test asserts under JUnit.
    } finally {
      test.tearDown();
    }
    
    return 0;
  }
  
  /**
   * Command-line entry point: creates a fresh Configuration, runs this class
   * through ToolRunner, prints "PASS" and exits with the status returned by
   * the run.
   *
   * @param args arguments forwarded to ToolRunner
   * @throws Exception if the run throws
   */
  public static void main(String[] args) throws Exception {
    conf = new Configuration();

    final int exitCode = ToolRunner.run(conf, new TestMultipleOutputs1(), args);
    System.out.println("PASS");
    System.exit(exitCode);
  }
}