/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.examples.terasort; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.math.BigInteger; import java.util.zip.Checksum; import org.apache.hadoop.util.PureJavaCrc32; /** * A single process data generator for the terasort data. Based on gensort.c * version 1.1 (3 Mar 2009) from Chris Nyberg <chris.nyberg@ordinal.com>. */ public class GenSort { /** * Generate a "binary" record suitable for all sort benchmarks *except* * PennySort. */ static void generateRecord(byte[] recBuf, Unsigned16 rand, Unsigned16 recordNumber) { /* generate the 10-byte key using the high 10 bytes of the 128-bit * random number */ for(int i=0; i < 10; ++i) { recBuf[i] = rand.getByte(i); } /* add 2 bytes of "break" */ recBuf[10] = 0x00; recBuf[11] = 0x11; /* convert the 128-bit record number to 32 bits of ascii hexadecimal * as the next 32 bytes of the record. */ for (int i = 0; i < 32; i++) { recBuf[12 + i] = (byte) recordNumber.getHexDigit(i); } /* add 4 bytes of "break" data */ recBuf[44] = (byte) 0x88; recBuf[45] = (byte) 0x99; recBuf[46] = (byte) 0xAA; recBuf[47] = (byte) 0xBB; /* add 48 bytes of filler based on low 48 bits of random number */ for(int i=0; i < 12; ++i) { recBuf[48+i*4] = recBuf[49+i*4] = recBuf[50+i*4] = recBuf[51+i*4] = (byte) rand.getHexDigit(20 + i); } /* add 4 bytes of "break" data */ recBuf[96] = (byte) 0xCC; recBuf[97] = (byte) 0xDD; recBuf[98] = (byte) 0xEE; recBuf[99] = (byte) 0xFF; } private static BigInteger makeBigInteger(long x) { byte[] data = new byte[8]; for(int i=0; i < 8; ++i) { data[i] = (byte) (x >>> (56 - 8*i)); } return new BigInteger(1, data); } private static final BigInteger NINETY_FIVE = new BigInteger("95"); /** * Generate an ascii record suitable for all sort benchmarks including * PennySort. */ static void generateAsciiRecord(byte[] recBuf, Unsigned16 rand, Unsigned16 recordNumber) { /* generate the 10-byte ascii key using mostly the high 64 bits. */ long temp = rand.getHigh8(); if (temp < 0) { // use biginteger to avoid the negative sign problem BigInteger bigTemp = makeBigInteger(temp); recBuf[0] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue())); temp = bigTemp.divide(NINETY_FIVE).longValue(); } else { recBuf[0] = (byte) (' ' + (temp % 95)); temp /= 95; } for(int i=1; i < 8; ++i) { recBuf[i] = (byte) (' ' + (temp % 95)); temp /= 95; } temp = rand.getLow8(); if (temp < 0) { BigInteger bigTemp = makeBigInteger(temp); recBuf[8] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue())); temp = bigTemp.divide(NINETY_FIVE).longValue(); } else { recBuf[8] = (byte) (' ' + (temp % 95)); temp /= 95; } recBuf[9] = (byte)(' ' + (temp % 95)); /* add 2 bytes of "break" */ recBuf[10] = ' '; recBuf[11] = ' '; /* convert the 128-bit record number to 32 bits of ascii hexadecimal * as the next 32 bytes of the record. */ for (int i = 0; i < 32; i++) { recBuf[12 + i] = (byte) recordNumber.getHexDigit(i); } /* add 2 bytes of "break" data */ recBuf[44] = ' '; recBuf[45] = ' '; /* add 52 bytes of filler based on low 48 bits of random number */ for(int i=0; i < 13; ++i) { recBuf[46+i*4] = recBuf[47+i*4] = recBuf[48+i*4] = recBuf[49+i*4] = (byte) rand.getHexDigit(19 + i); } /* add 2 bytes of "break" data */ recBuf[98] = '\r'; /* nice for Windows */ recBuf[99] = '\n'; } private static void usage() { PrintStream out = System.out; out.println("usage: gensort [-a] [-c] [-bSTARTING_REC_NUM] NUM_RECS FILE_NAME"); out.println("-a Generate ascii records required for PennySort or JouleSort."); out.println(" These records are also an alternative input for the other"); out.println(" sort benchmarks. Without this flag, binary records will be"); out.println(" generated that contain the highest density of randomness in"); out.println(" the 10-byte key."); out.println( "-c Calculate the sum of the crc32 checksums of each of the"); out.println(" generated records and send it to standard error."); out.println("-bN Set the beginning record generated to N. By default the"); out.println(" first record generated is record 0."); out.println("NUM_RECS The number of sequential records to generate."); out.println("FILE_NAME The name of the file to write the records to.\n"); out.println("Example 1 - to generate 1000000 ascii records starting at record 0 to"); out.println("the file named \"pennyinput\":"); out.println(" gensort -a 1000000 pennyinput\n"); out.println("Example 2 - to generate 1000 binary records beginning with record 2000"); out.println("to the file named \"partition2\":"); out.println(" gensort -b2000 1000 partition2"); System.exit(1); } public static void outputRecords(OutputStream out, boolean useAscii, Unsigned16 firstRecordNumber, Unsigned16 recordsToGenerate, Unsigned16 checksum ) throws IOException { byte[] row = new byte[100]; Unsigned16 recordNumber = new Unsigned16(firstRecordNumber); Unsigned16 lastRecordNumber = new Unsigned16(firstRecordNumber); Checksum crc = new PureJavaCrc32(); Unsigned16 tmp = new Unsigned16(); lastRecordNumber.add(recordsToGenerate); Unsigned16 ONE = new Unsigned16(1); Unsigned16 rand = Random16.skipAhead(firstRecordNumber); while (!recordNumber.equals(lastRecordNumber)) { Random16.nextRand(rand); if (useAscii) { generateAsciiRecord(row, rand, recordNumber); } else { generateRecord(row, rand, recordNumber); } if (checksum != null) { crc.reset(); crc.update(row, 0, row.length); tmp.set(crc.getValue()); checksum.add(tmp); } recordNumber.add(ONE); out.write(row); } } public static void main(String[] args) throws Exception { Unsigned16 startingRecord = new Unsigned16(); Unsigned16 numberOfRecords; OutputStream out; boolean useAscii = false; Unsigned16 checksum = null; int i; for(i=0; i < args.length; ++i) { String arg = args[i]; int argLength = arg.length(); if (argLength >= 1 && arg.charAt(0) == '-') { if (argLength < 2) { usage(); } switch (arg.charAt(1)) { case 'a': useAscii = true; break; case 'b': startingRecord = Unsigned16.fromDecimal(arg.substring(2)); break; case 'c': checksum = new Unsigned16(); break; default: usage(); } } else { break; } } if (args.length - i != 2) { usage(); } numberOfRecords = Unsigned16.fromDecimal(args[i]); out = new FileOutputStream(args[i+1]); outputRecords(out, useAscii, startingRecord, numberOfRecords, checksum); out.close(); if (checksum != null) { System.out.println(checksum); } } }