/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.impl.window;

import org.apache.drill.common.util.TestTools;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Stand-alone generator for the window-function test resources.
 * <p>
 * For every table it writes, below {@code <working path>/src/test/resources/window}:
 * <ul>
 *   <li>{@code <table>/<n>.data.json}: the rows in random order, at most {@link #BATCH_SIZE} rows per file;</li>
 *   <li>{@code <table>.tsv} and {@code <table>.oby.tsv}: results computed from the "no partition by" layout;</li>
 *   <li>{@code <table>.pby.tsv} and {@code <table>.pby.oby.tsv}: results computed from the "partition by" layout.</li>
 * </ul>
 */
public class GenerateTestData {
  /** Maximum number of rows written to a single data file. */
  private static final int BATCH_SIZE = 20;

  /**
   * Fluent builder for an array of {@link Partition}s. Call {@link #partition(int)} to start a partition,
   * {@code sub(...)} to append subs to it, and {@link #build()} to obtain the linked partitions.
   */
  private static class Builder {
    List<Partition> partitions = new ArrayList<>();

    // state of the partition currently being described
    int cur_length;
    List<Integer> cur_subs = new ArrayList<>();
    List<Integer> cur_subs_size = new ArrayList<>();

    /**
     * Starts a new partition, first storing the one being described (if any).
     *
     * @param length number of rows of the new partition
     * @return this builder
     */
    Builder partition(int length) {
      if (cur_length > 0) {
        addPartition();
      }

      cur_length = length;
      cur_subs.clear();
      cur_subs_size.clear();
      return this;
    }

    /**
     * Adds a sub whose declared size equals its id.
     *
     * @param subId id of the sub
     * @return this builder
     */
    Builder sub(int subId) {
      return sub(subId, subId);
    }

    /**
     * Adds a sub to the current partition.
     *
     * @param subId id of the sub
     * @param num declared number of rows of the sub
     * @return this builder
     */
    Builder sub(int subId, int num) {
      cur_subs.add(subId);
      cur_subs_size.add(num);
      return this;
    }

    /** Stores the partition currently being described; the sub lists are copied into arrays. */
    void addPartition() {
      partitions.add(
        new Partition(cur_length,
          cur_subs.toArray(new Integer[cur_subs.size()]),
          cur_subs_size.toArray(new Integer[cur_subs_size.size()])));
    }

    /**
     * Stores the pending partition (if any) and links every partition to its predecessor.
     *
     * @return all partitions, in declaration order
     */
    Partition[] build() {
      if (cur_length > 0) {
        addPartition();
      }

      // set previous partitions
      for (int i = 1; i < partitions.size(); i++) {
        partitions.get(i).previous = partitions.get(i - 1);
      }

      return partitions.toArray(new Partition[partitions.size()]);
    }
  }

  /**
   * A run of {@link #length} consecutive rows, split into "subs" (groups of rows sharing the same sub id).
   * Row numbers are global: a partition starts right after the cumulated length of its predecessors.
   */
  private static class Partition {
    /** Partition that precedes this one, or {@code null} for the first partition. */
    Partition previous;
    final int length;
    /** Sub ids; {@link #getSubIndex(int)} binary-searches this array, so it must be in ascending order. */
    final Integer[] subs;
    /** Declared size of each sub, parallel to {@link #subs}. */
    final Integer[] subs_sizes;

    public Partition(int length, Integer[] subs, Integer[] sub_sizes) {
      this.length = length;
      this.subs = subs;
      this.subs_sizes = sub_sizes;
    }

    /**
     * @return total number of rows since first partition, this partition included
     */
    public int cumulLength() {
      int prevLength = previous != null ? previous.cumulLength() : 0;
      return length + prevLength;
    }

    /**
     * @return true if the global row number falls inside this partition
     */
    public boolean isPartOf(int rowNumber) {
      int prevLength = previous != null ? previous.cumulLength() : 0;
      return rowNumber >= prevLength && rowNumber < cumulLength();
    }

    /**
     * @return position of the sub inside {@link #subs}, as computed by {@link Arrays#binarySearch(Object[], Object)}
     */
    public int getSubIndex(final int sub) {
      return Arrays.binarySearch(subs, sub);
    }

    /**
     * @return number of rows of the sub: its declared size, except for the last sub which
     *         gets whatever is left to reach the partition length
     */
    public int getSubSize(int sub) {
      if (sub != subs[subs.length - 1]) {
        return subs_sizes[getSubIndex(sub)];
      } else {
        //last sub has enough rows to reach partition length
        int size = length;
        for (int i = 0; i < subs.length - 1; i++) {
          size -= subs_sizes[i];
        }
        return size;
      }
    }

    /**
     * @return sub id of the sub that contains rowNumber
     */
    public int getSubId(int rowNumber) {
      assert isPartOf(rowNumber) : "row "+rowNumber+" isn't part of this partition";

      int prevLength = previous != null ? previous.cumulLength() : 0;
      rowNumber -= prevLength; // row num from start of this partition

      for (int s : subs) {
        if (rowNumber < subRunningCount(s)) {
          return s;
        }
      }

      throw new RuntimeException("should never happen!");
    }

    /**
     * @return running count of rows from first row of the partition to current sub, this sub included
     */
    public int subRunningCount(int sub) {
      int count = 0;
      for (int s : subs) {
        count += getSubSize(s);
        if (s == sub) {
          break;
        }
      }
      return count;
    }

    /**
     * @return running sum of salaries from first row of the partition to current sub, this sub included
     */
    public int subRunningSum(int sub) {
      int sum = 0;
      for (int s : subs) {
        // every row of sub s has a salary of s + 10
        sum += (s+10) * getSubSize(s);
        if (s == sub) {
          break;
        }
      }
      return sum;
    }

    /**
     * @return sum of salaries for all rows of the partition
     */
    public int totalSalary() {
      return subRunningSum(subs[subs.length-1]);
    }
  }

  private static Partition[] dataB1P1() {
    // partition rows 20, subs [1, 2, 3, 4, 5, 6]
    return new Builder()
      .partition(20).sub(1).sub(2).sub(3).sub(4).sub(5).sub(6)
      .build();
  }

  private static Partition[] dataB1P2(boolean pby) {
    // partition rows 10, subs [1, 2, 3, 4]
    // partition rows 10, subs [4, 5, 6]
    if (pby) {
      return new Builder()
        .partition(10).sub(1).sub(2).sub(3).sub(4)
        .partition(10).sub(4).sub(5).sub(6)
        .build();
    } else {
      return new Builder()
        .partition(20).sub(1).sub(2).sub(3).sub(4, 8).sub(5).sub(6)
        .build();
    }
  }

  private static Partition[] dataB2P2(boolean pby) {
    // partition rows 20, subs [3, 5, 9]
    // partition rows 20, subs [9, 10]
    if (pby) {
      return new Builder()
        .partition(20).sub(3).sub(5).sub(9)
        .partition(20).sub(9).sub(10)
        .build();
    } else {
      return new Builder()
        .partition(40).sub(3).sub(5).sub(9, 12 + 9).sub(10)
        .build();
    }
  }

  private static Partition[] dataB2P4(boolean pby) {
    // partition rows 5, subs [1, 2, 3]
    // partition rows 10, subs [3, 4, 5]
    // partition rows 15, subs [5, 6, 7]
    // partition rows 10, subs [7, 8]
    if (pby) {
      return new Builder()
        .partition(5).sub(1).sub(2).sub(3)
        .partition(10).sub(3).sub(4).sub(5)
        .partition(15).sub(5).sub(6).sub(7)
        .partition(10).sub(7).sub(8)
        .build();
    } else {
      return new Builder()
        .partition(40).sub(1).sub(2).sub(3, 5).sub(4).sub(5, 8).sub(6).sub(7, 11).sub(8)
        .build();
    }
  }

  private static Partition[] dataB3P2(boolean pby) {
    // partition rows 5, subs [1, 2, 3]
    // partition rows 55, subs [4, 5, 7, 8, 9, 10, 11, 12]
    if (pby) {
      return new Builder()
        .partition(5).sub(1).sub(2).sub(3)
        .partition(55).sub(4).sub(5).sub(7).sub(8).sub(9).sub(10).sub(11).sub(12)
        .build();
    } else {
      return new Builder()
        .partition(60).sub(1).sub(2).sub(3, 2).sub(4).sub(5).sub(7).sub(8).sub(9).sub(10).sub(11).sub(12)
        .build();
    }
  }

  private static Partition[] dataB4P4(boolean pby) {
    // partition rows 10, subs [1, 2, 3]
    // partition rows 30, subs [3, 4, 5, 6, 7, 8]
    // partition rows 20, subs [8, 9, 10]
    // partition rows 20, subs [10, 11]
    if (pby) {
      return new Builder()
        .partition(10).sub(1).sub(2).sub(3)
        .partition(30).sub(3).sub(4).sub(5).sub(6).sub(7).sub(8)
        .partition(20).sub(8).sub(9).sub(10)
        .partition(20).sub(10).sub(11)
        .build();
    } else {
      return new Builder()
        .partition(80).sub(1).sub(2).sub(3, 10)
        .sub(4).sub(5).sub(6).sub(7).sub(8, 13)
        .sub(9).sub(10, 13).sub(11, 10)
        .build();
    }
  }

  /**
   * Writes one JSON row per employee, in random order, into {@code <path>/<n>.data.json} files holding
   * at most {@link #BATCH_SIZE} rows each. At least one file is always created.
   *
   * @param path folder that receives the data files
   * @param partitions partitions describing the rows; the employee id of a row is its global row number
   * @param addLineNo if true, every row also carries a {@code line_no} field with its 0-based output position
   * @throws FileNotFoundException if a data file cannot be opened for writing
   */
  private static void writeData(final String path, final Partition[] partitions, final boolean addLineNo)
      throws FileNotFoundException {
    // total number of rows
    final int total = partitions[partitions.length - 1].cumulLength();

    // create data rows in random order
    final List<Integer> emp_ids = new ArrayList<>(total);
    for (int i = 0; i < total; i++) {
      emp_ids.add(i);
    }
    Collections.shuffle(emp_ids);

    // data file(s): one try-with-resources per file so the stream is closed even if a row fails
    int fileId = 0;
    int written = 0;
    do {
      final int batchEnd = Math.min(written + BATCH_SIZE, total);
      try (PrintStream dataStream = new PrintStream(path + "/" + fileId + ".data.json")) {
        for (; written < batchEnd; written++) {
          writeRow(dataStream, partitions, emp_ids.get(written), written, addLineNo);
        }
      }

      if (written < total) {
        // more rows to come: report progress before moving on to the next file
        System.out.printf("total: %d, emp_idx: %d, fileID: %d%n", total, written, fileId);
      }
      fileId++;
    } while (written < total);
  }

  /**
   * Writes the JSON row of a single employee.
   *
   * @param dataStream stream of the current data file
   * @param partitions all partitions
   * @param id employee id, i.e. the global row number used to locate the partition and sub
   * @param lineNo 0-based position of the row in the generated output
   * @param addLineNo whether to emit the {@code line_no} field
   */
  private static void writeRow(final PrintStream dataStream, final Partition[] partitions, final int id,
      final int lineNo, final boolean addLineNo) {
    int p = 0;
    while (!partitions[p].isPartOf(id)) {
      p++;
    }

    final int sub = partitions[p].getSubId(id);
    final int salary = 10 + sub;

    if (addLineNo) {
      dataStream.printf("{ \"employee_id\":%d, \"position_id\":%d, \"sub\":%d, \"salary\":%d, \"line_no\":%d }%n",
        id, p + 1, sub, salary, lineNo);
    } else {
      dataStream.printf("{ \"employee_id\":%d, \"position_id\":%d, \"sub\":%d, \"salary\":%d }%n",
        id, p + 1, sub, salary);
    }
  }

  /**
   * Writes the expected results of the window queries, one line per row, into two files:
   * {@code <path><prefix>.tsv} (query without order by) and {@code <path><prefix>.oby.tsv} (query with order by).
   *
   * @param path path of the table folder; the result files are created next to it
   * @param prefix inserted between the path and the file extension
   * @param partitions partitions to compute the results from
   * @throws FileNotFoundException if a result file cannot be opened for writing
   */
  private static void writeResults(final String path, final String prefix, final Partition[] partitions)
      throws FileNotFoundException {
    try (
      // expected results for query without order by clause
      final PrintStream resultStream = new PrintStream(path + prefix + ".tsv");
      // expected results for query with order by clause
      final PrintStream resultOrderStream = new PrintStream(path + prefix + ".oby.tsv")) {

      int idx = 0; // global row number
      for (final Partition partition : partitions) {
        for (int i = 0; i < partition.length; i++, idx++) {
          final int sub = partition.getSubId(idx);
          final int rowNumber = i + 1;
          // all rows of a sub share the rank of the sub's first row
          final int rank = 1 + partition.subRunningCount(sub) - partition.getSubSize(sub);
          final int denseRank = partition.getSubIndex(sub) + 1;
          final double cumeDist = (double) partition.subRunningCount(sub) / partition.length;
          // a single-row partition would divide by zero, its percent rank is 0
          final double percentRank = partition.length == 1 ? 0 : (double)(rank - 1)/(partition.length - 1);

          // each line has: count(*) sum(salary) row_number() rank() dense_rank() cume_dist() percent_rank()
          resultOrderStream.printf("%d\t%d\t%d\t%d\t%d\t%s\t%s%n",
            partition.subRunningCount(sub), partition.subRunningSum(sub),
            rowNumber, rank, denseRank, Double.toString(cumeDist), Double.toString(percentRank));

          // each line has: count(*) sum(salary)
          resultStream.printf("%d\t%d%n", partition.length, partition.totalSalary());
        }
      }
    }
  }

  /**
   * Same as {@link #generateData(String, Partition[], Partition[], boolean)} without the {@code line_no} field.
   */
  private static void generateData(final String tableName, final Partition[] pby_data, final Partition[] nopby_data)
      throws FileNotFoundException {
    generateData(tableName, pby_data, nopby_data, false);
  }

  /**
   * Generates the data files and the four result files of one table.
   * If the table folder cannot be created, an error is printed and nothing is generated for that table.
   *
   * @param tableName name of the table, used as folder name and as result file prefix
   * @param pby_data partitions used for the data files and the ".pby" results
   * @param nopby_data partitions used for the results without partition by
   * @param addLineNo whether the data rows carry a {@code line_no} field
   * @throws FileNotFoundException if one of the output files cannot be opened for writing
   */
  private static void generateData(final String tableName, final Partition[] pby_data, final Partition[] nopby_data,
      final boolean addLineNo) throws FileNotFoundException {
    final String WORKING_PATH = TestTools.getWorkingPath();
    final String TEST_RES_PATH = WORKING_PATH + "/src/test/resources";

    final String path = TEST_RES_PATH+"/window/" + tableName;

    final File pathFolder = new File(path);
    if (!pathFolder.exists()) {
      if (!pathFolder.mkdirs()) {
        System.err.printf("Couldn't create folder %s, exiting%n", path);
        // actually stop here: without the folder every write below would fail
        return;
      }
    }

    writeData(path, pby_data, addLineNo);

    writeResults(path, "", nopby_data);
    writeResults(path, ".pby", pby_data);
  }

  /**
   * Regenerates every window test table.
   *
   * @param args ignored
   * @throws FileNotFoundException if one of the output files cannot be opened for writing
   */
  public static void main(String[] args) throws FileNotFoundException {
    generateData("b1.p1", dataB1P1(), dataB1P1());
    generateData("b1.p2", dataB1P2(true), dataB1P2(false));
    generateData("b2.p2", dataB2P2(true), dataB2P2(false));
    generateData("b2.p4", dataB2P4(true), dataB2P4(false));
    generateData("b3.p2", dataB3P2(true), dataB3P2(false));
    generateData("b4.p4", dataB4P4(true), dataB4P4(false), true);
  }
}