/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.txn.compactor;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.cli.CliSessionState;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.CompactionRequest;
import org.apache.hadoop.hive.metastore.api.CompactionType;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.ShowCompactRequest;
import org.apache.hadoop.hive.metastore.api.ShowCompactResponse;
import org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.txn.CompactionInfo;
import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
import org.apache.hadoop.hive.metastore.txn.TxnStore;
import org.apache.hadoop.hive.metastore.txn.TxnUtils;
import org.apache.hadoop.hive.ql.CommandNeedRetryException;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.streaming.DelimitedInputWriter;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.hive.hcatalog.streaming.StreamingConnection;
import org.apache.hive.hcatalog.streaming.StreamingException;
import org.apache.hive.hcatalog.streaming.TransactionBatch;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * End-to-end tests for the ACID compactor components (Initiator, Worker, Cleaner),
 * covering dynamic partitioning, schema evolution, streaming ingest, split-update
 * tables, and compaction-related table properties.
 */
public class TestCompactor {
private static final AtomicInteger salt = new AtomicInteger(new Random().nextInt());
private static final Logger LOG = LoggerFactory.getLogger(TestCompactor.class);
private final String TEST_DATA_DIR = HCatUtil.makePathASafeFileName(System.getProperty("java.io.tmpdir") +
File.separator + TestCompactor.class.getCanonicalName() + "-" + System.currentTimeMillis() + "_" + salt.getAndIncrement());
private final String BASIC_FILE_NAME = TEST_DATA_DIR + "/basic.input.data";
private final String TEST_WAREHOUSE_DIR = TEST_DATA_DIR + "/warehouse";
@Rule
public TemporaryFolder stagingFolder = new TemporaryFolder();
private HiveConf conf;
IMetaStoreClient msClient;
private Driver driver;
@Before
public void setup() throws Exception {
File f = new File(TEST_WAREHOUSE_DIR);
if (f.exists()) {
FileUtil.fullyDelete(f);
}
if(!(new File(TEST_WAREHOUSE_DIR).mkdirs())) {
throw new RuntimeException("Could not create " + TEST_WAREHOUSE_DIR);
}
HiveConf hiveConf = new HiveConf(this.getClass());
hiveConf.setVar(HiveConf.ConfVars.PREEXECHOOKS, "");
hiveConf.setVar(HiveConf.ConfVars.POSTEXECHOOKS, "");
hiveConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, TEST_WAREHOUSE_DIR);
hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName());
hiveConf.setVar(HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
hiveConf.setVar(HiveConf.ConfVars.HIVEMAPREDMODE, "nonstrict");
//"org.apache.hadoop.hive.ql.io.HiveInputFormat"
TxnDbUtil.setConfValues(hiveConf);
TxnDbUtil.cleanDb();
TxnDbUtil.prepDb();
conf = hiveConf;
msClient = new HiveMetaStoreClient(conf);
driver = new Driver(hiveConf);
SessionState.start(new CliSessionState(hiveConf));
int LOOP_SIZE = 3;
String[] input = new String[LOOP_SIZE * LOOP_SIZE];
int k = 0;
for (int i = 1; i <= LOOP_SIZE; i++) {
String si = i + "";
for (int j = 1; j <= LOOP_SIZE; j++) {
String sj = "S" + j + "S";
input[k] = si + "\t" + sj;
k++;
}
}
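    // Produces 9 rows of the form "<i>\tS<j>S", consumed by testStatsAfterCompactionPartTbl.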
createTestDataFile(BASIC_FILE_NAME, input);
}
@After
public void tearDown() {
conf = null;
if(msClient != null) {
msClient.close();
}
if(driver != null) {
driver.close();
}
}
  /**
   * Simple schema evolution: add columns to a dynamically partitioned table.
   * @throws Exception
   */
@Test
public void schemaEvolutionAddColDynamicPartitioningInsert() throws Exception {
String tblName = "dpct";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(ds string)" +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
// First INSERT round.
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " +
"'today'), (2, 'wilma', 'yesterday')", driver);
// ALTER TABLE ... ADD COLUMNS
executeStatementOnDriver("ALTER TABLE " + tblName + " ADD COLUMNS(c int)", driver);
// Validate there is an added NULL for column c.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
ArrayList<String> valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(2, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tfred\tNULL\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\twilma\tNULL\tyesterday", valuesReadFromHiveDriver.get(1));
// Second INSERT round with new inserts into previously existing partition 'yesterday'.
executeStatementOnDriver("insert into " + tblName + " partition (ds) values " +
"(3, 'mark', 1900, 'soon'), (4, 'douglas', 1901, 'last_century'), " +
"(5, 'doc', 1902, 'yesterday')",
driver);
    // Validate the new insertions for column c.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(5, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tfred\tNULL\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\twilma\tNULL\tyesterday", valuesReadFromHiveDriver.get(1));
Assert.assertEquals("3\tmark\t1900\tsoon", valuesReadFromHiveDriver.get(2));
Assert.assertEquals("4\tdouglas\t1901\tlast_century", valuesReadFromHiveDriver.get(3));
Assert.assertEquals("5\tdoc\t1902\tyesterday", valuesReadFromHiveDriver.get(4));
Initiator initiator = new Initiator();
initiator.setThreadId((int)initiator.getId());
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 0);
initiator.setHiveConf(conf);
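    // With stop pre-set to true, run() performs a single scan for compaction candidates
    // and returns instead of looping.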
AtomicBoolean stop = new AtomicBoolean();
stop.set(true);
initiator.init(stop, new AtomicBoolean());
initiator.run();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(4, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
for (int i = 0; i < compacts.size(); i++) {
Assert.assertEquals("default", compacts.get(i).getDbname());
Assert.assertEquals(tblName, compacts.get(i).getTablename());
Assert.assertEquals("initiated", compacts.get(i).getState());
partNames.add(compacts.get(i).getPartitionname());
}
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=last_century", names.get(0));
Assert.assertEquals("ds=soon", names.get(1));
Assert.assertEquals("ds=today", names.get(2));
Assert.assertEquals("ds=yesterday", names.get(3));
// Validate after compaction.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(5, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tfred\tNULL\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\twilma\tNULL\tyesterday", valuesReadFromHiveDriver.get(1));
Assert.assertEquals("3\tmark\t1900\tsoon", valuesReadFromHiveDriver.get(2));
Assert.assertEquals("4\tdouglas\t1901\tlast_century", valuesReadFromHiveDriver.get(3));
Assert.assertEquals("5\tdoc\t1902\tyesterday", valuesReadFromHiveDriver.get(4));
}
@Test
public void schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception {
String tblName = "udpct";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(ds string)" +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " +
"'today'), (2, 'wilma', 'yesterday')", driver);
executeStatementOnDriver("update " + tblName + " set b = 'barney'", driver);
// Validate the update.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
ArrayList<String> valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(2, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tbarney\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\tbarney\tyesterday", valuesReadFromHiveDriver.get(1));
// ALTER TABLE ... ADD COLUMNS
executeStatementOnDriver("ALTER TABLE " + tblName + " ADD COLUMNS(c int)", driver);
// Validate there is an added NULL for column c.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(2, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tbarney\tNULL\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\tbarney\tNULL\tyesterday", valuesReadFromHiveDriver.get(1));
// Second INSERT round with new inserts into previously existing partition 'yesterday'.
executeStatementOnDriver("insert into " + tblName + " partition (ds) values " +
"(3, 'mark', 1900, 'soon'), (4, 'douglas', 1901, 'last_century'), " +
"(5, 'doc', 1902, 'yesterday')",
driver);
    // Validate the new insertions for column c.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(5, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tbarney\tNULL\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\tbarney\tNULL\tyesterday", valuesReadFromHiveDriver.get(1));
Assert.assertEquals("3\tmark\t1900\tsoon", valuesReadFromHiveDriver.get(2));
Assert.assertEquals("4\tdouglas\t1901\tlast_century", valuesReadFromHiveDriver.get(3));
Assert.assertEquals("5\tdoc\t1902\tyesterday", valuesReadFromHiveDriver.get(4));
executeStatementOnDriver("update " + tblName + " set c = 2000", driver);
// Validate the update of new column c, even in old rows.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(5, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tbarney\t2000\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\tbarney\t2000\tyesterday", valuesReadFromHiveDriver.get(1));
Assert.assertEquals("3\tmark\t2000\tsoon", valuesReadFromHiveDriver.get(2));
Assert.assertEquals("4\tdouglas\t2000\tlast_century", valuesReadFromHiveDriver.get(3));
Assert.assertEquals("5\tdoc\t2000\tyesterday", valuesReadFromHiveDriver.get(4));
Initiator initiator = new Initiator();
initiator.setThreadId((int)initiator.getId());
    // Set to 1 so the insert doesn't trigger compaction but the update does
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 1);
initiator.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
stop.set(true);
initiator.init(stop, new AtomicBoolean());
initiator.run();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(4, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
for (int i = 0; i < compacts.size(); i++) {
Assert.assertEquals("default", compacts.get(i).getDbname());
Assert.assertEquals(tblName, compacts.get(i).getTablename());
Assert.assertEquals("initiated", compacts.get(i).getState());
partNames.add(compacts.get(i).getPartitionname());
}
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=last_century", names.get(0));
Assert.assertEquals("ds=soon", names.get(1));
Assert.assertEquals("ds=today", names.get(2));
Assert.assertEquals("ds=yesterday", names.get(3));
// Validate after compaction.
executeStatementOnDriver("SELECT * FROM " + tblName + " ORDER BY a", driver);
valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
Assert.assertEquals(5, valuesReadFromHiveDriver.size());
Assert.assertEquals("1\tbarney\t2000\ttoday", valuesReadFromHiveDriver.get(0));
Assert.assertEquals("2\tbarney\t2000\tyesterday", valuesReadFromHiveDriver.get(1));
Assert.assertEquals("3\tmark\t2000\tsoon", valuesReadFromHiveDriver.get(2));
Assert.assertEquals("4\tdouglas\t2000\tlast_century", valuesReadFromHiveDriver.get(3));
Assert.assertEquals("5\tdoc\t2000\tyesterday", valuesReadFromHiveDriver.get(4));
}
  /**
   * After each major compaction, stats need to be updated on each column of the
   * table/partition which previously had stats.
   * 1. Create a bucketed ORC-backed table (ORC is currently required by ACID).
   * 2. Populate 2 partitions with data.
   * 3. Compute stats.
   * 4. Insert some data into the table using the Streaming API.
   * 5. Trigger major compaction (which should update stats).
   * 6. Check that stats have been updated.
   * @throws Exception
   * todo:
   * - add a non-partitioned test
   * - add a test with a sorted table?
   */
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
//as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String tblName = "compaction_test";
String tblNameStg = tblName + "_stg";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("drop table if exists " + tblNameStg, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(bkt INT)" +
" CLUSTERED BY(a) INTO 4 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("CREATE EXTERNAL TABLE " + tblNameStg + "(a INT, b STRING)" +
" ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n'" +
" STORED AS TEXTFILE" +
" LOCATION '" + stagingFolder.newFolder().toURI().getPath() + "'", driver);
executeStatementOnDriver("load data local inpath '" + BASIC_FILE_NAME +
"' overwrite into table " + tblNameStg, driver);
execSelectAndDumpData("select * from " + tblNameStg, driver, "Dumping data for " +
tblNameStg + " after load:");
executeStatementOnDriver("FROM " + tblNameStg +
" INSERT INTO TABLE " + tblName + " PARTITION(bkt=0) " +
"SELECT a, b where a < 2", driver);
executeStatementOnDriver("FROM " + tblNameStg +
" INSERT INTO TABLE " + tblName + " PARTITION(bkt=1) " +
"SELECT a, b where a >= 2", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " +
tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
CompactionInfo ci = new CompactionInfo("default", tblName, "bkt=0", CompactionType.MAJOR);
LOG.debug("List of stats columns before analyze Part1: " + txnHandler.findColumnsWithStats(ci));
Worker.StatsUpdater su = Worker.StatsUpdater.init(ci, colNames, conf,
System.getProperty("user.name"));
    su.gatherStats(); // compute stats before compaction
    LOG.debug("List of stats columns after analyze Part1: " + txnHandler.findColumnsWithStats(ci));
    CompactionInfo ciPart2 = new CompactionInfo("default", tblName, "bkt=1", CompactionType.MAJOR);
    LOG.debug("List of stats columns before analyze Part2: " + txnHandler.findColumnsWithStats(ciPart2));
    su = Worker.StatsUpdater.init(ciPart2, colNames, conf, System.getProperty("user.name"));
    su.gatherStats(); // compute stats before compaction
    LOG.debug("List of stats columns after analyze Part2: " + txnHandler.findColumnsWithStats(ciPart2));
//now make sure we get the stats we expect for partition we are going to add data to later
Map<String, List<ColumnStatisticsObj>> stats = msClient.getPartitionColumnStatistics(ci.dbname,
ci.tableName, Arrays.asList(ci.partName), colNames);
List<ColumnStatisticsObj> colStats = stats.get(ci.partName);
Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
LongColumnStatsData colAStats = colStats.get(0).getStatsData().getLongStats();
Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
Assert.assertEquals("highValue a", 1, colAStats.getHighValue());
Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
Assert.assertEquals("numNdv a", 1, colAStats.getNumDVs());
StringColumnStatsData colBStats = colStats.get(1).getStatsData().getStringStats();
Assert.assertEquals("maxColLen b", 3, colBStats.getMaxColLen());
Assert.assertEquals("avgColLen b", 3.0, colBStats.getAvgColLen(), 0.01);
Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
Assert.assertEquals("nunDVs", 2, colBStats.getNumDVs());
//now save stats for partition we won't modify
stats = msClient.getPartitionColumnStatistics(ciPart2.dbname,
ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames);
colStats = stats.get(ciPart2.partName);
LongColumnStatsData colAStatsPart2 = colStats.get(0).getStatsData().getLongStats();
StringColumnStatsData colBStatsPart2 = colStats.get(1).getStatsData().getStringStats();
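    // Stream additional rows into partition bkt=0 through the HCatalog streaming API,
    // creating deltas that the major compaction below will merge.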
HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0"));
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
    /* The next call will eventually end up in HiveEndPoint.createPartitionIfNotExists(),
     * which runs an operation on the Driver and starts its own CliSessionState and then
     * closes it, which removes it from ThreadLocal; thus the session created in this class
     * is gone after this. I fixed it in HiveEndPoint. */
StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName());
TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
txnBatch.beginNextTransaction();
Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState());
txnBatch.write("50,Kiev".getBytes());
txnBatch.write("51,St. Petersburg".getBytes());
txnBatch.write("44,Boston".getBytes());
txnBatch.commit();
txnBatch.beginNextTransaction();
txnBatch.write("52,Tel Aviv".getBytes());
txnBatch.write("53,Atlantis".getBytes());
txnBatch.write("53,Boston".getBytes());
txnBatch.commit();
txnBatch.close();
connection.close();
execSelectAndDumpData("select * from " + ci.getFullTableName(), driver, ci.getFullTableName());
//so now we have written some new data to bkt=0 and it shows up
CompactionRequest rqst = new CompactionRequest(ci.dbname, ci.tableName, CompactionType.MAJOR);
rqst.setPartitionname(ci.partName);
txnHandler.compact(rqst);
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
AtomicBoolean looped = new AtomicBoolean();
stop.set(true);
t.init(stop, looped);
t.run();
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
    if (1 != compacts.size()) {
      Assert.fail("Expecting 1 compaction and found " + compacts.size() + ": " + compacts.toString());
    }
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName,
Arrays.asList(ci.partName), colNames);
colStats = stats.get(ci.partName);
Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
colAStats = colStats.get(0).getStatsData().getLongStats();
Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
Assert.assertEquals("highValue a", 53, colAStats.getHighValue());
Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
Assert.assertEquals("numNdv a", 6, colAStats.getNumDVs());
colBStats = colStats.get(1).getStatsData().getStringStats();
Assert.assertEquals("maxColLen b", 14, colBStats.getMaxColLen());
    // cast to long to avoid comparing against a repeating decimal
Assert.assertEquals("avgColLen b", (long)6.1111111111, (long)colBStats.getAvgColLen());
Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
Assert.assertEquals("nunDVs", 10, colBStats.getNumDVs());
//now check that stats for partition we didn't modify did not change
stats = msClient.getPartitionColumnStatistics(ciPart2.dbname, ciPart2.tableName,
Arrays.asList(ciPart2.partName), colNames);
colStats = stats.get(ciPart2.partName);
Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same",
colAStatsPart2, colStats.get(0).getStatsData().getLongStats());
Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same",
colBStatsPart2, colStats.get(1).getStatsData().getStringStats());
}
@Test
public void dynamicPartitioningInsert() throws Exception {
String tblName = "dpct";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(ds string)" +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " +
"'today'), (2, 'wilma', 'yesterday')", driver);
Initiator initiator = new Initiator();
initiator.setThreadId((int)initiator.getId());
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 0);
initiator.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
stop.set(true);
initiator.init(stop, new AtomicBoolean());
initiator.run();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(2, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
for (int i = 0; i < compacts.size(); i++) {
Assert.assertEquals("default", compacts.get(i).getDbname());
Assert.assertEquals(tblName, compacts.get(i).getTablename());
Assert.assertEquals("initiated", compacts.get(i).getState());
partNames.add(compacts.get(i).getPartitionname());
}
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=today", names.get(0));
Assert.assertEquals("ds=yesterday", names.get(1));
}
@Test
public void dynamicPartitioningUpdate() throws Exception {
String tblName = "udpct";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(ds string)" +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " +
"'today'), (2, 'wilma', 'yesterday')", driver);
executeStatementOnDriver("update " + tblName + " set b = 'barney'", driver);
Initiator initiator = new Initiator();
initiator.setThreadId((int)initiator.getId());
    // Set to 1 so the insert doesn't trigger compaction but the update does
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 1);
initiator.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
stop.set(true);
initiator.init(stop, new AtomicBoolean());
initiator.run();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(2, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
for (int i = 0; i < compacts.size(); i++) {
Assert.assertEquals("default", compacts.get(i).getDbname());
Assert.assertEquals(tblName, compacts.get(i).getTablename());
Assert.assertEquals("initiated", compacts.get(i).getState());
partNames.add(compacts.get(i).getPartitionname());
}
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=today", names.get(0));
Assert.assertEquals("ds=yesterday", names.get(1));
}
@Test
public void dynamicPartitioningDelete() throws Exception {
String tblName = "ddpct";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(ds string)" +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " +
"'today'), (2, 'wilma', 'yesterday')", driver);
executeStatementOnDriver("update " + tblName + " set b = 'fred' where a = 1", driver);
executeStatementOnDriver("delete from " + tblName + " where b = 'fred'", driver);
Initiator initiator = new Initiator();
initiator.setThreadId((int)initiator.getId());
    // Set to 2 so the insert and update don't trigger compaction but the delete does
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 2);
initiator.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
stop.set(true);
initiator.init(stop, new AtomicBoolean());
initiator.run();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(1, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
for (int i = 0; i < compacts.size(); i++) {
Assert.assertEquals("default", compacts.get(i).getDbname());
Assert.assertEquals(tblName, compacts.get(i).getTablename());
Assert.assertEquals("initiated", compacts.get(i).getState());
partNames.add(compacts.get(i).getPartitionname());
}
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=today", names.get(0));
}
@Test
public void minorCompactWhileStreaming() throws Exception {
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
// Start a third batch, but don't close it.
writeBatch(connection, writer, true);
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter);
String[] names = new String[stat.length];
Path resultFile = null;
for (int i = 0; i < names.length; i++) {
names[i] = stat[i].getPath().getName();
if (names[i].equals("delta_0000003_0000006")) {
resultFile = stat[i].getPath();
}
}
Arrays.sort(names);
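      // Delta directories are named delta_<minTxnId>_<maxTxnId>. Minor compaction merges
      // the two closed batches (delta_3_4, delta_5_6) into delta_3_6; the open batch's
      // delta_7_8 is untouched, and the pre-compaction deltas remain until the Cleaner runs.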
String[] expected = new String[]{"delta_0000003_0000004",
"delta_0000003_0000006", "delta_0000005_0000006", "delta_0000007_0000008"};
if (!Arrays.deepEquals(expected, names)) {
Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names));
}
checkExpectedTxnsPresent(null, new Path[]{resultFile},columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
} finally {
connection.close();
}
}
@Test
public void majorCompactWhileStreaming() throws Exception {
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true') ", driver);
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
// Start a third batch, but don't close it.
writeBatch(connection, writer, true);
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MAJOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter);
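      // Major compaction should produce a single base_<maxTxnId> directory covering the
      // two closed batches (txns 3-6); the still-open batch is not included.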
if (1 != stat.length) {
Assert.fail("Expecting 1 file \"base_0000006\" and found " + stat.length + " files " + Arrays.toString(stat));
}
String name = stat[0].getPath().getName();
Assert.assertEquals(name, "base_0000006");
checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
} finally {
connection.close();
}
}
@Test
public void minorCompactAfterAbort() throws Exception {
String agentInfo = "UT_" + Thread.currentThread().getName();
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
      // Start a third batch, abort both of its transactions, and never close it
TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
txnBatch.beginNextTransaction();
txnBatch.abort();
txnBatch.beginNextTransaction();
txnBatch.abort();
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter);
String[] names = new String[stat.length];
Path resultDelta = null;
for (int i = 0; i < names.length; i++) {
names[i] = stat[i].getPath().getName();
if (names[i].equals("delta_0000003_0000006")) {
resultDelta = stat[i].getPath();
}
}
Arrays.sort(names);
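      // The aborted batch never wrote any rows, so there is no delta_0000007_0000008;
      // minor compaction merges the two committed batches into delta_0000003_0000006.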
String[] expected = new String[]{"delta_0000003_0000004",
"delta_0000003_0000006", "delta_0000005_0000006"};
if (!Arrays.deepEquals(expected, names)) {
Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names));
}
checkExpectedTxnsPresent(null, new Path[]{resultDelta}, columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
} finally {
connection.close();
}
}
@Test
public void majorCompactAfterAbort() throws Exception {
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
      // Start a third batch, abort both of its transactions, and don't close it.
TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
txnBatch.beginNextTransaction();
txnBatch.abort();
txnBatch.beginNextTransaction();
txnBatch.abort();
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MAJOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter);
      if (1 != stat.length) {
        Assert.fail("Expecting 1 file \"base_0000006\" and found " + stat.length + " files " + Arrays.toString(stat));
      }
String name = stat[0].getPath().getName();
if (!name.equals("base_0000006")) {
Assert.fail("majorCompactAfterAbort name " + name + " not equals to base_0000006");
}
checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
} finally {
connection.close();
}
}
@Test
public void majorCompactWhileStreamingForSplitUpdate() throws Exception {
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true', "
+ "'transactional_properties'='default') ", driver); // this turns on split-update U=D+I
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
// Start a third batch, but don't close it.
writeBatch(connection, writer, true);
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MAJOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.baseFileFilter);
if (1 != stat.length) {
Assert.fail("Expecting 1 file \"base_0000006\" and found " + stat.length + " files " + Arrays.toString(stat));
}
String name = stat[0].getPath().getName();
Assert.assertEquals(name, "base_0000006");
checkExpectedTxnsPresent(stat[0].getPath(), null, columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
} finally {
connection.close();
}
}
@Test
public void testMinorCompactionForSplitUpdateWithInsertsAndDeletes() throws Exception {
String agentInfo = "UT_" + Thread.currentThread().getName();
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true',"
+ "'transactional_properties'='default')", driver);
// Insert some data -> this will generate only insert deltas and no delete deltas: delta_3_3
executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(1, 'foo')", driver);
// Insert some data -> this will again generate only insert deltas and no delete deltas: delta_4_4
executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(2, 'bar')", driver);
// Delete some data -> this will generate only delete deltas and no insert deltas: delete_delta_5_5
executeStatementOnDriver("DELETE FROM " + tblName +" WHERE a = 2", driver);
    // Now, compact. Minor compaction produces a single txn range covering both the deltas
    // and the delete deltas: they are compacted into delta_3_5 and delete_delta_3_5
    // respectively, even though there are only two insert deltas (delta_3_3, delta_4_4)
    // and one delete delta (delete_delta_5_5).
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
// Verify that we have got correct set of deltas.
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter);
String[] deltas = new String[stat.length];
Path minorCompactedDelta = null;
for (int i = 0; i < deltas.length; i++) {
deltas[i] = stat[i].getPath().getName();
if (deltas[i].equals("delta_0000003_0000005")) {
minorCompactedDelta = stat[i].getPath();
}
}
Arrays.sort(deltas);
String[] expectedDeltas = new String[]{"delta_0000003_0000003_0000", "delta_0000003_0000005", "delta_0000004_0000004_0000"};
if (!Arrays.deepEquals(expectedDeltas, deltas)) {
Assert.fail("Expected: " + Arrays.toString(expectedDeltas) + ", found: " + Arrays.toString(deltas));
}
checkExpectedTxnsPresent(null, new Path[]{minorCompactedDelta}, columnNamesProperty, columnTypesProperty, 0, 3L, 4L);
// Verify that we have got correct set of delete_deltas.
FileStatus[] deleteDeltaStat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter);
String[] deleteDeltas = new String[deleteDeltaStat.length];
Path minorCompactedDeleteDelta = null;
for (int i = 0; i < deleteDeltas.length; i++) {
deleteDeltas[i] = deleteDeltaStat[i].getPath().getName();
if (deleteDeltas[i].equals("delete_delta_0000003_0000005")) {
minorCompactedDeleteDelta = deleteDeltaStat[i].getPath();
}
}
Arrays.sort(deleteDeltas);
String[] expectedDeleteDeltas = new String[]{"delete_delta_0000003_0000005", "delete_delta_0000005_0000005_0000"};
if (!Arrays.deepEquals(expectedDeleteDeltas, deleteDeltas)) {
Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas));
}
checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 4L, 4L);
}
@Test
public void testMinorCompactionForSplitUpdateWithOnlyInserts() throws Exception {
String agentInfo = "UT_" + Thread.currentThread().getName();
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true',"
+ "'transactional_properties'='default')", driver);
// Insert some data -> this will generate only insert deltas and no delete deltas: delta_1_1
executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(1, 'foo')", driver);
// Insert some data -> this will again generate only insert deltas and no delete deltas: delta_2_2
executeStatementOnDriver("INSERT INTO " + tblName +"(a,b) VALUES(2, 'bar')", driver);
// Now, compact
    // One important thing to note in this test is that minor compaction always produces
    // a delta_x_y and a counterpart delete_delta_x_y, even when there are no delete events.
    // This choice was made to simplify the processing in AcidUtils.getAcidState().
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
// Verify that we have got correct set of deltas.
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter);
String[] deltas = new String[stat.length];
Path minorCompactedDelta = null;
for (int i = 0; i < deltas.length; i++) {
deltas[i] = stat[i].getPath().getName();
if (deltas[i].equals("delta_0000003_0000004")) {
minorCompactedDelta = stat[i].getPath();
}
}
Arrays.sort(deltas);
String[] expectedDeltas = new String[]{"delta_0000003_0000003_0000", "delta_0000003_0000004", "delta_0000004_0000004_0000"};
if (!Arrays.deepEquals(expectedDeltas, deltas)) {
Assert.fail("Expected: " + Arrays.toString(expectedDeltas) + ", found: " + Arrays.toString(deltas));
}
checkExpectedTxnsPresent(null, new Path[]{minorCompactedDelta}, columnNamesProperty, columnTypesProperty, 0, 3L, 4L);
// Verify that we have got correct set of delete_deltas.
FileStatus[] deleteDeltaStat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter);
String[] deleteDeltas = new String[deleteDeltaStat.length];
Path minorCompactedDeleteDelta = null;
for (int i = 0; i < deleteDeltas.length; i++) {
deleteDeltas[i] = deleteDeltaStat[i].getPath().getName();
if (deleteDeltas[i].equals("delete_delta_0000003_0000004")) {
minorCompactedDeleteDelta = deleteDeltaStat[i].getPath();
}
}
Arrays.sort(deleteDeltas);
String[] expectedDeleteDeltas = new String[]{"delete_delta_0000003_0000004"};
if (!Arrays.deepEquals(expectedDeleteDeltas, deleteDeltas)) {
Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas));
}
// There should be no rows in the delete_delta because there have been no delete events.
checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 0L, 0L);
}
@Test
public void minorCompactWhileStreamingWithSplitUpdate() throws Exception {
String dbName = "default";
String tblName = "cws";
List<String> colNames = Arrays.asList("a", "b");
String columnNamesProperty = "a,b";
String columnTypesProperty = "int:string";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 1 BUCKETS" + //currently ACID requires table to be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true',"
+ "'transactional_properties'='default')", driver);
HiveEndPoint endPt = new HiveEndPoint(null, dbName, tblName, null);
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] {"a","b"},",", endPt);
StreamingConnection connection = endPt.newConnection(false, "UT_" + Thread.currentThread().getName());
try {
// Write a couple of batches
for (int i = 0; i < 2; i++) {
writeBatch(connection, writer, false);
}
// Start a third batch, but don't close it.
writeBatch(connection, writer, true);
// Now, compact
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
txnHandler.compact(new CompactionRequest(dbName, tblName, CompactionType.MINOR));
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean(true);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
// Find the location of the table
IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
Table table = msClient.getTable(dbName, tblName);
FileSystem fs = FileSystem.get(conf);
FileStatus[] stat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deltaFileFilter);
String[] names = new String[stat.length];
Path resultFile = null;
for (int i = 0; i < names.length; i++) {
names[i] = stat[i].getPath().getName();
if (names[i].equals("delta_0000003_0000006")) {
resultFile = stat[i].getPath();
}
}
Arrays.sort(names);
String[] expected = new String[]{"delta_0000003_0000004",
"delta_0000003_0000006", "delta_0000005_0000006", "delta_0000007_0000008"};
if (!Arrays.deepEquals(expected, names)) {
Assert.fail("Expected: " + Arrays.toString(expected) + ", found: " + Arrays.toString(names));
}
checkExpectedTxnsPresent(null, new Path[]{resultFile},columnNamesProperty, columnTypesProperty, 0, 3L, 6L);
// Verify that we have got correct set of delete_deltas also
FileStatus[] deleteDeltaStat =
fs.listStatus(new Path(table.getSd().getLocation()), AcidUtils.deleteEventDeltaDirFilter);
String[] deleteDeltas = new String[deleteDeltaStat.length];
Path minorCompactedDeleteDelta = null;
for (int i = 0; i < deleteDeltas.length; i++) {
deleteDeltas[i] = deleteDeltaStat[i].getPath().getName();
if (deleteDeltas[i].equals("delete_delta_0000003_0000006")) {
minorCompactedDeleteDelta = deleteDeltaStat[i].getPath();
}
}
Arrays.sort(deleteDeltas);
String[] expectedDeleteDeltas = new String[]{"delete_delta_0000003_0000006"};
if (!Arrays.deepEquals(expectedDeleteDeltas, deleteDeltas)) {
Assert.fail("Expected: " + Arrays.toString(expectedDeleteDeltas) + ", found: " + Arrays.toString(deleteDeltas));
}
// There should be no rows in the delete_delta because there have been no delete events.
checkExpectedTxnsPresent(null, new Path[]{minorCompactedDeleteDelta}, columnNamesProperty, columnTypesProperty, 0, 0L, 0L);
} finally {
connection.close();
}
}
  /**
   * Users can specify compaction-related tblproperties either in the CREATE TABLE
   * statement or in the ALTER TABLE ... COMPACT statement. This tests both cases.
   * @throws Exception
   */
@Test
public void testTableProperties() throws Exception {
String tblName1 = "ttp1"; // plain acid table
String tblName2 = "ttp2"; // acid table with customized tblproperties
executeStatementOnDriver("drop table if exists " + tblName1, driver);
executeStatementOnDriver("drop table if exists " + tblName2, driver);
executeStatementOnDriver("CREATE TABLE " + tblName1 + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("CREATE TABLE " + tblName2 + "(a INT, b STRING) " +
" CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES (" +
"'transactional'='true'," +
"'compactor.mapreduce.map.memory.mb'='2048'," + // 2048 MB memory for compaction map job
"'compactorthreshold.hive.compactor.delta.num.threshold'='4'," + // minor compaction if more than 4 delta dirs
"'compactorthreshold.hive.compactor.delta.pct.threshold'='0.49'" + // major compaction if more than 49%
")", driver);
// Insert 5 rows to both tables
executeStatementOnDriver("insert into " + tblName1 + " values (1, 'a')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (2, 'b')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (3, 'c')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (4, 'd')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (5, 'e')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (1, 'a')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (2, 'b')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (3, 'c')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (4, 'd')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (5, 'e')", driver);
runInitiator(conf);
// Compactor should only schedule compaction for ttp2 (delta.num.threshold=4), not ttp1
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(1, rsp.getCompacts().size());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(CompactionType.MAJOR, rsp.getCompacts().get(0).getType()); // type is MAJOR since there's no base yet
// Finish the scheduled compaction for ttp2, and manually compact ttp1, to make them comparable again
executeStatementOnDriver("alter table " + tblName1 + " compact 'major'", driver);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(2, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp1", rsp.getCompacts().get(1).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(1).getState());
// compact ttp2, by running the Worker explicitly, in order to get the reference to the compactor MR job
AtomicBoolean stop = new AtomicBoolean(true);
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
JobConf job = t.getMrJob();
Assert.assertEquals("2048", job.get("mapreduce.map.memory.mb")); // 2048 comes from tblproperties
// Compact ttp1
stop = new AtomicBoolean(true);
t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
job = t.getMrJob();
Assert.assertEquals("1024", job.get("mapreduce.map.memory.mb")); // 1024 is the default value
// Clean up
runCleaner(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(2, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp1", rsp.getCompacts().get(1).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(1).getState());
    // Insert one more row into each table - this should cause hive.compactor.delta.pct.threshold
    // to be exceeded for ttp2
executeStatementOnDriver("insert into " + tblName1 + " values (6, 'f')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (6, 'f')", driver);
    // Intentionally set the global pct threshold high so that it will not trigger major
    // compaction for ttp1. ttp2 still gets major compaction (table-level
    // delta.pct.threshold=0.49) because of the newly inserted row (actual pct: 0.66).
conf.setFloatVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_PCT_THRESHOLD, 0.8f);
runInitiator(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(3, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
// Finish the scheduled compaction for ttp2
runWorker(conf);
runCleaner(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(3, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(0).getState());
// Now test tblproperties specified on ALTER TABLE .. COMPACT .. statement
executeStatementOnDriver("insert into " + tblName2 + " values (7, 'g')", driver);
executeStatementOnDriver("alter table " + tblName2 + " compact 'major'" +
" with overwrite tblproperties (" +
"'compactor.mapreduce.map.memory.mb'='3072'," +
"'tblprops.orc.compress.size'='8192')", driver);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(4, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
// Run the Worker explicitly, in order to get the reference to the compactor MR job
stop = new AtomicBoolean(true);
t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
job = t.getMrJob();
Assert.assertEquals("3072", job.get("mapreduce.map.memory.mb"));
Assert.assertTrue(job.get("hive.compactor.table.props").contains("orc.compress.size4:8192"));
}
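  /**
   * Writes a streaming transaction batch of two transactions with three rows each.
   * If {@code closeEarly} is true, only the first transaction is committed and the
   * batch is left open, simulating a client that is still streaming.
   */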
private void writeBatch(StreamingConnection connection, DelimitedInputWriter writer,
boolean closeEarly)
throws InterruptedException, StreamingException {
TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
txnBatch.beginNextTransaction();
txnBatch.write("50,Kiev".getBytes());
txnBatch.write("51,St. Petersburg".getBytes());
txnBatch.write("44,Boston".getBytes());
txnBatch.commit();
if (!closeEarly) {
txnBatch.beginNextTransaction();
txnBatch.write("52,Tel Aviv".getBytes());
txnBatch.write("53,Atlantis".getBytes());
txnBatch.write("53,Boston".getBytes());
txnBatch.commit();
txnBatch.close();
}
}
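  /**
   * Reads the given base/deltas through the ACID raw reader using a fully permissive
   * ValidTxnList and asserts that the record transaction ids cover exactly the
   * contiguous range [min, max] for the given bucket.
   */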
private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty,
String columnTypesProperty, int bucket, long min, long max)
throws IOException {
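    // A permissive ValidTxnList stub: every transaction is reported valid, so the raw
    // reader returns every record in the given files regardless of transaction state.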
ValidTxnList txnList = new ValidTxnList() {
@Override
public boolean isTxnValid(long txnid) {
return true;
}
@Override
public RangeResponse isTxnRangeValid(long minTxnId, long maxTxnId) {
return RangeResponse.ALL;
}
@Override
public String writeToString() {
return "";
}
@Override
public void readFromString(String src) {
}
@Override
public long getHighWatermark() {
return Long.MAX_VALUE;
}
@Override
public long[] getInvalidTransactions() {
return new long[0];
}
@Override
public boolean isValidBase(long txnid) {
return true;
}
@Override
public boolean isTxnAborted(long txnid) {
return true;
}
@Override
public RangeResponse isTxnRangeAborted(long minTxnId, long maxTxnId) {
return RangeResponse.ALL;
}
};
OrcInputFormat aif = new OrcInputFormat();
Configuration conf = new Configuration();
conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNamesProperty);
conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypesProperty);
HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
AcidInputFormat.RawReader<OrcStruct> reader =
aif.getRawReader(conf, false, bucket, txnList, base, deltas);
RecordIdentifier identifier = reader.createKey();
OrcStruct value = reader.createValue();
long currentTxn = min;
boolean seenCurrentTxn = false;
while (reader.next(identifier, value)) {
if (!seenCurrentTxn) {
Assert.assertEquals(currentTxn, identifier.getTransactionId());
seenCurrentTxn = true;
}
if (currentTxn != identifier.getTransactionId()) {
Assert.assertEquals(currentTxn + 1, identifier.getTransactionId());
currentTxn++;
}
}
Assert.assertEquals(max, currentTxn);
}
  /**
   * Convenience method to execute a SELECT statement and dump the results to the log file.
   */
private static List<String> execSelectAndDumpData(String selectStmt, Driver driver, String msg)
throws Exception {
executeStatementOnDriver(selectStmt, driver);
ArrayList<String> valuesReadFromHiveDriver = new ArrayList<String>();
driver.getResults(valuesReadFromHiveDriver);
int rowIdx = 0;
LOG.debug(msg);
for(String row : valuesReadFromHiveDriver) {
LOG.debug(" rowIdx=" + rowIdx++ + ":" + row);
}
return valuesReadFromHiveDriver;
}
/**
* Execute Hive CLI statement
* @param cmd arbitrary statement to execute
*/
static void executeStatementOnDriver(String cmd, Driver driver) throws IOException, CommandNeedRetryException {
LOG.debug("Executing: " + cmd);
CommandProcessorResponse cpr = driver.run(cmd);
if(cpr.getResponseCode() != 0) {
throw new IOException("Failed to execute \"" + cmd + "\". Driver returned: " + cpr);
}
}
static void createTestDataFile(String filename, String[] lines) throws IOException {
FileWriter writer = null;
try {
File file = new File(filename);
file.deleteOnExit();
writer = new FileWriter(file);
for (String line : lines) {
writer.write(line + "\n");
}
} finally {
if (writer != null) {
writer.close();
}
}
}
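  // The run* helpers below execute a single pass of the corresponding compactor thread:
  // with the stop flag pre-set to true, run() performs one iteration and returns
  // instead of looping forever.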
static void runInitiator(HiveConf hiveConf) throws MetaException {
AtomicBoolean stop = new AtomicBoolean(true);
Initiator t = new Initiator();
t.setThreadId((int) t.getId());
t.setHiveConf(hiveConf);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
}
static void runWorker(HiveConf hiveConf) throws MetaException {
AtomicBoolean stop = new AtomicBoolean(true);
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(hiveConf);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
}
static void runCleaner(HiveConf hiveConf) throws MetaException {
AtomicBoolean stop = new AtomicBoolean(true);
Cleaner t = new Cleaner();
t.setThreadId((int) t.getId());
t.setHiveConf(hiveConf);
AtomicBoolean looped = new AtomicBoolean();
t.init(stop, looped);
t.run();
}
}