/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.sqoop.manager.oracle; import java.io.IOException; import java.sql.Connection; import java.sql.SQLException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import com.cloudera.sqoop.lib.SqoopRecord; import com.cloudera.sqoop.mapreduce.db.DBConfiguration; import com.cloudera.sqoop.mapreduce.db.DataDrivenDBInputFormat; /** * Reads data from Oracle table - data is divided between mappers based on ROWID * split. * * @param <T> Output type of the record reader */ public class OraOopDataDrivenDBInputFormat<T extends SqoopRecord> extends DataDrivenDBInputFormat<T> implements Configurable { public static final OraOopLog LOG = OraOopLogFactory .getLog(OraOopDataDrivenDBInputFormat.class.getName()); public OraOopDataDrivenDBInputFormat() { super(); OraOopUtilities.checkJavaSecurityEgd(); } @Override public List<InputSplit> getSplits(JobContext jobContext) throws IOException { int desiredNumberOfMappers = getDesiredNumberOfMappers(jobContext); // Resolve the Oracle owner and name of the table we're importing... OracleTable table = identifyOracleTableFromJobContext(jobContext); List<String> partitionList = getPartitionList(jobContext); // Get our Oracle connection... Connection connection = getConnection(); List<InputSplit> splits = null; try { OracleConnectionFactory.initializeOracleConnection(connection, getConf()); // The number of chunks generated will *not* be a multiple of the number // of splits, // to ensure that each split doesn't always get data from the start of // each data-file... int numberOfChunksPerOracleDataFile = (desiredNumberOfMappers * 2) + 1; // Get the Oracle data-chunks for the table... List<? extends OraOopOracleDataChunk> dataChunks; if (OraOopUtilities.getOraOopOracleDataChunkMethod(getConf()).equals( OraOopConstants.OraOopOracleDataChunkMethod.PARTITION)) { dataChunks = OraOopOracleQueries.getOracleDataChunksPartition(connection, table, partitionList); } else { dataChunks = OraOopOracleQueries.getOracleDataChunksExtent(jobContext .getConfiguration(), connection, table, partitionList, numberOfChunksPerOracleDataFile); } if (dataChunks.size() == 0) { String errMsg; if (OraOopUtilities.getOraOopOracleDataChunkMethod(getConf()).equals( OraOopConstants.OraOopOracleDataChunkMethod.PARTITION)) { errMsg = String .format( "The table %s does not contain any partitions and you " + "have specified to chunk the table by partitions.", table.getName()); } else { errMsg = String.format("The table %s does not contain any data.", table .getName()); } LOG.fatal(errMsg); throw new RuntimeException(errMsg); } else { OraOopConstants.OraOopOracleBlockToSplitAllocationMethod blockAllocationMethod = OraOopUtilities .getOraOopOracleBlockToSplitAllocationMethod( jobContext.getConfiguration(), OraOopConstants. OraOopOracleBlockToSplitAllocationMethod.ROUNDROBIN); // Group the Oracle data-chunks into splits... splits = groupTableDataChunksIntoSplits(dataChunks, desiredNumberOfMappers, blockAllocationMethod); String oraoopLocations = jobContext.getConfiguration().get("oraoop.locations", ""); String[] locations = oraoopLocations.split(","); for (int idx = 0; idx < locations.length; idx++) { if (idx < splits.size()) { String location = locations[idx].trim(); if (!location.isEmpty()) { ((OraOopDBInputSplit) splits.get(idx)).setSplitLocation(location); LOG.info(String .format("Split[%d] has been assigned location \"%s\".", idx, location)); } } } } connection.commit(); } catch (SQLException ex) { try { connection.rollback(); } catch (SQLException e) { LOG.error("Cannot rollback transaction.", e); } throw new IOException(ex); } finally { closeConnection(); } return splits; } @Override protected RecordReader<LongWritable, T> createDBRecordReader( DBInputSplit split, Configuration conf) throws IOException { // This code is now running on a Datanode in the Hadoop cluster, so we need // to enable debug logging in this JVM... OraOopUtilities.enableDebugLoggingIfRequired(conf); // Retrieve the JDBC URL that should be used by this mapper. // We achieve this by modifying the JDBC URL property in the configuration, // prior to the // OraOopDBRecordReader (or its ancestors) using the configuration to // establish a connection // to the database - via DBConfiguration.getConnection()... OraOopDBInputSplit oraOopSplit = OraOopDBRecordReader.castSplit(split); int mapperId = oraOopSplit.getSplitId(); String mapperJdbcUrlPropertyName = OraOopUtilities.getMapperJdbcUrlPropertyName(mapperId, conf); // Get this mapper's JDBC URL String mapperJdbcUrl = conf.get(mapperJdbcUrlPropertyName, null); LOG.debug(String.format("Mapper %d has a JDBC URL of: %s", mapperId, mapperJdbcUrl == null ? "<null>" : mapperJdbcUrl)); DBConfiguration dbConf = getDBConf(); if (mapperJdbcUrl != null) { // Just changing the URL_PROPERTY in the conf object does not work - as // dbConf.getConf() // seems to refer to a separate instance of the configuration properties. // Therefore, we // need to update the URL_PROPERTY in dbConf so that we connect to the // appropriate instance // in the Oracle RAC. To help avoid confusion, we'll also update the // URL_PROPERTY in the // conf object to match... dbConf.getConf().set(DBConfiguration.URL_PROPERTY, mapperJdbcUrl); conf.set(DBConfiguration.URL_PROPERTY, mapperJdbcUrl); } @SuppressWarnings("unchecked") Class<T> inputClass = (Class<T>) (dbConf.getInputClass()); try { // Use Oracle-specific db reader // this.getConnection() will return the connection created when the // DBInputFormat ancestor // was created. This connection will be based on the URL_PROPERTY that was // current at that // time. We've just changed the URL_PROPERTY (if this is an Oracle RAC) // and therefore need // to use dbConf.getConnection() so that a new connection is created using // the current // value of the URL_PROPERTY... return new OraOopDBRecordReader<T>(split, inputClass, conf, getConnection(), dbConf, dbConf.getInputConditions(), dbConf .getInputFieldNames(), dbConf.getInputTableName()); } catch (SQLException ex) { throw new IOException(ex); } } private OracleTable identifyOracleTableFromJobContext(JobContext jobContext) { OracleTable result = new OracleTable(); String dbUserName = jobContext.getConfiguration().get(DBConfiguration.USERNAME_PROPERTY); String tableName = getDBConf().getInputTableName(); result = OraOopUtilities.decodeOracleTableName(dbUserName, tableName, jobContext .getConfiguration()); return result; } private int getDesiredNumberOfMappers(JobContext jobContext) { int desiredNumberOfMappers = jobContext.getConfiguration().getInt( OraOopConstants.ORAOOP_DESIRED_NUMBER_OF_MAPPERS, -1); int minMappersAcceptedByOraOop = OraOopUtilities.getMinNumberOfImportMappersAcceptedByOraOop(jobContext .getConfiguration()); if (desiredNumberOfMappers < minMappersAcceptedByOraOop) { LOG.warn(String.format("%s should not be used to perform a sqoop import " + "when the number of mappers is %d\n " + "i.e. OraOopManagerFactory.accept() should only appect jobs " + "where the number of mappers is at least %d", OraOopConstants.ORAOOP_PRODUCT_NAME, desiredNumberOfMappers, minMappersAcceptedByOraOop)); } return desiredNumberOfMappers; } private List<String> getPartitionList(JobContext jobContext) { LOG.debug(OraOopConstants.ORAOOP_IMPORT_PARTITION_LIST + " = " + jobContext.getConfiguration().get( OraOopConstants.ORAOOP_IMPORT_PARTITION_LIST)); List<String> result = OraOopUtilities.splitOracleStringList(jobContext.getConfiguration() .get(OraOopConstants.ORAOOP_IMPORT_PARTITION_LIST)); if (result != null && result.size() > 0) { LOG.debug("Partition filter list: " + result.toString()); } return result; } protected List<InputSplit> groupTableDataChunksIntoSplits( List<? extends OraOopOracleDataChunk> dataChunks, int desiredNumberOfSplits, OraOopConstants.OraOopOracleBlockToSplitAllocationMethod blockAllocationMethod) { int numberOfDataChunks = dataChunks.size(); int actualNumberOfSplits = Math.min(numberOfDataChunks, desiredNumberOfSplits); int totalNumberOfBlocksInAllDataChunks = 0; for (OraOopOracleDataChunk dataChunk : dataChunks) { totalNumberOfBlocksInAllDataChunks += dataChunk.getNumberOfBlocks(); } String debugMsg = String.format( "The table being imported by sqoop has %d blocks " + "that have been divided into %d chunks " + "which will be processed in %d splits. " + "The chunks will be allocated to the splits using the method : %s", totalNumberOfBlocksInAllDataChunks, numberOfDataChunks, actualNumberOfSplits, blockAllocationMethod.toString()); LOG.info(debugMsg); List<InputSplit> splits = new ArrayList<InputSplit>(actualNumberOfSplits); for (int i = 0; i < actualNumberOfSplits; i++) { OraOopDBInputSplit split = new OraOopDBInputSplit(); split.setSplitId(i); split.setTotalNumberOfBlocksInAllSplits( totalNumberOfBlocksInAllDataChunks); splits.add(split); } switch (blockAllocationMethod) { case RANDOM: // Randomize the order of the data chunks and then "fall through" into // the ROUNDROBIN block below... Collections.shuffle(dataChunks); // NB: No "break;" statement here - we're intentionally falling into the // ROUNDROBIN block below... //$FALL-THROUGH$ case ROUNDROBIN: int idxSplitRoundRobin = 0; for (OraOopOracleDataChunk dataChunk : dataChunks) { if (idxSplitRoundRobin >= splits.size()) { idxSplitRoundRobin = 0; } OraOopDBInputSplit split = (OraOopDBInputSplit) splits.get(idxSplitRoundRobin++); split.getDataChunks().add(dataChunk); } break; case SEQUENTIAL: double dataChunksPerSplit = dataChunks.size() / (double) splits.size(); int dataChunksAllocatedToSplits = 0; int idxSplitSeq = 0; for (OraOopOracleDataChunk dataChunk : dataChunks) { OraOopDBInputSplit split = (OraOopDBInputSplit) splits.get(idxSplitSeq); split.getDataChunks().add(dataChunk); dataChunksAllocatedToSplits++; if (dataChunksAllocatedToSplits >= (dataChunksPerSplit * (idxSplitSeq + 1)) && idxSplitSeq < splits.size()) { idxSplitSeq++; } } break; default: throw new RuntimeException("Block allocation method not implemented."); } if (LOG.isDebugEnabled()) { for (int idx = 0; idx < splits.size(); idx++) { LOG.debug("\n\t" + ((OraOopDBInputSplit) splits.get(idx)).getDebugDetails()); } } return splits; } }