package com.splout.db.hadoop; /* * #%L * Splout SQL Hadoop library * %% * Copyright (C) 2012 Datasalt Systems S.L. * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.datasalt.pangool.io.Fields; import com.datasalt.pangool.io.Schema; import com.datasalt.pangool.tuplemr.OrderBy; import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat; import com.splout.db.engine.DefaultEngine; import com.splout.db.hadoop.TableBuilder.TableBuilderException; import com.splout.db.hadoop.TablespaceBuilder.TablespaceBuilderException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * A JSON-friendly bean that can map to a {@link TablespaceSpec} easily. It only supports text input. Use * {@link #build()} for obtaining a TablespaceSpec instance. * <p/> * For specifying compound indexes use "," separator in the string index definition. */ public class JSONTablespaceDefinition { private final static Log log = LogFactory.getLog(JSONTablespaceDefinition.class); private String engine; private String name; protected int nPartitions; protected List<JSONTableDefinition> partitionedTables; protected List<JSONTableDefinition> replicateAllTables = new ArrayList<JSONTableDefinition>(); /* * Inner static method for converting a {@link JSONTableDefinition} into a {@link Table} bean through {@link * TableBuilder}. */ protected static Table buildTable(JSONTableDefinition table, boolean isReplicateAll, Configuration hadoopConf) throws TableBuilderException, IOException { if (table.getName() == null) { throw new IllegalArgumentException("Must provide a name for all tables."); } if (!isReplicateAll && (table.getPartitionFields() == null || table.getPartitionFields() == null || table .getPartitionFields().length() == 0)) { throw new IllegalArgumentException("Partitioned table must be partitioned by some field."); } if (table.getTableInputs() == null || table.getTableInputs().size() == 0) { throw new IllegalArgumentException("Table must have some table inputs."); } // First we look to see if there is any HIVE or CASCADING tableInputs // So we can instantiate the TableBuilder in one way or another (explicit / implicit schema). boolean implicitSchema = false; for (JSONTableInputDefinition tableInput : table.getTableInputs()) { if (tableInput.getInputType().equals(InputType.HIVE) || tableInput.getInputType().equals(InputType.CASCADING)) { implicitSchema = true; break; } } TableBuilder tableBuilder; Schema schema = null; if (table.getSchema() != null) { if (implicitSchema) { tableBuilder = new TableBuilder(table.getName(), hadoopConf); log.warn("Ignoring explicit schema declaration in tablespace descriptor as there are other implicit schema input sources (HIVE; CASCADING)"); } else { schema = new Schema(table.getName(), Fields.parse(table.getSchema())); tableBuilder = new TableBuilder(schema); } } else { tableBuilder = new TableBuilder(table.getName(), hadoopConf); } for (String index : table.getIndexes()) { tableBuilder.createIndex(index.split(",")); } if (!isReplicateAll) { tableBuilder.partitionBy(table.getPartitionFields().split(",")); } // Adding pre and post SQL if (table.getInitialStatements().size() != 0) { tableBuilder.preInsertsSQL(table.getInitialStatements().toArray(new String[0])); } if (table.getPreInsertStatements().size() != 0) { tableBuilder.preInsertsSQL(table.getPreInsertStatements().toArray(new String[0])); } if (table.getPostInsertStatements().size() != 0) { tableBuilder.preInsertsSQL(table.getPostInsertStatements().toArray(new String[0])); } if (table.getFinalStatements().size() != 0) { tableBuilder.finalSQL(table.getFinalStatements().toArray(new String[0])); } for (JSONTableInputDefinition tableInput : table.getTableInputs()) { TextInputSpecs specs = tableInput.getInputSpecs(); if (specs == null) { specs = new TextInputSpecs(); // default specs (tabulated file) } if (!tableInput.getInputType().equals(InputType.HIVE) && (tableInput.getPaths() == null || tableInput.getPaths().size() == 0)) { throw new IllegalArgumentException("All table inputs except HIVE must have input paths."); } if (tableInput.getInputType().equals(InputType.TEXT)) { for (String file : tableInput.getPaths()) { // Text file - like in SimpleGeneratorCMD, need a Pangool schema for parsing it if (schema == null) { throw new IllegalArgumentException( "A Pangool schema must be provided when using InputType = TEXT"); } if (specs.getFixedWidthFields() != null) { // Fixed width fields definition. int[] fieldsArr = new int[specs.getFixedWidthFields().size()]; for (int i = 0; i < fieldsArr.length; i++) { fieldsArr[i] = specs.getFixedWidthFields().get(i); } tableBuilder.addFixedWidthTextFile(new Path(file), schema, fieldsArr, specs.isSkipHeader(), specs.getNullString(), null); } else { // CSV definition tableBuilder.addCSVTextFile(file, specs.getSeparatorChar(), specs.getQuotesChar(), specs.getEscapeChar(), specs.isSkipHeader(), specs.isStrictQuotes(), specs.getNullString()); } } } else if (tableInput.getInputType().equals(InputType.TUPLE)) { for (String file : tableInput.getPaths()) { // Pangool Tuple file tableBuilder.addTupleFile(new Path(file)); } } else if (tableInput.getInputType().equals(InputType.CASCADING)) { for (String file : tableInput.getPaths()) { // Cascading Tuple file if (tableInput.getCascadingColumns() == null) { throw new IllegalArgumentException( "Comma-separated column names (property cascadingColumns) must be specified when using InputType = CASCADING."); } tableBuilder.addCascadingTable(new Path(file), tableInput.getCascadingColumns().split(",")); } } else if (tableInput.getInputType().equals(InputType.HIVE)) { if (tableInput.getHiveDbName() == null || tableInput.getHiveTableName() == null) { throw new IllegalArgumentException( "hiveDbName and hiveTableName properties must be specified when using InputType = HIVE."); } tableBuilder.addHiveTable(tableInput.getHiveDbName(), tableInput.getHiveTableName()); } } if (table.getInsertionOrderBy() != null) { tableBuilder.insertionSortOrder(OrderBy.parse(table.getInsertionOrderBy())); } if (isReplicateAll) { tableBuilder.replicateToAll(); } return tableBuilder.build(); } /** * Use this method for obtaining a {@link TablespaceSpec} through {@link TablespaceBuilder}. * * @throws IOException */ public TablespaceSpec build(Configuration hadoopConf) throws TablespaceBuilderException, TableBuilderException, IOException { TablespaceBuilder builder = new TablespaceBuilder(); builder.setNPartitions(nPartitions); if (partitionedTables == null) { throw new IllegalArgumentException("Can't build a " + TablespaceSpec.class.getName() + " without any partitioned table."); } if (name == null) { throw new IllegalArgumentException("Must provide a name for the Tablespace."); } if (engine == null) { engine = DefaultEngine.class.getName(); } builder.setEngineClassName(engine); for (JSONTableDefinition table : partitionedTables) { builder.add(buildTable(table, false, hadoopConf)); } for (JSONTableDefinition table : replicateAllTables) { builder.add(buildTable(table, true, hadoopConf)); } return builder.build(); } public static class JSONTableDefinition { private String name; private List<JSONTableInputDefinition> tableInputs; private String schema; private String partitionFields; private String insertionOrderBy; private List<String> indexes = new ArrayList<String>(); private List<String> initialStatements = new ArrayList<String>(); private List<String> preInsertStatements = new ArrayList<String>(); private List<String> postInsertStatements = new ArrayList<String>(); private List<String> finalStatements = new ArrayList<String>(); public List<JSONTableInputDefinition> getTableInputs() { return tableInputs; } public void setTableInputs(List<JSONTableInputDefinition> tableInputs) { this.tableInputs = tableInputs; } public String getSchema() { return schema; } public void setSchema(String schema) { this.schema = schema; } public String getPartitionFields() { return partitionFields; } public void setPartitionFields(String partitionFields) { this.partitionFields = partitionFields; } public List<String> getIndexes() { return indexes; } public void setIndexes(List<String> indexes) { this.indexes = indexes; } public List<String> getInitialStatements() { return initialStatements; } public void setInitialStatements(List<String> initialStatements) { this.initialStatements = initialStatements; } public List<String> getPostInsertStatements() { return postInsertStatements; } public void setPostInsertStatements(List<String> postInsertStatements) { this.postInsertStatements = postInsertStatements; } public String getName() { return name; } public void setName(String name) { this.name = name; } public List<String> getPreInsertStatements() { return preInsertStatements; } public void setPreInsertStatements(List<String> preInsertStatements) { this.preInsertStatements = preInsertStatements; } public List<String> getFinalStatements() { return finalStatements; } public void setFinalStatements(List<String> finalStatements) { this.finalStatements = finalStatements; } String getInsertionOrderBy() { return insertionOrderBy; } void setInsertionOrderBy(String insertionOrderBy) { this.insertionOrderBy = insertionOrderBy; } } public static class JSONTableInputDefinition { private TextInputSpecs inputSpecs; private InputType inputType = InputType.TEXT; private String cascadingColumns; private String hiveTableName; private String hiveDbName; private List<String> paths; public TextInputSpecs getInputSpecs() { return inputSpecs; } public void setInputSpecs(TextInputSpecs inputSpecs) { this.inputSpecs = inputSpecs; } public List<String> getPaths() { return paths; } public void setPaths(List<String> paths) { this.paths = paths; } public InputType getInputType() { return inputType; } public void setInputType(InputType inputType) { this.inputType = inputType; } public String getCascadingColumns() { return cascadingColumns; } public void setCascadingColumns(String cascadingColumns) { this.cascadingColumns = cascadingColumns; } public String getHiveTableName() { return hiveTableName; } public void setHiveTableName(String hiveTableName) { this.hiveTableName = hiveTableName; } public String getHiveDbName() { return hiveDbName; } public void setHiveDbName(String hiveDbName) { this.hiveDbName = hiveDbName; } } public static class TextInputSpecs { private char separatorChar = '\t'; private char quotesChar = TupleTextInputFormat.NO_QUOTE_CHARACTER; private char escapeChar = TupleTextInputFormat.NO_ESCAPE_CHARACTER; private boolean skipHeader = false; private boolean strictQuotes = false; private String nullString = TupleTextInputFormat.NO_NULL_STRING; private ArrayList<Integer> fixedWidthFields = null; public char getSeparatorChar() { return separatorChar; } public void setSeparatorChar(char separatorChar) { this.separatorChar = separatorChar; } public char getQuotesChar() { return quotesChar; } public void setQuotesChar(char quotesChar) { this.quotesChar = quotesChar; } public char getEscapeChar() { return escapeChar; } public void setEscapeChar(char escapeChar) { this.escapeChar = escapeChar; } public boolean isSkipHeader() { return skipHeader; } public void setSkipHeader(boolean skipHeader) { this.skipHeader = skipHeader; } public boolean isStrictQuotes() { return strictQuotes; } public void setStrictQuotes(boolean strictQuotes) { this.strictQuotes = strictQuotes; } public String getNullString() { return nullString; } public void setNullString(String nullString) { this.nullString = nullString; } public ArrayList<Integer> getFixedWidthFields() { return fixedWidthFields; } public void setFixedWidthFields(ArrayList<Integer> fixedWidthFields) { this.fixedWidthFields = fixedWidthFields; } } public int getnPartitions() { return nPartitions; } public void setnPartitions(int nPartitions) { this.nPartitions = nPartitions; } public List<JSONTableDefinition> getPartitionedTables() { return partitionedTables; } public void setPartitionedTables(List<JSONTableDefinition> partitionedTables) { this.partitionedTables = partitionedTables; } public List<JSONTableDefinition> getReplicateAllTables() { return replicateAllTables; } public void setReplicateAllTables(List<JSONTableDefinition> replicateAllTables) { this.replicateAllTables = replicateAllTables; } public String getName() { return name; } public void setName(String name) { this.name = name; } public String getEngine() { return engine; } public void setEngine(String engine) { this.engine = engine; } }