package com.splout.db.hadoop; /* * #%L * Splout SQL Hadoop library * %% * Copyright (C) 2012 Datasalt Systems S.L. * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.datasalt.pangool.io.ITuple; import com.datasalt.pangool.io.Schema; import com.datasalt.pangool.io.Schema.Field; import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat; import com.google.common.collect.ImmutableList; import com.splout.db.engine.SploutEngine; import com.splout.db.hadoop.TableSpec.FieldIndex; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.InputFormat; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; /** * Immutable bean that defines a Tablespace whose view has to be generated. It may contain one or more {@link Table} beans. * It can be obtained by {@link TablespaceBuilder}. */ public class TablespaceSpec { private final ImmutableList<Table> partitionedTables; private final ImmutableList<Table> replicateAllTables; private final int nPartitions; private final List<String> initStatements; private final SploutEngine engine; TablespaceSpec(List<Table> partitionedTables, List<Table> replicateAllTables, int nPartitions, List<String> initStatements, SploutEngine engine) { this.partitionedTables = ImmutableList.copyOf(partitionedTables); this.replicateAllTables = ImmutableList.copyOf(replicateAllTables == null ? new ArrayList<Table>() : replicateAllTables); this.nPartitions = nPartitions; this.initStatements = initStatements; this.engine = engine; } /** * (Common case that can be built without using the builder) */ public static TablespaceSpec of(Schema schema, String partitionField, Path input, InputFormat<ITuple, NullWritable> inputFormat, int nPartitions) { return of(schema, new String[]{partitionField}, input, inputFormat, nPartitions); } /** * Schema-less quick tablespace builder that samples the first record of the first InputSplit in order to obtain the Table Schema. * Note that this will only work for InputFormats that can obtain the Schema implicitly (e.g. TupleInputFormat). */ public static TablespaceSpec of(Configuration conf, String[] partitionFields, Path input, InputFormat<ITuple, NullWritable> inputFormat, int nPartitions) throws IOException, InterruptedException { if (inputFormat instanceof TupleTextInputFormat) { throw new IllegalArgumentException("Can't derive an implicit schema from a text file."); } return of(SchemaSampler.sample(conf, input, inputFormat), partitionFields, input, inputFormat, nPartitions); } public static TablespaceSpec of(Schema schema, String[] partitionFields, Path input, InputFormat<ITuple, NullWritable> inputFormat, int nPartitions) { List<Table> partitionedTables = new ArrayList<Table>(); if (schema == null) { throw new IllegalArgumentException("Schema can't be null."); } if (partitionFields == null) { throw new IllegalArgumentException("Partition fields can't be null"); } if (input == null) { throw new IllegalArgumentException("Input path can't be null"); } if (inputFormat == null) { throw new IllegalArgumentException("Input format can't be null"); } List<Field> fields = new ArrayList<Field>(); for (String partitionField : partitionFields) { Field field = schema.getField(partitionField); if (field == null) { throw new IllegalArgumentException("Partition field not contained in input schema: " + partitionField); } fields.add(field); } Field[] partitionByFields = fields.toArray(new Field[0]); partitionedTables.add(new Table(new TableInput(inputFormat, new HashMap<String, String>(), schema, new IdentityRecordProcessor(), input), new TableSpec(schema, partitionByFields, new FieldIndex[]{new FieldIndex(partitionByFields)}, null, null, null, null, null))); TablespaceSpec tablespace = new TablespaceSpec(partitionedTables, new ArrayList<Table>(), nPartitions, null, SploutEngine.getDefault()); return tablespace; } // ---- Getters ---- // public ImmutableList<Table> getPartitionedTables() { return partitionedTables; } public ImmutableList<Table> getReplicateAllTables() { return replicateAllTables; } public int getnPartitions() { return nPartitions; } public List<String> getInitStatements() { return initStatements; } public SploutEngine getEngine() { return engine; } }