package com.thinkbiganalytics.spark; /*- * #%L * thinkbig-commons-spark-api * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.StructType; import java.util.List; /** * A collection of objects that can be transformed using Spark functions. */ public interface DataSet { /** * Returns the content of this data set as a Spark RDD. * * @return a Spark RDD */ RDD<Row> rdd(); /** * Returns the content of this data set as a Spark RDD. * * @return a Spark RDD */ JavaRDD<Row> javaRDD(); /** * Filters rows using the specified SQL expression. * * @param condition a SQL expression * @return the filtered data set */ DataSet filter(String condition); /** * Drops the specified column from this data set. * * @param condition the column to be dropped * @return the data set without the column */ DataSet drop(String condition); /** * Converts this strongly-typed data set to a generic data set. * * @return the generic data set */ DataSet toDF(); /** * Returns the number of rows in this data set. * * @return the row count */ long count(); /** * Registers this data set as a temporary table with the specified name. * * @param tableName the name for the temporary table */ void registerTempTable(String tableName); /** * Returns the schema of this data set. * * @return the schema */ StructType schema(); /** * Returns a list that contains all rows in this data set. * * @return the rows */ List<Row> collectAsList(); /** * Saves the content of this data set as the specified table. * * @param partitionColumn the name of the partition column * @param fqnTable the name for the table */ void writeToTable(String partitionColumn, String fqnTable); }