/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.core.lib;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.storage.StorageLevel;
import org.opencb.biodata.models.core.Region;
import org.opencb.commons.datastore.core.Query;
import scala.Symbol;
import scala.Tuple2;
import scala.collection.Seq;
import java.util.List;
import java.util.Map;
/**
 * Fluent wrapper around a Spark SQL {@link Dataset} of {@link Row}s that also carries an
 * OpenCB {@link Query}. Most methods delegate to the wrapped {@code Dataset}, replace it with
 * the result, and return {@code this} so calls can be chained. Subclasses translate the
 * accumulated {@link Query} into concrete Spark filters by implementing
 * {@link #updateDataset(Query)}.
 *
 * <p>Created by imedina on 04/08/16.
 *
 * @param <T> concrete element/subclass type used by subclasses of this fluent interface
 */
public abstract class ParentDataset<T> {

    protected Query query;
    protected String viewName;
    protected Dataset<Row> ds;
    protected SQLContext sqlContext;
    protected SparkSession sparkSession;

    public ParentDataset() {
        ds = null;
        sqlContext = null;
        query = new Query();
    }

    /**
     * Loads the given file into the internal Dataset. The reader is chosen from the file
     * extension: Avro (optionally {@code .gz}/{@code .sz} compressed), JSON (optionally
     * gzipped), or Spark's default format otherwise.
     *
     * @param filename     path of the file to load
     * @param sparkSession active Spark session used to read the file
     * @throws Exception if Spark fails to read the file
     */
    public void load(String filename, SparkSession sparkSession) throws Exception {
        this.sparkSession = sparkSession;
        sqlContext = new SQLContext(sparkSession);
        if (StringUtils.endsWithAny(filename, "avro", "avro.gz", "avro.sz")) {
            ds = sqlContext.read().format("com.databricks.spark.avro").load(filename);
        } else if (StringUtils.endsWithAny(filename, "json", "json.gz")) {
            ds = sqlContext.read().json(filename);
        } else {
            ds = sqlContext.read().load(filename);
        }
    }

    /** Applies the given query to the wrapped Dataset; implemented by subclasses. */
    protected abstract void updateDataset(Query query);

    /** Applies the pending {@link Query} filters to the wrapped Dataset. */
    public void update() {
        updateDataset(query);
    }

    // ---------------------------------------------------------------------
    // Region filters: accumulate region constraints in the pending query.
    // ---------------------------------------------------------------------

    /**
     * Adds a region filter to the pending query.
     *
     * @param regions comma-separated region list, e.g. {@code "1:1000-2000,2:3000-4000"}
     * @return this dataset, for chaining
     */
    public ParentDataset<T> regionFilter(String regions) {
        query.put("region", regions);
        return this;
    }

    /** Adds a single-region filter (uses {@link Region#toString()}). */
    public ParentDataset<T> regionFilter(Region region) {
        return regionFilter(region.toString());
    }

    /** Adds a filter for several regions, joined with commas. */
    public ParentDataset<T> regionFilter(List<Region> regions) {
        // Join directly with ",". The previous implementation passed the StringUtils.join
        // arguments in the wrong order and then stripped the "[", "]" artifacts of
        // List.toString(), which left a stray space after each comma.
        return regionFilter(StringUtils.join(regions, ","));
    }

    // ---------------------------------------------------------------------
    // Aggregation. These delegate to Dataset and keep the result wrapped.
    // ---------------------------------------------------------------------

    public ParentDataset<T> agg(Column expr, Column... exprs) {
        ds = ds.agg(expr, exprs);
        return this;
    }

    public ParentDataset<T> agg(Column expr, Seq<Column> exprs) {
        ds = ds.agg(expr, exprs);
        return this;
    }

    public ParentDataset<T> agg(scala.collection.immutable.Map<String, String> exprs) {
        ds = ds.agg(exprs);
        return this;
    }

    public ParentDataset<T> agg(Map<String, String> exprs) {
        ds = ds.agg(exprs);
        return this;
    }

    public ParentDataset<T> agg(Tuple2<String, String> aggExpr, Seq<Tuple2<String, String>> aggExprs) {
        ds = ds.agg(aggExpr, aggExprs);
        return this;
    }

    public ParentDataset<T> alias(String alias) {
        ds = ds.alias(alias);
        return this;
    }

    public ParentDataset<T> alias(Symbol alias) {
        ds = ds.alias(alias);
        return this;
    }

    /** Returns the named column of the wrapped Dataset. */
    public Column apply(String colName) {
        return ds.apply(colName);
    }

    public ParentDataset<T> as(String alias) {
        ds = ds.as(alias);
        return this;
    }

    public ParentDataset<T> as(Symbol alias) {
        ds = ds.as(alias);
        return this;
    }

    public ParentDataset<T> cache() {
        ds = ds.cache();
        return this;
    }

    public ParentDataset<T> coalesce(int numPartitions) {
        ds = ds.coalesce(numPartitions);
        return this;
    }

    public Column col(String colName) {
        return ds.col(colName);
    }

    public Object collect() {
        return ds.collect();
    }

    public List<Row> collectAsList() {
        return ds.collectAsList();
    }

    protected int collectToPython() {
        return ds.collectToPython();
    }

    public String[] columns() {
        return ds.columns();
    }

    /**
     * Applies the pending query and counts the resulting rows.
     * Widened from (apparently accidental) package-private to public.
     */
    public long count() {
        updateDataset(query);
        return ds.count();
    }

    public RelationalGroupedDataset cube(Column... cols) {
        return ds.cube(cols);
    }

    public RelationalGroupedDataset cube(Seq<Column> cols) {
        return ds.cube(cols);
    }

    public RelationalGroupedDataset cube(String col1, Seq<String> cols) {
        return ds.cube(col1, cols);
    }

    public RelationalGroupedDataset cube(String col1, String... cols) {
        return ds.cube(col1, cols);
    }

    public ParentDataset<T> describe(Seq<String> cols) {
        ds = ds.describe(cols);
        return this;
    }

    public ParentDataset<T> describe(String... cols) {
        ds = ds.describe(cols);
        return this;
    }

    public ParentDataset<T> distinct() {
        ds = ds.distinct();
        return this;
    }

    public ParentDataset<T> drop(Column col) {
        ds = ds.drop(col);
        return this;
    }

    public ParentDataset<T> drop(String colName) {
        ds = ds.drop(colName);
        return this;
    }

    public ParentDataset<T> dropDuplicates() {
        ds = ds.dropDuplicates();
        return this;
    }

    public ParentDataset<T> dropDuplicates(Seq<String> colNames) {
        ds = ds.dropDuplicates(colNames);
        return this;
    }

    public ParentDataset<T> dropDuplicates(String[] colNames) {
        ds = ds.dropDuplicates(colNames);
        return this;
    }

    public Tuple2<String, String>[] dtypes() {
        return ds.dtypes();
    }

    public ParentDataset<T> except(Dataset<Row> other) {
        ds = ds.except(other);
        return this;
    }

    public void explain() {
        ds.explain();
    }

    public void explain(boolean extended) {
        ds.explain(extended);
    }

    public ParentDataset<T> filter(Column condition) {
        ds = ds.filter(condition);
        return this;
    }

    /**
     * Applies the pending query first, then the given SQL condition expression.
     * Note the asymmetry with {@link #filter(Column)}, which does not apply the query.
     */
    public ParentDataset<T> filter(String conditionExpr) {
        updateDataset(query);
        ds = ds.filter(conditionExpr);
        return this;
    }

    public Row first() {
        return ds.first();
    }

    public void foreach(scala.Function1<Row, scala.runtime.BoxedUnit> f) {
        ds.foreach(f);
    }

    public void foreachPartition(scala.Function1<scala.collection.Iterator<Row>, scala.runtime.BoxedUnit> f) {
        ds.foreachPartition(f);
    }

    public RelationalGroupedDataset groupBy(Column... cols) {
        return ds.groupBy(cols);
    }

    public RelationalGroupedDataset groupBy(Seq<Column> cols) {
        return ds.groupBy(cols);
    }

    public RelationalGroupedDataset groupBy(String col1, Seq<String> cols) {
        return ds.groupBy(col1, cols);
    }

    public RelationalGroupedDataset groupBy(String col1, String... cols) {
        return ds.groupBy(col1, cols);
    }

    public Row head() {
        return ds.head();
    }

    public Object head(int n) {
        return ds.head(n);
    }

    public String[] inputFiles() {
        return ds.inputFiles();
    }

    public ParentDataset<T> intersect(Dataset<Row> other) {
        ds = ds.intersect(other);
        return this;
    }

    /**
     * Whether the Dataset can be evaluated locally (no executors required).
     * Widened from (apparently accidental) package-private to public.
     */
    public boolean isLocal() {
        return ds.isLocal();
    }

    public JavaRDD<Row> javaRDD() {
        return ds.javaRDD();
    }

    protected JavaRDD<byte[]> javaToPython() {
        return ds.javaToPython();
    }

    public ParentDataset<T> join(Dataset<Row> right) {
        ds = ds.join(right);
        return this;
    }

    public ParentDataset<T> join(Dataset<Row> right, Column joinExprs) {
        ds = ds.join(right, joinExprs);
        return this;
    }

    public ParentDataset<T> join(Dataset<Row> right, Column joinExprs, String joinType) {
        ds = ds.join(right, joinExprs, joinType);
        return this;
    }

    public ParentDataset<T> join(Dataset<Row> right, Seq<String> usingColumns) {
        ds = ds.join(right, usingColumns);
        return this;
    }

    public ParentDataset<T> join(Dataset<Row> right, Seq<String> usingColumns, String joinType) {
        ds = ds.join(right, usingColumns, joinType);
        return this;
    }

    public ParentDataset<T> join(Dataset<Row> right, String usingColumn) {
        ds = ds.join(right, usingColumn);
        return this;
    }

    public ParentDataset<T> limit(int n) {
        ds = ds.limit(n);
        return this;
    }

    protected org.apache.spark.sql.catalyst.plans.logical.LogicalPlan logicalPlan() {
        return ds.logicalPlan();
    }

    public DataFrameNaFunctions na() {
        return ds.na();
    }

    protected Seq<org.apache.spark.sql.catalyst.expressions.Expression> numericColumns() {
        return ds.numericColumns();
    }

    public ParentDataset<T> orderBy(Column... sortExprs) {
        ds = ds.orderBy(sortExprs);
        return this;
    }

    public ParentDataset<T> orderBy(Seq<Column> sortExprs) {
        ds = ds.orderBy(sortExprs);
        return this;
    }

    public ParentDataset<T> orderBy(String sortCol, Seq<String> sortCols) {
        ds = ds.orderBy(sortCol, sortCols);
        return this;
    }

    public ParentDataset<T> orderBy(String sortCol, String... sortCols) {
        ds = ds.orderBy(sortCol, sortCols);
        return this;
    }

    public ParentDataset<T> persist() {
        ds = ds.persist();
        return this;
    }

    public ParentDataset<T> persist(StorageLevel newLevel) {
        ds = ds.persist(newLevel);
        return this;
    }

    public void printSchema() {
        ds.printSchema();
    }

    public RDD<Row> rdd() {
        return ds.rdd();
    }

    /** @deprecated use {@link #createOrReplaceTempView(String)} instead. */
    @Deprecated
    public void registerTempTable(String tableName) {
        this.viewName = tableName;
        ds.registerTempTable(tableName);
    }

    public void createOrReplaceTempView(String viewName) {
        this.viewName = viewName;
        ds.createOrReplaceTempView(viewName);
    }

    public void createTempView(String viewName) throws AnalysisException {
        this.viewName = viewName;
        ds.createTempView(viewName);
    }

    public ParentDataset<T> repartition(Column... partitionExprs) {
        ds = ds.repartition(partitionExprs);
        return this;
    }

    public ParentDataset<T> repartition(int numPartitions) {
        ds = ds.repartition(numPartitions);
        return this;
    }

    public ParentDataset<T> repartition(int numPartitions, Column... partitionExprs) {
        ds = ds.repartition(numPartitions, partitionExprs);
        return this;
    }

    public ParentDataset<T> repartition(int numPartitions, Seq<Column> partitionExprs) {
        ds = ds.repartition(numPartitions, partitionExprs);
        return this;
    }

    public ParentDataset<T> repartition(Seq<Column> partitionExprs) {
        ds = ds.repartition(partitionExprs);
        return this;
    }

    protected org.apache.spark.sql.catalyst.expressions.NamedExpression resolve(String colName) {
        return ds.resolve(colName);
    }

    public RelationalGroupedDataset rollup(Column... cols) {
        return ds.rollup(cols);
    }

    public RelationalGroupedDataset rollup(Seq<Column> cols) {
        return ds.rollup(cols);
    }

    public RelationalGroupedDataset rollup(String col1, Seq<String> cols) {
        return ds.rollup(col1, cols);
    }

    public RelationalGroupedDataset rollup(String col1, String... cols) {
        return ds.rollup(col1, cols);
    }

    public ParentDataset<T> sample(boolean withReplacement, double fraction) {
        ds = ds.sample(withReplacement, fraction);
        return this;
    }

    public ParentDataset<T> sample(boolean withReplacement, double fraction, long seed) {
        ds = ds.sample(withReplacement, fraction, seed);
        return this;
    }

    public StructType schema() {
        return ds.schema();
    }

    public ParentDataset<T> select(Column... cols) {
        ds = ds.select(cols);
        return this;
    }

    public ParentDataset<T> select(Seq<Column> cols) {
        ds = ds.select(cols);
        return this;
    }

    public ParentDataset<T> select(String col, Seq<String> cols) {
        ds = ds.select(col, cols);
        return this;
    }

    public ParentDataset<T> select(String col, String... cols) {
        ds = ds.select(col, cols);
        return this;
    }

    public ParentDataset<T> selectExpr(Seq<String> exprs) {
        ds = ds.selectExpr(exprs);
        return this;
    }

    public ParentDataset<T> selectExpr(String... exprs) {
        ds = ds.selectExpr(exprs);
        return this;
    }

    /** Shows the first 20 rows, applying the pending query first (via {@link #show(int)}). */
    public void show() {
        this.show(20);
    }

    public void show(boolean truncate) {
        ds.show(truncate);
    }

    /** Applies the pending query, then shows the first {@code numRows} rows. */
    public void show(int numRows) {
        updateDataset(query);
        ds.show(numRows);
    }

    public void show(int numRows, boolean truncate) {
        ds.show(numRows, truncate);
    }

    public ParentDataset<T> sort(Column... sortExprs) {
        ds = ds.sort(sortExprs);
        return this;
    }

    public ParentDataset<T> sort(Seq<Column> sortExprs) {
        ds = ds.sort(sortExprs);
        return this;
    }

    public ParentDataset<T> sort(String sortCol, Seq<String> sortCols) {
        ds = ds.sort(sortCol, sortCols);
        return this;
    }

    public ParentDataset<T> sort(String sortCol, String... sortCols) {
        ds = ds.sort(sortCol, sortCols);
        return this;
    }

    public ParentDataset<T> sortWithinPartitions(Column... sortExprs) {
        ds = ds.sortWithinPartitions(sortExprs);
        return this;
    }

    public ParentDataset<T> sortWithinPartitions(Seq<Column> sortExprs) {
        ds = ds.sortWithinPartitions(sortExprs);
        return this;
    }

    public ParentDataset<T> sortWithinPartitions(String sortCol, Seq<String> sortCols) {
        ds = ds.sortWithinPartitions(sortCol, sortCols);
        return this;
    }

    public ParentDataset<T> sortWithinPartitions(String sortCol, String... sortCols) {
        ds = ds.sortWithinPartitions(sortCol, sortCols);
        return this;
    }

    public SQLContext sqlContext() {
        return ds.sqlContext();
    }

    public DataFrameStatFunctions stat() {
        return ds.stat();
    }

    public Object take(int n) {
        return ds.take(n);
    }

    public List<Row> takeAsList(int n) {
        return ds.takeAsList(n);
    }

    public ParentDataset<T> toDF() {
        ds = ds.toDF();
        return this;
    }

    public ParentDataset<T> toDF(Seq<String> colNames) {
        ds = ds.toDF(colNames);
        return this;
    }

    public ParentDataset<T> toDF(String... colNames) {
        ds = ds.toDF(colNames);
        return this;
    }

    public JavaRDD<Row> toJavaRDD() {
        return ds.toJavaRDD();
    }

    public Dataset<String> toJSON() {
        return ds.toJSON();
    }

    public ParentDataset<T> union(Dataset<Row> other) {
        ds = ds.union(other);
        return this;
    }

    /** @deprecated use {@link #union(Dataset)} instead. */
    @Deprecated
    public ParentDataset<T> unionAll(Dataset<Row> other) {
        ds = ds.unionAll(other);
        return this;
    }

    public ParentDataset<T> unpersist() {
        ds = ds.unpersist();
        return this;
    }

    public ParentDataset<T> unpersist(boolean blocking) {
        ds = ds.unpersist(blocking);
        return this;
    }

    public ParentDataset<T> where(Column condition) {
        ds = ds.where(condition);
        return this;
    }

    public ParentDataset<T> where(String conditionExpr) {
        ds = ds.where(conditionExpr);
        return this;
    }

    public ParentDataset<T> withColumn(String colName, Column col) {
        ds = ds.withColumn(colName, col);
        return this;
    }

    public ParentDataset<T> withColumnRenamed(String existingName, String newName) {
        ds = ds.withColumnRenamed(existingName, newName);
        return this;
    }

    public DataFrameWriter<Row> write() {
        return ds.write();
    }
}