/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.core.lib;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.Test;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

/**
 * Created by imedina on 04/08/16.
 *
 * Loads a small Avro variant file into a {@link VariantDataset}, applies a type filter
 * and checks the resulting count.
 */
public class VariantDatasetTest {

    @Test
    public void execute() {
        // it does not matter what we set as Spark's home directory
        SparkConf sparkConf = SparkConfCreator.getConf("VariantDatasetTest", "local", 1, true, "");
        System.out.println("sparkConf = " + sparkConf.toDebugString());
        SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));

        System.out.println(">>>> opening file...");
        //String filename = "/home/imedina/data/CEU-1409-01_20000.vcf.avro";
        //String filename = "/home/jtarraga/data150/spark/10k.variants.avro";
        try {
            Path inputPath = Paths.get(getClass().getResource("/100.variants.avro").toURI());
            long count;

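            // load the Avro variants and register them as the "vcf" temporary view,
            // so they can also be queried with Spark SQL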
            VariantDataset vd = new VariantDataset();
            vd.load(inputPath.toString(), sparkSession);
            vd.printSchema();
            vd.createOrReplaceTempView("vcf");
            System.out.println("--------------------------------------");

            // other filter examples, kept commented out for reference:
            // long count = vd.annotationFilter("consequenceTypes.sequenceOntologyTerms.accession", "SO:0001566").count();
            // count = vd.annotationFilter("populationFrequencies.altAlleleFreq", "1000G:CEU < 1.2,1000G:ASW < 1.25")
            //         .count();
            // vd.annotationFilter("populationFrequencies", "1000G:ASW < 0.2").count();
            // long count = vd.annotationFilter("consequenceTypes.sequenceOntologyTerms.accession", "SO:0001566")
            // long count = vd//.annotationFilter("consequenceTypes.sequenceOntologyTerms.name", "missense_variant")
            //         .annotationFilter("conservation.phylop", "< 0.2")
            //         .annotationFilter("conservation.phastCons", "< 0.4")
            //         .idFilter("rs587604674")
            //         .count();
            // String ids = "rs587604674,rs587603352";
            // count = vd.idFilter(Arrays.asList(StringUtils.split(ids, ","))).count();
            // count = vd.annotationFilter("conservation", "phylop<0.3,phastCons<0.1").count();
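
            // keep only variants of type SNP or SNV and count the remaining records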
            String types = "SNP,SNV";
            count = vd.typeFilter(new ArrayList<>(Arrays.asList(StringUtils.split(types, ",")))).count();

            // System.out.println(vd.annotationFilter("consequenceTypes.sequenceOntologyTerms.name", "missense_variant")
            //         .select("annotation.consequenceTypes.sequenceOntologyTerms").count());
            // System.out.println(vd.idFilter("rs587604674").count());
            // System.out.println(vd.annotationFilter("id", "ENSG00000233866").count());

            // note from a spark-shell session on the 10k variant dataset:
            // scala> spark.sql("select * from v10k lateral view explode(annotation.consequenceTypes) act as ct lateral view explode(ct.sequenceOntologyTerms) ctso as so where so.accession = 'SO:0001566'").count()
            // res6: Long = 4437

            // vd.select(vd.col("studies")).show(2);
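
            // A minimal sketch of how a similar Spark SQL query could be run from Java against the
            // "vcf" temporary view registered above (the SO accession is only illustrative):
            // long soCount = sparkSession.sql("SELECT * FROM vcf"
            //         + " LATERAL VIEW explode(annotation.consequenceTypes) act AS ct"
            //         + " LATERAL VIEW explode(ct.sequenceOntologyTerms) ctso AS so"
            //         + " WHERE so.accession = 'SO:0001566'").count();
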
            System.out.println("--------------------------------------");
            System.out.println("count = " + count);
            System.out.println("--------------------------------------");

            // System.out.println(vd.count());
            // System.out.println(vd.filter("start >= 564477").filter("end <= 729948").count());
            // System.out.println(vd.groupBy("chromosome").avg("start").sort("chromosome").take(3)[0]);
            // System.out.println(vd.describe("studies"));
            // System.out.println("---->>> " + vd.select("studies.files").head());
            // System.out.println("---->>> " + vd.select("studies.files").select("attributes").head());
            // System.out.println(vd.filter("studies.files[0].attributes.AF = '0.009'"));
            // System.out.println(vd.filter("studies.files.attributes.AF = '0.009'"));

            vd.sparkSession.sparkContext().stop();

            // the 100-variant test file is expected to contain 98 variants of type SNP or SNV
            assertEquals(98, count);
        } catch (Exception e) {
            e.printStackTrace();
            fail("Unexpected exception: " + e.getMessage());
        }
    }
}