/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.core.avro;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.cellbase.client.config.ClientConfiguration;
import org.opencb.cellbase.client.config.RestConfig;
import org.opencb.cellbase.client.rest.CellBaseClient;
import org.opencb.cellbase.client.rest.VariationClient;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResponse;
import org.opencb.commons.utils.FileUtils;
import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Created by imedina on 09/08/16.
*/
public class VariantAvroAnnotator {
private CellBaseClient cellBaseClient;
public VariantAvroAnnotator() {
ClientConfiguration clientConfiguration = new ClientConfiguration();
clientConfiguration.setVersion("v4");
clientConfiguration.setRest(new RestConfig(Collections
.singletonList("http://bioinfodev.hpc.cam.ac.uk/cellbase-4.1.0-beta"), 30000));
cellBaseClient = new CellBaseClient("hsapiens", clientConfiguration);
}
public void annotate(Path avroPath, Path annotatedAvroPath) throws IOException {
FileUtils.checkFile(avroPath);
FileUtils.checkDirectory(annotatedAvroPath.getParent(), true);
InputStream inputStream = new FileInputStream(avroPath.toFile());
DatumReader<VariantAvro> datumReader = new SpecificDatumReader<>(VariantAvro.SCHEMA$);
DataFileStream<VariantAvro> dataFileStream = new DataFileStream<>(inputStream, datumReader);
OutputStream outputStream = new FileOutputStream(annotatedAvroPath.toFile());
DatumWriter<VariantAvro> datumWriter = new SpecificDatumWriter<>();
DataFileWriter<VariantAvro> dataFileWriter = new DataFileWriter<>(datumWriter);
dataFileWriter.create(VariantAvro.SCHEMA$, outputStream);
// dataFileWriter.setCodec(CodecFactory.deflateCodec(CodecFactory.DEFAULT_DEFLATE_LEVEL));
VariationClient variationClient = cellBaseClient.getVariationClient();
List<String> variants = new ArrayList<>(2000);
List<VariantAvro> records = new ArrayList<>(2000);
VariantAvro record;
int counter = 1;
while (dataFileStream.hasNext()) {
record = dataFileStream.next();
records.add(record);
variants.add(record.getChromosome() + ":" + record.getStart() + ":" + record.getReference() + ":" + record.getAlternate());
if (counter++ % 2000 == 0) {
System.out.println("Annotating 2000 variants batch...");
QueryResponse<VariantAnnotation> annotations = variationClient.getAnnotations(variants,
new QueryOptions(QueryOptions.EXCLUDE, "expression"));
for (int i = 0; i < annotations.getResponse().size(); i++) {
records.get(i).setAnnotation(annotations.getResponse().get(i).first());
dataFileWriter.append(records.get(i));
}
dataFileWriter.flush();
records.clear();
variants.clear();
}
}
// annotate remaining variants
if (records.size() > 0) {
QueryResponse<VariantAnnotation> annotations = variationClient.getAnnotations(variants,
new QueryOptions(QueryOptions.EXCLUDE, "expression"));
for (int i = 0; i < annotations.getResponse().size(); i++) {
records.get(i).setAnnotation(annotations.getResponse().get(i).first());
dataFileWriter.append(records.get(i));
}
dataFileWriter.flush();
}
dataFileWriter.close();
inputStream.close();
dataFileStream.close();
}
}