/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
package org.opencb.hpg.bigdata.core.io;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.NotImplementedException;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.commons.io.DataReader;
import java.io.*;
import java.nio.CharBuffer;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;
/**
 * Iterates over the lines of a VCF input in blocks of {@link CharBuffer}s.
 *
 * <p>The VCF header is consumed once at construction time through the supplied
 * {@link FullVcfCodec}; the lines remaining afterwards are handed out in batches, either
 * bounded by an approximate number of characters ({@link #next(long)}) or by a number of
 * lines ({@link #toLineDataReader()}).
 *
 * <p>The instance is its own {@link Iterator}: {@link #iterator()} returns {@code this}, so
 * the input can only be traversed once.
 *
 * @author mh719
 *
 */
public class VcfBlockIterator implements AutoCloseable, Iterator<List<CharBuffer>>, Iterable<List<CharBuffer>> {
    /** Default block size in characters used by {@link #next()}. */
    private static final long DEFAULT_64KB_BLOCK = 64L * 1024L;

    /** Source file, or {@code null} when the iterator was built from a stream. */
    private final File file;
    /** Underlying stream; closed by {@link #close()}. */
    private final InputStream in;
    /** Line source positioned after the header once construction has finished. */
    private final LineIterator iter;
    /** Block size (in characters) used by the no-argument {@link #next()}. */
    private final AtomicLong charBlockSize = new AtomicLong(DEFAULT_64KB_BLOCK);
    private final VCFHeader header;
    private final VCFHeaderVersion version;

    /**
     * Opens the given file with a fresh {@link FullVcfCodec} and reads its header.
     *
     * @param vcfFile file to read; see {@link #buildInputStream(File)} for the accepted extensions
     * @throws IOException if the file cannot be opened
     */
    public VcfBlockIterator(File vcfFile) throws IOException {
        this(vcfFile, new FullVcfCodec());
    }

    /**
     * Opens the given file and reads its header with the supplied codec.
     *
     * <p>The stream is opened by this constructor, so it is closed again if reading the header
     * fails; otherwise nobody would hold a reference that could release it.
     *
     * @param vcfFile file to read; see {@link #buildInputStream(File)} for the accepted extensions
     * @param codec   codec used to create the line source and to parse the header
     * @throws IOException if the file cannot be opened
     */
    public VcfBlockIterator(File vcfFile, FullVcfCodec codec) throws IOException {
        InputStream stream = buildInputStream(vcfFile);
        LineIterator lines;
        VCFHeader parsedHeader;
        VCFHeaderVersion parsedVersion;
        try {
            lines = codec.makeSourceFromStream(stream);
            parsedHeader = (VCFHeader) codec.readActualHeader(lines);
            parsedVersion = codec.getVCFHeaderVersion();
        } catch (Throwable failure) {
            // Rethrown unchanged below; caught only to release the stream we opened ourselves.
            closeAfterFailure(stream, failure);
            throw failure;
        }
        this.file = vcfFile;
        this.in = stream;
        this.iter = lines;
        this.header = parsedHeader;
        this.version = parsedVersion;
    }

    /**
     * Reads the header from an already opened stream with the supplied codec.
     *
     * <p>The stream is not closed if header parsing fails, because it was opened by the caller.
     * It is closed by {@link #close()}.
     *
     * @param in    stream to read from
     * @param codec codec used to create the line source and to parse the header
     * @throws IOException declared for signature compatibility with the file based constructors
     */
    public VcfBlockIterator(InputStream in, FullVcfCodec codec) throws IOException {
        this.file = null;
        this.in = in;
        this.iter = codec.makeSourceFromStream(this.in);
        this.header = (VCFHeader) codec.readActualHeader(this.iter);
        this.version = codec.getVCFHeaderVersion();
    }

    /**
     * @return the header read at construction time
     */
    public VCFHeader getHeader() {
        return this.header;
    }

    /**
     * @return the header version reported by the codec after the header was read
     */
    public VCFHeaderVersion getVersion() {
        return version;
    }

    /**
     * Returns the next block using the default block size.
     *
     * @return the next lines; an empty list when no lines are left
     */
    @Override
    public List<CharBuffer> next() {
        return next(this.charBlockSize.get());
    }

    /**
     * Collects lines until their accumulated length reaches {@code blockSize} or the input ends.
     *
     * <p>The size is checked before each line is read, so the returned block can exceed
     * {@code blockSize} by up to one line.
     *
     * @param blockSize number of characters after which no further line is added
     * @return the collected lines, one buffer per line; empty when no lines are left
     */
    public List<CharBuffer> next(long blockSize) {
        long cnt = 0L;
        List<CharBuffer> next = new LinkedList<>(); // linked list faster at creation time
        while (iter.hasNext() && cnt < blockSize) {
            String line = iter.next();
            CharBuffer buff = CharBuffer.wrap(line.toCharArray()); //FIXME! Avoid char array copy
            next.add(buff);
            cnt += buff.length();
        }
        return next;
    }

    @Override
    public boolean hasNext() {
        return iter.hasNext();
    }

    /**
     * Opens the file and wraps it according to its extension: {@code gz} and {@code gzip} are
     * decompressed, {@code vcf}, {@code txt} and {@code tsv} are read as they are.
     *
     * <p>The file stream is closed again if wrapping fails (unsupported extension, or content
     * that is not valid gzip), so a failed call does not leak a file handle.
     *
     * @param inFile file to open
     * @return a buffered stream over the (possibly decompressed) file content
     * @throws IOException             if the file cannot be opened or the gzip header cannot be read
     * @throws NotImplementedException if the extension is not one of the supported ones
     */
    protected InputStream buildInputStream(File inFile) throws IOException {
        InputStream inputStream = new FileInputStream(inFile);
        try {
            String name = inFile.getName();
            String ext = FilenameUtils.getExtension(name);
            switch (ext) {
                case "gz":
                case "gzip":
                    inputStream = new GZIPInputStream(inputStream);
                    break;
                case "vcf":
                case "txt":
                case "tsv":
                    //nothing to do
                    break;
                default:
                    throw new NotImplementedException(String.format("Compression extension %s not yet supported!!!", ext));
            }
            return new BufferedInputStream(inputStream);
        } catch (IOException | RuntimeException failure) {
            // inputStream still refers to the raw file stream here: the only reassignment is the
            // GZIP wrapper, and if that constructor throws the assignment never happens.
            closeAfterFailure(inputStream, failure);
            throw failure;
        }
    }

    /**
     * Closes the underlying input stream, including one supplied by the caller.
     *
     * @throws IOException if closing the stream fails
     */
    @Override
    public void close() throws IOException {
        this.in.close();
    }

    /**
     * Not supported.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void remove() {
        throw new NotImplementedException("Remove not implemented");
    }

    /**
     * @return this instance; the input can therefore be iterated only once
     */
    @Override
    public Iterator<List<CharBuffer>> iterator() {
        return this;
    }

    /**
     * Exposes this iterator as a {@link DataReader} of character blocks.
     *
     * <p>The {@code size} passed to {@code read} is used as the block size in characters, not
     * as a number of lines. Closing the reader closes this iterator.
     *
     * @return a reader backed by this iterator
     */
    public DataReader<CharBuffer> toCharBufferDataReader() {
        return new DataReader<CharBuffer>() {
            @Override
            public List<CharBuffer> read(int size) {
                return (hasNext() ? next(size) : Collections.<CharBuffer>emptyList());
            }

            @Override
            public boolean close() {
                return closeUnchecked();
            }
        };
    }

    /**
     * Exposes this iterator as a {@link DataReader} of single lines.
     *
     * <p>Here the {@code size} passed to {@code read} is the maximum number of lines per batch.
     * Closing the reader closes this iterator.
     *
     * @return a reader backed by this iterator
     */
    public DataReader<CharSequence> toLineDataReader() {
        return new DataReader<CharSequence>() {
            @Override
            public List<CharSequence> read(int size) {
                List<CharSequence> batch = new ArrayList<>(size);
                for (int i = 0; i < size && iter.hasNext(); i++) {
                    batch.add(iter.next());
                }
                return batch;
            }

            @Override
            public boolean close() {
                return closeUnchecked();
            }
        };
    }

    /** Closes this iterator, rethrowing an {@link IOException} as a {@link RuntimeException}. */
    private boolean closeUnchecked() {
        try {
            close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return true;
    }

    /** Closes {@code stream} on an error path, attaching a close failure to {@code failure}. */
    private static void closeAfterFailure(InputStream stream, Throwable failure) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException closeFailure) {
            // Keep the original failure as the primary exception.
            failure.addSuppressed(closeFailure);
        }
    }
}