package com.github.lindenb.jvarkit.tools.vcfbiomart;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalTreeMap;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.tribble.readers.AsciiLineReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.LineIteratorImpl;
import htsjdk.tribble.readers.LineReader;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
/**
 * Annotates the variants of a VCF with the rows returned by a Biomart query.
 *
 * <p>The user supplies an XML Biomart query template ({@code -X}). The template must have a
 * {@code <Query>} root containing one {@code <Dataset>} with at least two {@code <Attribute>}
 * elements. Variants are read in batches of {@code -n}; for each batch the
 * {@code chromosomal_region} filter of the template is filled with the batch regions, the XML is
 * POSTed to the service URL ({@code -u}) and the tab-delimited answer is indexed by interval.
 * Every variant overlapping at least one returned row gets the INFO tag {@code -T}: a flag when
 * no attribute is visible, otherwise the list of the '|'-joined visible columns.
 */
@Program(name="vcfbiomart",description="BiomartQueries with VCF")
public class VcfBiomart extends Launcher
{
    private static final Logger LOG = Logger.build(VcfBiomart.class).make();
    /** Splits one line of the TSV response into columns. */
    private static final Pattern TAB = Pattern.compile("[\t]");
    /** Charset used for the request body and advertised in the request headers. */
    private static final String ENCODING = "UTF-8";

    @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
    private File outputFile = null;
    @Parameter(names="-X",description=" (XML-file) XML biomart template.")
    String xmlTemplate=null;
    @Parameter(names="-n",description=" (int) batch size.);")
    private int batchSize=200;
    @Parameter(names="-T",description=" (string) VCF output tag.")
    private String TAG="BIOMART";
    @Parameter(names="-C",description=" (int) column index (1-based) for chromosome . Optional")
    private int chromColumn1=-1;
    @Parameter(names="-S",description=" (int) column index (1-based) for start . Optional")
    private int startColumn1=-1;
    @Parameter(names="-E",description=" (int) column index (1-based) for end . Optional")
    private int endColumn1=-1;
    @Parameter(names="-u",description=" (url) biomart service url")
    private String serviceUrl="http://www.biomart.org/biomart/martservice/result";

    /** The {@code <Attribute>} elements of the dataset, in document order (= response column order). */
    private List<Element> attributes=new ArrayList<Element>();
    /** Parsed XML template; re-serialized for every batch. */
    private Document dom=null;
    /** Root {@code <Query>} element of {@link #dom}. */
    private Element queryElement=null;
    /** The unique {@code <Dataset>} element of the template. */
    private Element dataSetElement=null;
    /** The {@code <Filter name="chromosomal_region">} element whose {@code value} is set per batch. */
    private Element filterElementChromosomalLocation=null;
    /** 0-based indexes in {@link #attributes} of the columns copied into the INFO value. */
    private Set<Integer> visibleIndexes0=new HashSet<Integer>();

    /**
     * Makes a Biomart column value safe to embed in a VCF INFO field.
     *
     * <p>Spaces, tabs, '=', ';' and ',' are replaced because they delimit INFO fields or list
     * items; '|' is replaced because this tool uses it to separate the columns. Runs of
     * underscores are collapsed to a single one.
     *
     * @param s raw column value, may be null
     * @return the escaped value, or an empty string for a null/empty input
     */
    private String escapeInfo(String s)
    {
        if(s==null || s.isEmpty()) return "";
        return s.replaceAll("[ =;,|\t]","_").replaceAll("[_]+", "_");
    }

    /**
     * Builds the description of the INFO header line; when some columns are visible it lists
     * their names in the same iteration order later used to build the values.
     */
    private String buildInfoDescription()
    {
        final StringBuilder desc=new StringBuilder("Biomart query.");
        if(!this.visibleIndexes0.isEmpty())
        {
            desc.append(" Format:");
            boolean first=true;
            for(final Integer col:this.visibleIndexes0)
            {
                if(!first) desc.append("|");
                first=false;
                desc.append(this.attributes.get(col).getAttribute("name"));
            }
        }
        return desc.toString();
    }

    /**
     * Copies the input VCF to {@code out}, annotating the variants batch by batch.
     *
     * @param inputName name of the input (unused here)
     * @param in source of variants
     * @param out destination writer
     * @return 0 on success, -1 if any error occurred (the error is logged)
     */
    @Override
    protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
        try {
            final Transformer transformer=TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");

            final VCFHeader header=in.getHeader();
            final String description=buildInfoDescription();
            header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName()+"CmdLine",String.valueOf(getProgramCommandLine())));
            header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName()+"Version",String.valueOf(getVersion())));
            if(this.visibleIndexes0.isEmpty())
            {
                // nothing to print: the tag is only a presence flag
                header.addMetaDataLine(new VCFInfoHeaderLine(TAG,
                        0,
                        VCFHeaderLineType.Flag,
                        description
                        ));
            }
            else
            {
                header.addMetaDataLine(new VCFInfoHeaderLine(TAG,
                        VCFHeaderLineCount.UNBOUNDED,
                        VCFHeaderLineType.String,
                        description
                        ));
            }
            out.writeHeader(header);

            final List<VariantContext> buffer=new ArrayList<VariantContext>(Math.max(1,this.batchSize));
            while(in.hasNext())
            {
                buffer.add(in.next());
                if(buffer.size()>=this.batchSize)
                {
                    annotateBatch(buffer,transformer,out);
                    buffer.clear();
                }
            }
            // last, possibly incomplete, batch
            if(!buffer.isEmpty())
            {
                annotateBatch(buffer,transformer,out);
            }
            return 0;
        }
        catch(Exception err) {
            LOG.error(err);
            return -1;
        }
    }

    /**
     * Queries Biomart for the regions of one batch and writes the (possibly annotated) variants.
     *
     * @param buffer non-empty batch of variants, written to {@code out} in the same order
     * @param transformer serializer used to turn the XML template into the query string
     * @param out destination writer
     * @throws IOException if the query cannot be serialized, sent or parsed
     */
    private void annotateBatch(
            final List<VariantContext> buffer,
            final Transformer transformer,
            final VariantContextWriter out
            ) throws IOException
    {
        // distinct regions of the batch, formatted as contig:start:end:1 and comma-joined
        final Set<String> locations=new HashSet<String>();
        for(final VariantContext ctx:buffer)
        {
            locations.add(ctx.getContig()+":"+ctx.getStart()+":"+ctx.getEnd()+":1");
        }
        final StringBuilder regions=new StringBuilder();
        for(final String loc:locations)
        {
            if(regions.length()!=0) regions.append(",");
            regions.append(loc);
        }
        this.filterElementChromosomalLocation.setAttribute("value",regions.toString());

        final StringWriter xmlToSend=new StringWriter();
        try
        {
            transformer.transform(new DOMSource(this.dom), new StreamResult(xmlToSend));
        }
        catch (Exception e)
        {
            LOG.error(e);
            throw new IOException(e);
        }

        LOG.info("POSTing to "+this.serviceUrl+" buffer.size="+buffer.size());
        final IntervalTreeMap<Set<String>> treemap=fetchAnnotations(xmlToSend.toString());

        for(final VariantContext ctx:buffer)
        {
            final Set<String> hits=new HashSet<String>();
            for(final Set<String> values:treemap.getOverlapping(new Interval(ctx.getContig(),ctx.getStart(),ctx.getEnd())))
            {
                hits.addAll(values);
            }
            if(hits.isEmpty())
            {
                out.add(ctx);
                continue;
            }
            final VariantContextBuilder vcb=new VariantContextBuilder(ctx);
            if(this.visibleIndexes0.isEmpty())//just a TAG
            {
                vcb.attribute(TAG, true);
            }
            else
            {
                vcb.attribute(TAG, new ArrayList<String>(hits));
            }
            out.add(vcb.make());
        }
    }

    /**
     * POSTs the XML query to the service and indexes the returned rows by genomic interval.
     *
     * <p>Several rows may share exactly the same interval, so each interval maps to the set of
     * all its distinct values instead of a single value that later rows would overwrite.
     *
     * @param xml serialized Biomart query
     * @return the rows indexed by interval; rows whose visible columns are all blank are dropped
     * @throws IOException on network failure or if a response line cannot be interpreted
     */
    private IntervalTreeMap<Set<String>> fetchAnnotations(final String xml) throws IOException
    {
        final URLConnection connection = new URL(this.serviceUrl).openConnection();
        connection.setDoOutput(true);
        connection.setRequestProperty("Accept-Charset",ENCODING);
        connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=" +ENCODING);
        if(connection instanceof HttpURLConnection )
        {
            final HttpURLConnection httpConnection = (HttpURLConnection)connection;
            httpConnection.setRequestMethod("POST");
            httpConnection.setInstanceFollowRedirects(true);
        }
        final String q="query="+URLEncoder.encode(xml,ENCODING);
        LOG.debug(q);
        OutputStream output = null;
        try {
            output = connection.getOutputStream();
            output.write(q.getBytes(ENCODING));
            output.flush();
        }
        finally
        {
            CloserUtil.close(output);
        }

        // a usable line must contain at least the chrom, start and end columns
        final int requiredColumns=Math.max(this.chromColumn1,Math.max(this.startColumn1,this.endColumn1));
        final IntervalTreeMap<Set<String>> treemap=new IntervalTreeMap<Set<String>>();
        InputStream response = null;
        LineReader r = null;
        try
        {
            response = connection.getInputStream();
            r = new AsciiLineReader(response);
            @SuppressWarnings("resource")
            final LineIterator li=new LineIteratorImpl(r);
            while(li.hasNext())
            {
                final String line=li.next();
                if(line.trim().isEmpty()) continue;
                final String tokens[]=TAB.split(line);
                LOG.debug(line+" L="+tokens.length );
                if(tokens.length<requiredColumns)
                {
                    // typically an error message sent by the server instead of a TSV row
                    throw new IOException("Expected at least "+requiredColumns+" columns in biomart response but got "+tokens.length+" in: "+line);
                }
                final Interval interval;
                try
                {
                    interval=new Interval(
                            tokens[this.chromColumn1-1],
                            Integer.parseInt(tokens[this.startColumn1-1]),
                            Integer.parseInt(tokens[this.endColumn1-1])
                            );
                }
                catch(NumberFormatException err)
                {
                    throw new IOException("Bad start/end position in biomart response: "+line,err);
                }
                boolean foundSomething=false;
                final StringBuilder content=new StringBuilder();
                for(final Integer col: this.visibleIndexes0)
                {
                    if(content.length()!=0) content.append("|");
                    // split() drops trailing empty columns
                    final String s2=col>=tokens.length?"":tokens[col];
                    if(!s2.trim().isEmpty()) foundSomething=true;
                    content.append(escapeInfo(s2));
                }
                if(foundSomething || this.visibleIndexes0.isEmpty())
                {
                    Set<String> values=treemap.get(interval);
                    if(values==null)
                    {
                        values=new HashSet<String>();
                        treemap.put(interval,values);
                    }
                    values.add(content.toString());
                }
            }
        }
        finally
        {
            CloserUtil.close(r);
            CloserUtil.close(response);
        }
        return treemap;
    }

    /**
     * Tells whether a Biomart attribute name is one of the well-known names for the given role.
     *
     * @param tag one of "chrom", "start" or "end"
     * @param attributeName value of the {@code @name} attribute
     */
    private static boolean isDefaultColumnName(final String tag,final String attributeName)
    {
        if(tag.equals("chrom"))
        {
            return attributeName.equals("chromosome_name") || attributeName.equals("chrom_name");
        }
        if(tag.equals("start"))
        {
            return attributeName.equals("start_position");
        }
        if(tag.equals("end"))
        {
            return attributeName.equals("end_position");
        }
        return false;
    }

    /**
     * Finds the 1-based column of the attribute playing the role {@code tag}.
     *
     * <p>An attribute explicitly flagged in the XML with {@code @tag="true"} wins; otherwise the
     * first attribute carrying a well-known name for that role is used.
     *
     * @param tag one of "chrom", "start" or "end"
     * @return the 1-based column, or -1 if none was found or two attributes are flagged
     * @throws IllegalStateException if an inspected attribute has no {@code @name}
     */
    private int findColumn1(String tag)
    {
        int column=-1;
        // 1st pass: explicit flag in the template
        for(int i=0;i< attributes.size();++i)
        {
            final Attr att=attributes.get(i).getAttributeNode(tag);
            if(att==null || !att.getValue().equals("true")) continue;
            LOG.info("Attribute @"+tag+" was specified in the XML");
            if(column!=-1)
            {
                LOG.error("XML: Two @"+tag+"=true ?");
                return -1;
            }
            column=(i+1);
        }
        // 2nd pass: guess from the attribute name
        for(int i=0; column==-1 && i< attributes.size();++i)
        {
            final Attr att=attributes.get(i).getAttributeNode("name");
            if(att==null) throw new IllegalStateException("@name ?");
            if(isDefaultColumnName(tag,att.getValue()))
            {
                column=(i+1);
            }
        }
        if(column<1 || column> this.attributes.size())
        {
            LOG.error("Cannot use column for \""+tag+"\".");
            return -1;
        }
        return column;
    }

    /**
     * Collects the {@code <Attribute>} children of the dataset and locates (or creates) its
     * {@code chromosomal_region} filter.
     *
     * @param dataset the {@code <Dataset>} element
     * @return false if the dataset is invalid (the reason is logged)
     */
    private boolean scanDataset(final Element dataset)
    {
        for(Node c2=dataset.getFirstChild();c2!=null;c2=c2.getNextSibling())
        {
            if(c2.getNodeType()!=Node.ELEMENT_NODE) continue;
            final Element e2=Element.class.cast(c2);
            if(e2.getNodeName().equals("Attribute"))
            {
                if(e2.getAttributeNode("name")==null)
                {
                    LOG.error("XML: Attribute without @name ?");
                    return false;
                }
                this.attributes.add(e2);
            }
            else if(e2.getNodeName().equals("Filter"))
            {
                final Attr att=e2.getAttributeNode("name");
                if(att!=null && att.getValue().equals("chromosomal_region"))
                {
                    if(this.filterElementChromosomalLocation!=null)
                    {
                        LOG.error("XML: two Filter@chromosomal_region ?");
                        return false;
                    }
                    this.filterElementChromosomalLocation=e2;
                }
            }
        }
        if(this.filterElementChromosomalLocation==null)
        {
            // the template has no region filter: add one as first child of the dataset
            this.filterElementChromosomalLocation=this.dom.createElement("Filter");
            this.filterElementChromosomalLocation.setAttribute("name","chromosomal_region");
            dataset.insertBefore(
                    this.filterElementChromosomalLocation,
                    dataset.getFirstChild()
                    );
        }
        return true;
    }

    /**
     * Resolves and validates the chrom/start/end columns, guessing from the template those not
     * given on the command line. A missing END column falls back to the START column.
     *
     * @return false if a column cannot be determined or is out of range (the reason is logged)
     */
    private boolean resolveColumns()
    {
        if(this.chromColumn1==-1)
        {
            this.chromColumn1=findColumn1("chrom");
            if(this.chromColumn1<1) return false;
        }
        if(this.chromColumn1< 1 || this.chromColumn1> attributes.size())
        {
            LOG.error("CHROM column index out of range");
            return false;
        }
        if(this.startColumn1==-1)
        {
            this.startColumn1=findColumn1("start");
            if(this.startColumn1<1) return false;
        }
        if(this.startColumn1< 1 || this.startColumn1> attributes.size())
        {
            LOG.error("START column index out of range");
            return false;
        }
        if(this.endColumn1==-1)
        {
            this.endColumn1=findColumn1("end");
            if(this.endColumn1<1)
            {
                LOG.info("No END column found: using the START column as END");
                this.endColumn1=this.startColumn1;
            }
        }
        if(this.endColumn1< 1 || this.endColumn1> attributes.size())
        {
            LOG.error("END column index out of range");
            return false;
        }
        return true;
    }

    /**
     * Loads and validates the XML template, resolves the columns, then runs the VCF annotation.
     *
     * @param args remaining command-line arguments (the input VCF, if any)
     * @return 0 on success, -1 on error (the reason is logged)
     */
    @Override
    public int doWork(List<String> args) {
        if(xmlTemplate==null)
        {
            LOG.error("Undefined XML template");
            return -1;
        }
        if(this.batchSize<1)
        {
            LOG.error("Bad batch size (-n): "+this.batchSize);
            return -1;
        }
        try
        {
            final DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
            f.setCoalescing(true);
            f.setNamespaceAware(false);
            f.setValidating(false);
            f.setExpandEntityReferences(true);
            f.setIgnoringComments(false);
            f.setIgnoringElementContentWhitespace(true);
            final DocumentBuilder docBuilder= f.newDocumentBuilder();
            LOG.info("Parsing xml "+xmlTemplate);
            InputStream in=null;
            try
            {
                in=IOUtils.openURIForReading(xmlTemplate);
                this.dom=docBuilder.parse(in);
            }
            finally
            {
                CloserUtil.close(in);
            }
            this.queryElement=this.dom.getDocumentElement();
            if(this.queryElement==null)
            {
                LOG.error("XML root is not <Query/> : no root element");
                return -1;
            }
            if(!this.queryElement.getNodeName().equals("Query"))
            {
                LOG.error("XML root is not <Query/> but "+this.queryElement.getNodeName());
                return -1;
            }
            // force a headerless TSV answer with unique rows, whatever the template says
            this.queryElement.setAttribute("formatter", "TSV");
            this.queryElement.setAttribute("header", "0");
            this.queryElement.setAttribute("uniqueRows", "1");
            this.queryElement.setAttribute("count", "");
            for(Node c1=this.queryElement.getFirstChild();c1!=null;c1=c1.getNextSibling())
            {
                if(c1.getNodeType()!=Node.ELEMENT_NODE) continue;
                if(!c1.getNodeName().equals("Dataset")) continue;
                if(this.dataSetElement!=null)
                {
                    LOG.error("XML: Two <DataSet> elements ?");
                    return -1;
                }
                this.dataSetElement=Element.class.cast(c1);
                if(!scanDataset(this.dataSetElement)) return -1;
            }
            if(this.dataSetElement==null)
            {
                LOG.error("DataSet element missing");
                return -1;
            }
            if(this.attributes.size()<2)
            {
                LOG.error("Expected at least two elementss <Attribute>");
                return -1;
            }
            if(!resolveColumns()) return -1;
        }
        catch(Exception err)
        {
            LOG.error(err);
            return -1;
        }
        // every attribute is printed unless flagged visible="false" in the template
        for(int i=0;i< this.attributes.size();++i)
        {
            final Attr att=this.attributes.get(i).getAttributeNode("visible");
            if(att!=null && att.getValue().equals("false")) continue;
            this.visibleIndexes0.add(i);
        }
        return doVcfToVcf(args, outputFile);
    }

    /**
     * Program entry point.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args)
    {
        new VcfBiomart().instanceMainWithExit(args);
    }
}