/** * */ package de.uni_luebeck.inb.krabbenhoeft.eQTL.server.processors; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.mortbay.jetty.HttpException; import de.uni_luebeck.inb.krabbenhoeft.eQTL.entities.Category; import de.uni_luebeck.inb.krabbenhoeft.eQTL.entities.ColumnForDataSetLayer; import de.uni_luebeck.inb.krabbenhoeft.eQTL.entities.HajoEntity; import de.uni_luebeck.inb.krabbenhoeft.eQTL.entities.ColumnForDataSetLayer.ColumType; import de.uni_luebeck.inb.krabbenhoeft.eQTL.server.helpers.persistence.CreateAndModifyEntities; public class AnnotateGenesFromEnsemblBiomartProcessor extends BaseProcessorImplementation { @Override public int getPreferredNumberOfParallelRunningProcessors() { return 2; } public void addNewColumns(List<ColumnForDataSetLayer> columns) { columns.add(new ColumnForDataSetLayer("ensemblGeneId", ColumType.Name)); columns.add(new ColumnForDataSetLayer("ensemblGeneName", ColumType.Name)); columns.add(new ColumnForDataSetLayer("ensemblTranscriptId", ColumType.Name)); columns.add(new ColumnForDataSetLayer("geneChromosome", ColumType.Category)); final ColumnForDataSetLayer col = new ColumnForDataSetLayer("geneStartBP", ColumType.Location); col.setIndexme(true); col.setIndexChromosomeField("geneChromosome"); col.setIndexRangeEndField("geneEndBP"); columns.add(col); columns.add(new ColumnForDataSetLayer("geneEndBP", ColumType.Location)); } private static Map<String, String[]> accession2parts = Collections.synchronizedMap(new HashMap<String, String[]>()); public int doWork(CreateAndModifyEntities modifier, Iterator<HajoEntity> iter) { final List<HajoEntity> entities = new ArrayList<HajoEntity>(); final Set<String> geneBankDnaIdStrings = new HashSet<String>(); while (iter.hasNext()) { final HajoEntity source = iter.next(); geneBankDnaIdStrings.add(source.getName("geneBankDnaId")); entities.add(source); } StringBuilder request = new StringBuilder(); request.append("<Query virtualSchemaName = \"default\" formatter = \"TSV\" header = \"0\" uniqueRows = \"0\" count = \"\" datasetConfigVersion = \"0.6\" >" + "<Dataset name = \"mmusculus_gene_ensembl\" interface = \"default\" >" + "<Filter name = \"refseq_dna\" value = \""); boolean needToAsk = false; for (String string : geneBankDnaIdStrings) { string = accessionDropDot(string); if (accession2parts.containsKey(string)) continue; request.append(string); request.append(","); needToAsk = true; } request.append("\"/>" + "<Attribute name = \"refseq_dna\" />" + "<Attribute name = \"ensembl_gene_id\" />" + "<Attribute name = \"ensembl_transcript_id\" />" + "<Attribute name = \"external_gene_id\" />" + "<Attribute name = \"chromosome_name\" />" + "<Attribute name = \"start_position\" />" + "<Attribute name = \"end_position\" />" + "</Dataset></Query>"); if (needToAsk) { String response = null; Exception exception = null; for (int i = 0; i < 5; i++) { try { final String biomartUrl = "http://www.ensembl.org/biomart/martservice"; final String payload = "query=" + URLEncoder.encode(request.toString(), "UTF-8"); final HttpURLConnection connection = (HttpURLConnection) new URL(biomartUrl).openConnection(); connection.setDoOutput(true); final OutputStream outputStream = connection.getOutputStream(); outputStream.write(payload.getBytes()); outputStream.close(); if (connection.getResponseCode() != 200) throw new HttpException(connection.getResponseCode(), connection.getResponseMessage()); byte[] data = new byte[1024]; ByteArrayOutputStream baos = new ByteArrayOutputStream(); final InputStream inputStream = connection.getInputStream(); while (true) { int nr = inputStream.read(data); if (nr == -1) break; baos.write(data, 0, nr); } inputStream.close(); response = baos.toString(); break; } catch (MalformedURLException e) { exception = e; } catch (IOException e) { exception = e; } } if (response == null) throw new RuntimeException("Fetching results from ensembl failed!", exception); String[] lines = response.split("\n"); for (String line : lines) { String[] parts = line.split("\t"); accession2parts.put(parts[0], parts); } } int count = 0; for (HajoEntity target : entities) { String[] parts = accession2parts.get(accessionDropDot(target.getName("geneBankDnaId"))); if (parts != null) { target.setName("ensemblGeneId", parts[1]); target.setName("ensemblTranscriptId", parts[2]); target.setName("ensemblGeneName", parts[3]); target.setCategory("geneChromosome", Category.wrap(parts[4])); target.setLocation("geneStartBP", Integer.parseInt(parts[5])); target.setLocation("geneEndBP", Integer.parseInt(parts[6])); } else { target.setName("ensemblGeneId", "MISSING"); target.setName("ensemblTranscriptId", "MISSING"); target.setName("ensemblGeneName", "MISSING"); target.setCategory("geneChromosome", Category.wrap("MISSING")); target.setLocation("geneStartBP", -1); target.setLocation("geneEndBP", -1); } modifier.put(target); count++; } return count; } private String accessionDropDot(String string) { final int dotIndex = string.indexOf('.'); if (dotIndex != -1) string = string.substring(0, dotIndex); return string; } }