package de.uni_luebeck.inb.krabbenhoeft.eQTL.server; import java.net.URL; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.cassandra.service.Column; import org.apache.cassandra.service.ColumnOrSuperColumn; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import com.google.gwt.user.server.rpc.RemoteServiceServlet; import de.uni_luebeck.inb.krabbenhoeft.eQTL.api.gwt.DasTrackEntry; import de.uni_luebeck.inb.krabbenhoeft.eQTL.api.gwt.FetchDasTrack; import de.uni_luebeck.inb.krabbenhoeft.eQTL.server.helpers.persistence.CassandraSession; import de.uni_luebeck.inb.krabbenhoeft.eQTL.server.helpers.persistence.GeoBoxHelper; public class FetchDasTrackService extends RemoteServiceServlet implements FetchDasTrack { private static final long serialVersionUID = 1L; public String[] listAvailableTracks() { return new String[] { "karyotype", "reference", "transcript" }; } private static NodeList fetchNodes(String type, String segment) { Document doc; try { URL url = new URL("http://www.ensembl.org/das/Mus_musculus.NCBIM37." + type + "/features?segment=" + segment); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); doc = db.parse(url.toString()); doc.getDocumentElement().normalize(); } catch (Exception e) { throw new RuntimeException(e); } return doc.getElementsByTagName("FEATURE"); } public List<DasTrackEntry> getCompleteTrack(String type, String chromosome) { ArrayList<DasTrackEntry> list = new ArrayList<DasTrackEntry>(); NodeList nodeLst = fetchNodes(type, chromosome); for (int s = 0; s < nodeLst.getLength(); s++) { Node featureNode = nodeLst.item(s); Element featureElement = (Element) featureNode; DasTrackEntry band = new DasTrackEntry(); band.label = featureElement.getAttribute("id"); band.from = Long.parseLong(featureElement.getElementsByTagName("START").item(0).getTextContent()); band.to = Long.parseLong(featureElement.getElementsByTagName("END").item(0).getTextContent()); if (type.equals("karyotype")) { band.type = 1; if (featureElement.getElementsByTagName("TYPE").item(0).getTextContent().contains("gvar")) band.type = 2; if (featureElement.getElementsByTagName("TYPE").item(0).getTextContent().contains("gpos")) band.type = 3; } else if (type.equals("reference")) { if (featureElement.getElementsByTagName("TYPE").item(0).getTextContent().equals("chromosome")) continue; band.type = s % 2; } else if (type.equals("transcript")) { band.label = ((Element) featureElement.getElementsByTagName("GROUP").item(0)).getAttribute("label"); band.type = featureElement.getElementsByTagName("ORIENTATION").item(0).getTextContent().contains("+") ? 0 : 1; } list.add(band); } return list; } // NOTE: ensure ordering based on from field public DasTrackEntry[] getTrackForSegment(String trackName, String chromosome, long fromBP, long toBP) { int shiftToUse; for (shiftToUse = GeoBoxHelper.minShift; shiftToUse <= GeoBoxHelper.maxShift; shiftToUse++) { final long boxSize = GeoBoxHelper.getSizeForBox(shiftToUse); if (boxSize >= toBP - fromBP) break; } long boxToUse = GeoBoxHelper.getBoxForValue(shiftToUse, fromBP); if (shiftToUse > GeoBoxHelper.maxShift) boxToUse = 0; CassandraSession cassandra = new CassandraSession(); final String rowKey = trackName + "#" + chromosome + "#" + shiftToUse + "#" + boxToUse; List<ColumnOrSuperColumn> completeRow = cassandra.getCompleteRow("dastracks", rowKey); if (completeRow.size() == 0) { boolean modified = cacheDasTrackForChromosome(cassandra, trackName, chromosome); if (modified) completeRow = cassandra.getCompleteRow("dastracks", rowKey); } final List<DasTrackEntry> completeTrack = new ArrayList<DasTrackEntry>(); for (ColumnOrSuperColumn columnOrSuperColumn : completeRow) { final ByteBuffer bb = ByteBuffer.wrap(columnOrSuperColumn.column.value); int strlen = bb.limit() - 4 - 8 - 8; DasTrackEntry entry = new DasTrackEntry(); byte[] label = new byte[strlen]; bb.get(label); entry.label = new String(label, CassandraSession.charset); entry.type = bb.getInt(); entry.from = bb.getLong(); entry.to = bb.getLong(); completeTrack.add(entry); } cassandra.close(); return completeTrack.toArray(new DasTrackEntry[0]); } private boolean cacheDasTrackForChromosome(CassandraSession cassandra, String trackName, String chromosome) { final String trackNameStateRowKey = trackName + "#" + chromosome + "#state"; final Column statusColumn = new Column("state".getBytes(CassandraSession.charset), "preparing".getBytes(CassandraSession.charset), CassandraSession.ts()); // read status column, to check if track is done ColumnOrSuperColumn statusColRead = cassandra.getColumn("dastracks", trackNameStateRowKey, statusColumn.name); if (statusColRead != null) { final String stateString = new String(statusColRead.column.value, CassandraSession.charset); if (stateString.equals("done")) return false; } else { // since there is no column status // store that we would like to prepare it cassandra.addToStoreQueue(trackNameStateRowKey, "dastracks", new ColumnOrSuperColumn(statusColumn, null)); cassandra.flush(); } // wait a second to allow for multiple threads to offer preparing try { Thread.sleep(1000); } catch (InterruptedException e) { } // re-read the column status statusColRead = cassandra.getColumn("dastracks", trackNameStateRowKey, statusColumn.name); String stateString = new String(statusColRead.column.value, CassandraSession.charset); // column is done, we're done if (stateString.equals("done")) return true; // column is in preparing, but not by us if (stateString.equals("preparing") && statusColRead.column.timestamp != statusColumn.timestamp) { while (stateString.equals("preparing")) { try { Thread.sleep(1000); } catch (InterruptedException e) { } statusColRead = cassandra.getColumn("dastracks", trackNameStateRowKey, statusColumn.name); stateString = new String(statusColRead.column.value, CassandraSession.charset); } return true; } try { // prepare final List<DasTrackEntry> completeTrack = getCompleteTrack(trackName, chromosome); int counter = 0; for (DasTrackEntry entry : completeTrack) { final byte[] name = ByteBuffer.allocate(4).putInt(counter++).array(); final byte[] label = entry.label.getBytes(CassandraSession.charset); final byte[] value = ByteBuffer.allocate(label.length + 4 + 8 + 8).put(label).putInt(entry.type).putLong(entry.from).putLong(entry.to).array(); for (int shift = GeoBoxHelper.minShift; shift <= GeoBoxHelper.maxShift; shift++) { final long boxFrom = GeoBoxHelper.getBoxForValue(shift, entry.from); final long boxTo = GeoBoxHelper.getBoxForValue(shift, entry.to); for (long box = boxFrom; box <= boxTo; box++) { final String geobox = trackName + "#" + chromosome + "#" + shift + "#" + box; cassandra.addToStoreQueue(geobox, "dastracks", new ColumnOrSuperColumn(new Column(name, value, CassandraSession.ts()), null)); } } final String overflowGeobox = trackName + "#" + chromosome + "#" + (GeoBoxHelper.maxShift + 1) + "#0"; cassandra.addToStoreQueue(overflowGeobox, "dastracks", new ColumnOrSuperColumn(new Column(name, value, CassandraSession.ts()), null)); } } catch (Throwable e) { } // mark as done, even if we encountered an error statusColumn.setValue("done".getBytes(CassandraSession.charset)); statusColumn.setTimestamp(CassandraSession.ts()); cassandra.addToStoreQueue(trackNameStateRowKey, "dastracks", new ColumnOrSuperColumn(statusColumn, null)); cassandra.flush(); return true; } }