package com.illumina.basespace.igv.wiggle;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import org.apache.commons.math.stat.StatUtils;
import org.broad.igv.Globals;
import org.broad.igv.data.WiggleDataset;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.collections.DownsampledDoubleArrayList;
import org.broad.igv.util.collections.FloatArrayList;
import org.broad.igv.util.collections.IntArrayList;
import org.broad.tribble.readers.AsciiLineReader;
import com.illumina.basespace.ApiClient;
import com.illumina.basespace.igv.BaseSpaceMain;
import com.illumina.basespace.igv.BaseSpaceResourceLocator;
public class BaseSpaceWiggleParser
{
private static final Logger log = Logger.getLogger(BaseSpaceWiggleParser.class.getPackage().getName());
private int chrColumn = 0;
private int startColumn = 1;
private int endColumn = 2;
private int dataColumn = 3;
private enum Type
{
FIXED, VARIABLE, BED_GRAPH, CPG, EXPR
}
Genome genome;
WiggleDataset dataset;
/**
* The type of wiggle locator (see UCSC documentation).
*/
private Type type = Type.BED_GRAPH;
// State variables. This is a serial type parser, these variables are used
// to hold temporary
// state.
private String chr;
String lastChr = "";
int lastPosition = 0;
private int start;
private int step = 1;
private int windowSpan = 1;
private int startBase = 1; // <- set to zero for zero based coordinates
IntArrayList startLocations = null;
IntArrayList endLocations = null;
FloatArrayList data = null;
BaseSpaceResourceLocator resourceLocator;
Set<String> unsortedChromosomes;
int estArraySize;
Map<String, Integer> longestFeatureMap = new HashMap<String, Integer>();
// Used to estimate percentiles
private static final int maxSamples = 1000;
DownsampledDoubleArrayList sampledData = new DownsampledDoubleArrayList(maxSamples, maxSamples);
public BaseSpaceWiggleParser(BaseSpaceResourceLocator locator, Genome genome)
{
this.genome = genome;
this.resourceLocator = locator;
this.estArraySize = estArraySize(locator, genome);
dataset = new WiggleDataset(genome, locator.getTrackName());
if (locator.getPath().endsWith("CpG.txt"))
{
type = Type.CPG;
}
else if (locator.getPath().toLowerCase().endsWith(".expr"))
{
// gene_id bundle_id chr left right FPKM FPKM_conf_lo FPKM_conf_hi
type = Type.EXPR;
chrColumn = 2;
startColumn = 3;
endColumn = 4;
dataColumn = 5;
startBase = 1;
dataset.setType(TrackType.EXPR);
}
}
private int estArraySize(ResourceLocator locator, Genome genome)
{
int estLines = 100000;
if (locator.getServerURL() == null)
{
estLines = ParsingUtils.estimateLineCount(locator.getPath());
}
int nChromosomes = genome == null ? 24 : genome.getAllChromosomeNames().size();
return Math.max(1000, (int) (estLines / nChromosomes));
}
/**
* Utility method. Returns true if this looks like a wiggle locator. The
* criteria is to scan the first 100 lines looking for a valid "track" line.
* According to UCSC documentation track lines must contain a type
* attribute, which must be equal to "wiggle_0".
*
* @param file
* @return
*/
public static boolean isWiggle(ResourceLocator file)
{
if (file.getPath().endsWith("CpG.txt") || file.getPath().endsWith(".expr") || file.getPath().endsWith(".wig"))
{
return true;
}
else
{
return false;
}
}
protected AsciiLineReader openAsciiReader(BaseSpaceResourceLocator locator) throws IOException
{
try
{
ApiClient client = BaseSpaceMain.instance().getApiClient(locator.getClientId());
return new AsciiLineReader(new GZIPInputStream(client.getFileInputStream(locator.getFile())));
}
catch(Throwable t)
{
throw new IOException("Error opening Ascii Reader",t);
}
}
public WiggleDataset parse()
{
lastPosition = -1;
unsortedChromosomes = new HashSet<String>();
AsciiLineReader reader = null;
String nextLine = null;
int lineNumber = 0;
try
{
reader = openAsciiReader(resourceLocator);
if (type == Type.EXPR)
{
reader.readLine(); // Skip header line
}
int position = -1;
while ((nextLine = reader.readLine()) != null)
{
lineNumber++;
if (nextLine.startsWith("#") || nextLine.startsWith("data") || nextLine.startsWith("browser")
|| nextLine.trim().length() == 0)
{
continue;
// Skip
}
if (nextLine.startsWith("track") && type != Type.CPG)
{
type = Type.BED_GRAPH;
ParsingUtils.parseTrackLine(nextLine, dataset.getTrackProperties());
if (dataset.getTrackProperties().getBaseCoord() == TrackProperties.BaseCoord.ZERO)
{
this.startBase = 0;
}
}
else if (nextLine.startsWith("fixedStep"))
{
type = Type.FIXED;
parseStepLine(nextLine);
position = start;
if (start < lastPosition)
{
unsortedChromosomes.add(chr);
}
}
else if (nextLine.startsWith("variableStep"))
{
type = Type.VARIABLE;
parseStepLine(nextLine);
if (start < lastPosition)
{
unsortedChromosomes.add(chr);
}
}
else
{
// Must be data
String[] tokens = Globals.singleTabMultiSpacePattern.split(nextLine);
int nTokens = tokens.length;
if (nTokens == 0)
{
continue;
}
try
{
if (type.equals(Type.CPG))
{
if (nTokens > 3)
{
chr = tokens[1].trim();
if (!chr.equals(lastChr))
{
changedChromosome(dataset, lastChr);
}
lastChr = chr;
int endPosition = -1;
try
{
endPosition = Integer.parseInt(tokens[2].trim());
}
catch (NumberFormatException numberFormatException)
{
log.severe("Column 2 is not a number");
throw new ParserException("Column 2 must be numeric." + " Found: " + tokens[1],
lineNumber, nextLine);
}
int startPosition = endPosition - 1;
if (startPosition < lastPosition)
{
unsortedChromosomes.add(chr);
}
lastPosition = startPosition;
startLocations.add(startPosition);
endLocations.add(endPosition);
float value = Float.parseFloat(tokens[4].trim());
if (tokens[3].trim().equals("R"))
{
value = -value;
}
data.add(value);
}
}
else if (type.equals(Type.BED_GRAPH) || type.equals(Type.EXPR))
{
if (nTokens > 3)
{
chr = tokens[chrColumn].trim();
if (!chr.equals(lastChr))
{
changedChromosome(dataset, lastChr);
}
lastChr = chr;
int startPosition = -1;
try
{
startPosition = Integer.parseInt(tokens[startColumn].trim());
}
catch (NumberFormatException numberFormatException)
{
log.severe("Column " + (startColumn + 1) + " is not a number");
throw new ParserException("Column (startColumn + 1) must be numeric." + " Found: "
+ tokens[startColumn], lineNumber, nextLine);
}
if (startPosition < lastPosition)
{
unsortedChromosomes.add(chr);
}
lastPosition = startPosition;
startLocations.add(startPosition);
try
{
int endPosition = Integer.parseInt(tokens[endColumn].trim());
endLocations.add(endPosition);
int length = endPosition - startPosition;
updateLongestFeature(length);
}
catch (NumberFormatException numberFormatException)
{
log.severe("Column " + (endColumn + 1) + " is not a number");
throw new ParserException("Column " + (endColumn + 1) + " must be numeric."
+ " Found: " + tokens[endColumn], lineNumber, nextLine);
}
data.add(Float.parseFloat(tokens[dataColumn].trim()));
}
}
else if (type.equals(Type.VARIABLE))
{
if (nTokens > 1)
{
// Per UCSC specification variable and fixed
// step coordinates are "1" based.
// We need to subtract 1 to convert to the
// internal "zero" based coordinates.
int startPosition = Integer.parseInt(tokens[0]) - 1;
if (startPosition < lastPosition)
{
unsortedChromosomes.add(chr);
}
lastPosition = startPosition;
int end = startPosition + windowSpan;
startLocations.add(startPosition);
endLocations.add(end);
data.add(Float.parseFloat(tokens[1]));
}
}
else
{ // Fixed step -- sorting is checked when step line is
// parsed
if (position >= 0)
{
startLocations.add(position);
endLocations.add(position + windowSpan);
data.add(Float.parseFloat(tokens[0]));
}
position += step;
lastPosition = position;
}
}
catch (NumberFormatException e)
{
log.severe(e.toString());
throw new ParserException(e.getMessage(), lineNumber, nextLine);
}
}
}
// The last chromosome
changedChromosome(dataset, lastChr);
}
catch (ParserException pe)
{
throw (pe);
}
catch (Exception e)
{
if (nextLine != null && lineNumber != 0)
{
throw new ParserException(e.getMessage(), e, lineNumber, nextLine);
}
else
{
throw new RuntimeException(e);
}
}
finally
{
if (reader != null)
{
reader.close();
}
}
dataset.sort(unsortedChromosomes);
dataset.setLongestFeatureMap(longestFeatureMap);
double[] sd = sampledData.toArray();
double percent10 = StatUtils.percentile(sd, 10.0);
double percent90 = StatUtils.percentile(sd, 90.0);
dataset.setPercent10((float) percent10);
dataset.setPercent90((float) percent90);
return dataset;
}
private void updateLongestFeature(int length)
{
if (longestFeatureMap.containsKey(chr))
{
longestFeatureMap.put(chr, Math.max(longestFeatureMap.get(chr), length));
}
else
{
longestFeatureMap.put(chr, length);
}
}
// fixedStep chrom=chrM strt=1 step=1
private void parseStepLine(String header)
{
String[] tokens = header.split("\\s+");
for (String token : tokens)
{
String[] keyValue = token.split("=");
if (keyValue.length >= 2)
{
if (keyValue[0].equalsIgnoreCase("chrom"))
{
chr = keyValue[1];
if (!chr.equals(lastChr))
{
changedChromosome(dataset, lastChr);
}
lastChr = chr;
}
else if (keyValue[0].equalsIgnoreCase("start"))
{
// Per UCSC specification variable and fixed step
// coordinates are "1" based.
// We need to subtract 1 to convert to the internal "zero"
// based coordinates.
start = Integer.parseInt(keyValue[1]) - startBase;
if (start < lastPosition)
{
unsortedChromosomes.add(chr);
}
}
else if (keyValue[0].equalsIgnoreCase("step"))
{
step = Integer.parseInt(keyValue[1]);
}
else if (keyValue[0].equalsIgnoreCase("span"))
{
windowSpan = Integer.parseInt(keyValue[1]);
updateLongestFeature(windowSpan);
}
}
}
}
private void changedChromosome(WiggleDataset dataset, String lastChr)
{
if (startLocations != null && startLocations.size() > 0)
{
String convertedChr = genome == null ? lastChr : genome.getChromosomeAlias(lastChr);
dataset.addDataChunk(convertedChr, startLocations, endLocations, data);
// sz = startLocations.size();
float[] f = data.toArray();
for (int i = 0; i < f.length; i++)
{
sampledData.add(f[i]);
}
}
startLocations = new IntArrayList(estArraySize);
endLocations = new IntArrayList(estArraySize);
data = new FloatArrayList(estArraySize);
lastPosition = -1;
}
}