/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.tools.converters;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
import org.apache.log4j.Logger;
import org.broad.igv.data.expression.ExpressionFileParser;
import org.broad.igv.data.expression.GeneToLocusHelper;
import org.broad.igv.feature.Locus;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.tools.sort.SortableRecord;
import org.broad.igv.tools.sort.SortableRecordCodec;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import java.io.*;
import java.util.Comparator;
import java.util.List;
/**
* @author jrobinso
* @date Oct 9, 2010
*/
public class GCTtoIGVConverter {

    private static final Logger log = Logger.getLogger(GCTtoIGVConverter.class);

    /**
     * Parse an expression file (e.g. GCT) and write it out, sorted by locus, in ".igv" format.
     * <p>
     * Each data row is mapped to zero or more loci through a {@link GeneToLocusHelper}. Rows with no
     * locus are logged and skipped; rows mapping to several loci produce one output line per locus.
     * Records are collected in a {@link SortingCollection} and written once all input has been read.
     *
     * @param resourceLocator locator of the input file; its path is also used to guess the track type
     * @param outputFile      file that receives the ".igv" output
     * @param probeResource   probe mapping resource handed to {@link GeneToLocusHelper}
     * @param maxRecords      maximum number of records passed to the sorting collection
     * @param tmpDir          temporary directory passed to the sorting collection
     * @param genome          genome whose id is used for the locus lookup
     * @throws IOException if reading the input or writing the output fails
     */
    public static void convert(ResourceLocator resourceLocator, File outputFile, String probeResource,
                               int maxRecords, File tmpDir, Genome genome) throws IOException {

        ExpressionFileParser.FileType type = ExpressionFileParser.determineType(resourceLocator);
        GeneToLocusHelper locusHelper = new GeneToLocusHelper(probeResource);

        BufferedReader reader = null;
        PrintWriter writer = null;
        SortingCollection<SortableRecord> cltn = getSortingCollection(maxRecords, tmpDir);

        try {
            reader = new BufferedReader(new InputStreamReader(ParsingUtils.openInputStream(resourceLocator.getPath())));
            writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));

            ExpressionFileParser.FormatDescriptor formatDescriptor = ExpressionFileParser.parseHeader(reader, type, null);
            String[] dataHeadings = formatDescriptor.getDataHeaders();

            // Need a better way to determine type!
            String dataType = resourceLocator.getPath().contains("methylation") ? TrackType.DNA_METHYLATION.toString()
                    : TrackType.GENE_EXPRESSION.toString();

            writer.println("#type=" + dataType);
            writer.print("Chr\tStart\tEnd\tProbe");
            for (String s : dataHeadings) {
                writer.print("\t" + s);
            }
            writer.println();

            String nextLine;
            while ((nextLine = reader.readLine()) != null) {

                // A blank line carries no probe or data; skip it rather than fail on missing columns.
                if (nextLine.trim().length() == 0) {
                    continue;
                }

                // A gct row can map to multiple loci, normally this indicates a problem with the probe
                DataRow row = new DataRow(nextLine, formatDescriptor);
                String probe = row.getProbe();
                List<Locus> loci = locusHelper.getLoci(probe, row.getDescription(), genome.getId());
                if (loci == null || loci.isEmpty()) {
                    log.warn("No locus found for: " + probe + "  " + row.getDescription());
                } else {
                    for (Locus locus : loci) {
                        String igvLine = locus.getChr() + "\t" + locus.getStart() + "\t" + locus.getEnd() + "\t" + probe +
                                "\t" + row.getData();
                        cltn.add(new SortableRecord(locus.getChr(), locus.getStart(), igvLine));
                    }
                }
            }

            // Output the sorted records
            CloseableIterator<SortableRecord> iter = cltn.iterator();
            try {
                while (iter.hasNext()) {
                    SortableRecord al = iter.next();
                    writer.println(al.getText());
                }
            } finally {
                iter.close();
            }

            // PrintWriter never throws on write failures; surface them instead of leaving a truncated file.
            if (writer.checkError()) {
                throw new IOException("Error writing to " + outputFile);
            }
        } finally {
            // Nested so that a failure closing one resource does not prevent releasing the others.
            try {
                if (reader != null) {
                    reader.close();
                }
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                } finally {
                    // Release any temporary files held by the sorting collection.
                    cltn.cleanup();
                }
            }
        }
    }

    /**
     * Create the sorting collection used to order output records.
     * <p>
     * Ordering: the chromosome name, with the first "chr" removed, is parsed as an integer where
     * possible; names that do not parse are treated as {@code Integer.MAX_VALUE} and so follow the
     * numeric ones. Ties are broken by comparing the names as strings after replacing "M" with "Z",
     * and finally by start position.
     *
     * @param maxRecords maximum number of records passed to {@link SortingCollection#newInstance}
     * @param tmpDir     temporary directory passed to {@link SortingCollection#newInstance}
     * @return a new, empty sorting collection
     */
    static SortingCollection<SortableRecord> getSortingCollection(int maxRecords, File tmpDir) {

        SortableRecordCodec codec = new SortableRecordCodec();

        Comparator<SortableRecord> comp = new Comparator<SortableRecord>() {

            @Override
            public int compare(SortableRecord o1, SortableRecord o2) {

                String chr1 = o1.getChromosome().replaceFirst("chr", "");
                String chr2 = o2.getChromosome().replaceFirst("chr", "");

                int s1 = Integer.MAX_VALUE;
                try {
                    s1 = Integer.parseInt(chr1);
                } catch (NumberFormatException ignored) {
                    // Not a numeric chromosome name; keep MAX_VALUE so it sorts after numeric ones
                }

                int s2 = Integer.MAX_VALUE;
                try {
                    s2 = Integer.parseInt(chr2);
                } catch (NumberFormatException ignored) {
                    // Not a numeric chromosome name; keep MAX_VALUE so it sorts after numeric ones
                }

                // Compare without subtraction, which can overflow and yield the wrong sign.
                int t1 = Integer.compare(s1, s2);
                if (t1 == 0) {
                    chr1 = chr1.replace("M", "Z");
                    chr2 = chr2.replace("M", "Z");
                    t1 = chr1.compareTo(chr2);
                }

                if (t1 == 0) {
                    return Long.compare(o1.getStart(), o2.getStart());
                }
                return t1;
            }
        };

        return SortingCollection.newInstance(SortableRecord.class, codec, comp, maxRecords, tmpDir);
    }

    /**
     * Represents a row of data from a GCT or mage-tab file. The line is split on tabs and only the
     * probe, the description (when the format has one) and the data columns are retained; the data
     * columns are re-joined into a single tab-delimited string.
     */
    static class DataRow {

        private final String probe;
        private final String description;
        private final String data;

        /**
         * @param string           a single tab-delimited data line
         * @param formatDescriptor supplies the probe, description and data column indices
         */
        DataRow(String string, ExpressionFileParser.FormatDescriptor formatDescriptor) {

            // Limit of -1 keeps trailing empty tokens, so rows whose last columns are empty
            // still have an entry for every column.
            String[] tokens = string.split("\t", -1);

            probe = tokens[formatDescriptor.getProbeColumn()];

            // A negative description column means there is none; use an empty description.
            int descriptionColumn = formatDescriptor.getDescriptionColumn();
            description = descriptionColumn < 0 ? "" : tokens[descriptionColumn];

            final int[] dataColumns = formatDescriptor.getDataColumns();
            StringBuilder dataBuffer = new StringBuilder();
            for (int i = 0; i < dataColumns.length; i++) {
                if (i > 0) {
                    dataBuffer.append('\t');
                }
                dataBuffer.append(tokens[dataColumns[i]]);
            }
            data = dataBuffer.toString();
        }

        /**
         * @return the value of the probe column
         */
        private String getProbe() {
            return probe;
        }

        /**
         * @return the value of the description column, or an empty string if the format has none
         */
        public String getDescription() {
            return description;
        }

        /**
         * @return the data column values joined by tabs
         */
        public String getData() {
            return data;
        }
    }
}