package org.molgenis.euratrans.pilot;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.DataFormatException;
import matrix.general.VerifyCsv;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.molgenis.data.Data;
import org.molgenis.euratrans.ChipPeak;
import org.molgenis.euratrans.Promoter;
import org.molgenis.pheno.Measurement;
import org.molgenis.pheno.Panel;
import org.molgenis.util.CsvFileReader;
import org.molgenis.util.CsvFileWriter;
import org.molgenis.util.CsvReader;
import org.molgenis.util.CsvWriter;
import org.molgenis.util.SimpleTuple;
import org.molgenis.util.Tuple;
import org.molgenis.xgap.Chromosome;
import org.molgenis.xgap.Gene;
import org.molgenis.xgap.InvestigationFile;
import org.molgenis.xgap.Locus;
import org.molgenis.xgap.Marker;
import org.molgenis.xgap.Sample;
/**
 * One-off converter for the EURATRANS pilot data set.
 *
 * <p>Reads the raw pilot download (genotypes, histone-modification GFF/BED
 * files, ChIP peak BEDs) from {@link #source} and writes XGAP-format
 * annotation files plus CSV/binary data matrices under {@link #target}.
 * Run via {@link #main(String[])}; the hard-coded paths mean this is a
 * developer tool, not a reusable service.
 */
public class ConvertPilot
{
	/** Root directory holding the raw EURATRANS pilot download. */
	static String source = "/Users/mswertz/Downloads/EuratransPilot/XQTL_pilot/";

	/** Root directory the converted XGAP-format files are written to. */
	static String target = "/Users/mswertz/Downloads/EuratransPilot/converted/";

	/** All data-matrix descriptors produced during conversion; flushed by writeData(). */
	static List<Data> dataMatrices = new ArrayList<Data>();

	// Deduplication maps keyed by entity name; LinkedHashMap keeps first-seen order
	// so the output annotation files are stable across runs.
	static Map<String, Chromosome> chromosomes = new LinkedHashMap<String, Chromosome>();
	static Map<String, Sample> samples = new LinkedHashMap<String, Sample>();
	static Map<String, Gene> genes = new LinkedHashMap<String, Gene>();
	static Map<String, Promoter> promoters = new LinkedHashMap<String, Promoter>();
	static Map<String, Measurement> measurements = new LinkedHashMap<String, Measurement>();

	/** dataName -> relative source path, for files copied verbatim by writeFiles(). */
	static Map<String, String> nameAndfiles = new LinkedHashMap<String, String>();

	static Logger logger = Logger.getLogger("convert");

	// static Map<String, Panel> panels = new LinkedHashMap<String, Panel>();

	/**
	 * Runs the whole conversion pipeline: genotypes, histone data for both
	 * tissues (liver and left ventricle, "lv"), ChIP peaks, then the shared
	 * annotation files.
	 *
	 * @param args unused
	 * @throws Exception on any I/O or format error; this is a one-shot tool,
	 *             so failures simply abort the run
	 */
	public static void main(String[] args) throws Exception
	{
		BasicConfigurator.configure();

		// NOTE(review): 'target' already ends with '/', so these produce a
		// harmless double slash, e.g. ".../converted//data".
		File dataDir = new File(target + "/data");
		dataDir.mkdirs();
		File binDir = new File(target + "/binarydatamatrix");
		binDir.mkdirs();
		File fileDir = new File(target + "/investigationfile");
		fileDir.mkdirs();

		// genotyping
		convertMarkers();
		convertGenotypes();

		// convert histone enriched gene bodies (H3K27me3, liver + left ventricle)
		convertHistoneGeneBodies("histone_modifications_and_rna_seq/H3K27me3/liver/ensembl-59-gene-bodies.gff");
		convertHistoneGeneBodies("histone_modifications_and_rna_seq/H3K27me3/lv/ensembl-59-gene-bodies.gff");
		convertHistoneData("H3K27me3_liver_data_matrix",
				"histone_modifications_and_rna_seq/H3K27me3/liver/ensembl-59-gene-bodies-bound-data-matrix.txt",
				Gene.class);
		convertHistoneData("H3K27me3_lv_data_matrix",
				"histone_modifications_and_rna_seq/H3K27me3/lv/ensembl-59-gene-bodies-bound-data-matrix.txt",
				Gene.class);
		// convertQtlResults("H3K27me3_liver_qtl_results",
		// "histone_modifications_and_rna_seq/H3K27me3/liver/ensembl-59-gene-bodies-bound-qtl-results.txt");
		convertGffToCsvMatrix("H3K27me3_liver_qtl_intervals",
				"histone_modifications_and_rna_seq/H3K27me3/liver/ensembl-59-gene-bodies-bound-qtl-results.gff");
		convertGffToCsvMatrix("H3K27me3_lv_qtl_intervals",
				"histone_modifications_and_rna_seq/H3K27me3/lv/ensembl-59-gene-bodies-bound-qtl-results.gff");

		// convert histone enriched promoters (H3K4me3, liver + left ventricle)
		convertHistonePromoters("histone_modifications_and_rna_seq/H3K4me3/liver/ensembl-59-promoters.gff");
		convertHistonePromoters("histone_modifications_and_rna_seq/H3K4me3/lv/ensembl-59-promoters.gff");
		convertHistoneData("H3K4me3_liver_data_matrix",
				"histone_modifications_and_rna_seq/H3K4me3/liver/ensembl-59-promoters-bound-data-matrix.txt",
				Promoter.class);
		convertHistoneData("H3K4me3_lv_data_matrix",
				"histone_modifications_and_rna_seq/H3K4me3/lv/ensembl-59-promoters-bound-data-matrix.txt",
				Promoter.class);
		convertGffToCsvMatrix("H3K4me3_liver_qtl_intervals",
				"histone_modifications_and_rna_seq/H3K4me3/liver/ensembl-59-promoters-bound-qtl-results.gff");
		convertGffToCsvMatrix("H3K4me3_lv_qtl_intervals",
				"histone_modifications_and_rna_seq/H3K4me3/lv/ensembl-59-promoters-bound-qtl-results.gff");

		// load the ChIP MACS peaks data (BN-Lx and SHR strains, two replicates each)
		convertChipData("bnlx2H3K27ac_peaks.bed");
		convertChipData("bnlx2H3K4me1_peaks.bed");
		convertChipData("bnlx2H3K4me3_peaks.bed");
		convertChipData("bnlx3H3K27ac_peaks.bed");
		convertChipData("bnlx3H3K4me1_peaks.bed");
		convertChipData("bnlx3H3K4me3_peaks.bed");
		convertChipData("shr2H3K27ac_peaks.bed");
		convertChipData("shr2H3K4me1_peaks.bed");
		convertChipData("shr2H3K4me3_peaks.bed");
		convertChipData("shr3H3K27ac_peaks.bed");
		convertChipData("shr3H3K4me1_peaks.bed");
		convertChipData("shr3H3K4me3_peaks.bed");

		// convert all generated CSV matrices to binary
		for (Data d : dataMatrices)
		{
			d.setInvestigation_Name("EURATRANS");
			convertCsvToBin(new File(target + "data/" + d.getName() + ".txt"), d);
		}

		// write out annotations collected along the way
		writeChromosomes();
		writeSamples();
		writeGenes();
		writePromotors();
		writeData();
		writeFiles(nameAndfiles);
		// NOTE(review): writeMeasurements() exists but is never called; its only
		// producer (convertQtlResults) is commented out above. Left as-is.
	}

	/**
	 * Converts a MACS peaks BED file into a CSV data matrix (ChipPeak x
	 * Measurement). Because peaks don't have meaning between data sets we
	 * don't consider the peak numbers 'targets'.
	 *
	 * @param path BED file name under {source}/chip/peaks/; the ".bed"
	 *            extension is stripped to form the matrix name
	 */
	static void convertChipData(String path) throws IOException, DataFormatException
	{
		// simply convert BED to data (null X Measurement)
		Data d = new Data();
		d.setName(path.substring(0, path.length() - 4));
		d.setFeatureType(Measurement.class.getSimpleName());
		d.setTargetType(ChipPeak.class.getSimpleName());
		d.setValueType("Text");
		dataMatrices.add(d);

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + "chip/peaks/" + path));
		CsvWriter out = new CsvFileWriter(new File(target + "data/" + d.getName() + ".txt"));
		out.setHeaders(Arrays.asList(new String[]
		{ "", "chr", "start", "end", "p_value_10_log" }));
		out.writeHeader();

		// BED columns: 0=chrom, 1=start, 2=end, 3=name, 4=score (-10*log10 pvalue)
		for (Tuple row : in)
		{
			Tuple result = new SimpleTuple();
			result.set("", row.getString(3));
			result.set("chr", row.getString(0));
			result.set("start", row.getString(1));
			result.set("end", row.getString(2));
			result.set("p_value_10_log", row.getString(4));
			out.writeRow(result);
		}
		out.close();
	}

	/**
	 * Converts a tabular QTL-results file into a CSV matrix, registering each
	 * column as a Measurement and numbering rows "qtl1", "qtl2", ...
	 *
	 * <p>NOTE(review): currently unused — the single call site in main() is
	 * commented out in favour of convertGffToCsvMatrix().
	 *
	 * @param name matrix name to register
	 * @param file source file path relative to {source}
	 */
	static void convertQtlResults(String name, String file) throws Exception
	{
		Data qtls = new Data();
		qtls.setName(name);
		qtls.setFeatureType(Measurement.class.getSimpleName());
		qtls.setTargetType(Measurement.class.getSimpleName());
		qtls.setValueType("Text");
		dataMatrices.add(qtls);

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + file));

		// extract Measurement for each column header
		for (String measurement : in.colnames())
		{
			if (!measurements.containsKey(measurement))
			{
				Measurement m = new Measurement();
				m.setName(measurement);
				measurements.put(m.getName(), m);
			}
		}

		// convert the csv; need to add artificial column with numbers :-(
		CsvWriter matrixCsv = new CsvFileWriter(new File(target + "data/" + qtls.getName() + ".txt"));
		List<String> names = new ArrayList<String>(in.colnames());
		names.add(0, "");
		matrixCsv.setHeaders(names);
		matrixCsv.writeHeader();

		int i = 1;
		for (Tuple row : in)
		{
			// add count as the row name
			((SimpleTuple) row).set("", "qtl" + i);
			matrixCsv.writeRow(row);
			i++;
		}
		matrixCsv.close();
	}

	/**
	 * Writes investigationfile.txt and copies each registered file into
	 * {target}/investigationfile/.
	 *
	 * @param files map of logical file name -> source path relative to {source}
	 */
	static void writeFiles(Map<String, String> files) throws IOException
	{
		CsvWriter dataCsv = new CsvFileWriter(new File(target + "investigationfile.txt"));
		dataCsv.setHeaders(Arrays.asList(new String[]
		{ InvestigationFile.NAME, InvestigationFile.EXTENSION }));
		dataCsv.writeHeader();

		for (String file : files.keySet())
		{
			File src = new File(source + files.get(file));
			InvestigationFile f = new InvestigationFile();
			f.setName(file.trim());
			// everything after the last '.'; if there is no dot this yields the
			// whole file name (lastIndexOf returns -1, so substring(0))
			f.setExtension(src.getName().substring(src.getName().lastIndexOf(".") + 1));
			FileUtils.copyFile(src, new File(target + "/investigationfile/" + f.getName()));
			dataCsv.writeRow(f);
		}
		dataCsv.close();
	}

	/**
	 * Converts sdps2snps.csv into marker.txt and collects the chromosomes seen.
	 * Input columns: chrom,representative,marker,physical.pos,cM.
	 */
	// TODO: details on the chromosomes
	public static void convertMarkers() throws IOException, DataFormatException
	{
		logger.info("convert markers:");

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + "histone_modifications_and_rna_seq/Genotypes/sdps2snps.csv"));

		// output columns: chromosome_name,name,bpStart,cm
		CsvWriter markerCsv = new CsvFileWriter(new File(target + "marker.txt"));
		markerCsv.setHeaders(Arrays.asList(new String[]
		{ Marker.CHROMOSOME_NAME, Marker.NAME, Marker.BPSTART, Marker.CM }));
		markerCsv.writeHeader();

		for (Tuple t : in)
		{
			Marker m = new Marker();
			m.setName(t.getString("marker"));
			m.setChromosome_Name("chr" + t.getString("chrom"));
			// todo m.setReportsFor_Name(("representative"));

			// "NA" means unknown physical position
			if (!"NA".equals(t.getString("physical.pos"))) m.setBpStart(t.getLong("physical.pos"));
			else
				m.setBpStart(null);
			m.setCM(t.getDouble("cM"));
			markerCsv.writeRow(m);

			// register the chromosome on first sight
			if (chromosomes.get(m.getChromosome_Name()) == null)
			{
				Chromosome c = new Chromosome();
				c.setName(m.getChromosome_Name());
				chromosomes.put(m.getChromosome_Name(), c);
			}
		}
		markerCsv.close();
		logger.info("convert markers complete");
	}

	/**
	 * Writes chromosome.txt from the chromosomes collected during conversion,
	 * assigning order numbers and the autosomal flag.
	 */
	public static void writeChromosomes() throws IOException
	{
		logger.info("write chromosomes ...");
		CsvWriter chromosomeCsv = new CsvFileWriter(new File(target + "chromosome.txt"));
		chromosomeCsv.setHeaders(Arrays.asList(new String[]
		{ Chromosome.NAME, Chromosome.ORDERNR, Chromosome.ISAUTOSOMAL }));
		chromosomeCsv.writeHeader();

		for (Chromosome c : chromosomes.values())
		{
			if (c.getName().contains("M"))
			{
				// mitochondrial
				c.setOrderNr(22);
				c.setIsAutosomal(false);
			}
			else if (c.getName().contains("X"))
			{
				// FIX(review): was 22, duplicating chrM's order number. Rat has
				// 20 autosomes (parsed below), so X gets 21 and M keeps 22.
				// TODO confirm intended ordering against downstream consumers.
				c.setOrderNr(21);
				c.setIsAutosomal(false);
			}
			else
			{
				// autosomes are named "chrN"; strip the "chr" prefix
				c.setOrderNr(Integer.parseInt(c.getName().substring(3)));
				c.setIsAutosomal(true);
			}
			chromosomeCsv.writeRow(c);
		}
		chromosomeCsv.close();
		logger.info("write chromosomes complete");
	}

	/** Writes data.txt describing every data matrix registered in dataMatrices. */
	public static void writeData() throws IOException
	{
		CsvWriter dataCsv = new CsvFileWriter(new File(target + "data.txt"));
		dataCsv.setHeaders(Arrays.asList(new String[]
		{ Data.NAME, Data.FEATURETYPE, Data.TARGETTYPE, Data.VALUETYPE }));
		dataCsv.writeHeader();
		for (Data c : dataMatrices)
		{
			dataCsv.writeRow(c);
		}
		dataCsv.close();
	}

	/**
	 * Converts star-genetic-map-20071109.csv into panel.txt plus a
	 * Panel x Marker genotype matrix. Marker-annotation columns are skipped;
	 * every remaining column is a rat panel.
	 */
	// TODO: externalid different from other file??
	public static void convertGenotypes() throws Exception
	{
		logger.info("convert genotypes ...");
		Data genotypes = new Data();
		genotypes.setName("genetic_map_20071109");
		genotypes.setFeatureType(Panel.class.getSimpleName());
		genotypes.setTargetType(Marker.class.getSimpleName());
		genotypes.setValueType("Text");
		dataMatrices.add(genotypes);

		// we need to filter away the marker info columns.
		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source
				+ "histone_modifications_and_rna_seq/Genotypes/star-genetic-map-20071109.csv"));

		// FIX(review): added "markeralias" (the row-name column, see the rename
		// loop below) so it is no longer emitted as a bogus Panel, and removed
		// the duplicate "physical.pos" entry.
		List<String> skip = Arrays.asList(new String[]
		{ "chrom", "externalid", "physical.pos", "cM", "markeralias" });

		List<Panel> panels = new ArrayList<Panel>();
		List<String> names = new ArrayList<String>();
		for (String name : in.colnames())
		{
			if (!skip.contains(name))
			{
				names.add(name);
				Panel p = new Panel();
				p.setName(name);
				panels.add(p);
			}
		}

		CsvWriter panelCsv = new CsvFileWriter(new File(target + "panel.txt"));
		panelCsv.setHeaders(Arrays.asList(new String[]
		{ Panel.NAME }));
		panelCsv.writeHeader();
		for (Panel p : panels)
			panelCsv.writeRow(p);
		panelCsv.close();

		CsvWriter matrixCsv = new CsvFileWriter(new File(target + "data/" + genotypes.getName() + ".txt"));
		// FIX(review): insert (not overwrite) the empty row-name header, since
		// "markeralias" is now excluded from 'names' above; the written header
		// row is identical to before.
		names.add(0, "");
		matrixCsv.setHeaders(names);
		matrixCsv.writeHeader();

		for (Tuple row : in)
		{
			// rename the markeralias field to "" so its value lands in the
			// row-name column
			List<String> cols = row.getFields();
			cols.set(cols.indexOf("markeralias"), "");
			((SimpleTuple) row).setFields(cols);
			matrixCsv.writeRow(row);
		}
		matrixCsv.close();
		logger.info("convert genotypes complete");
	}

	/**
	 * Parses an ensembl promoters GFF file, collecting Promoter annotations
	 * (named "PROMOTER_&lt;geneId&gt;") and any new chromosomes into the
	 * static maps. Writes nothing; writePromotors() flushes later.
	 *
	 * @param path GFF file path relative to {source}
	 */
	public static void convertHistonePromoters(String path) throws IOException, DataFormatException
	{
		logger.info("convert promoters " + path + " ...");

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + path));

		// GFF columns used: 0=seqname, 3=start, 4=end, 8=attributes
		for (Tuple t : in)
		{
			Promoter p = new Promoter();
			p.setChromosome_Name(t.getString(0));

			// attributes look like "ID=<gene>;Name=<symbol>"; substring(3)
			// strips the leading "ID="
			String[] geneNames = t.getString(8).substring(3).split(";Name=");
			p.setName("PROMOTER_" + geneNames[0]);
			if (geneNames.length > 1) p.setSymbol(geneNames[1]);
			p.setBpStart(t.getLong(3));
			p.setBpEnd(t.getLong(4));

			// keep the first occurrence only
			if (!promoters.containsKey(p.getName()))
			{
				promoters.put(p.getName(), p);
			}
			// TODO strand / orientation

			if (chromosomes.get(p.getChromosome_Name()) == null)
			{
				Chromosome c = new Chromosome();
				c.setName(p.getChromosome_Name());
				chromosomes.put(p.getChromosome_Name(), c);
			}
		}
		logger.info("convert promoters complete");
	}

	/**
	 * Parses an ensembl gene-bodies GFF file, collecting Gene annotations and
	 * any new chromosomes into the static maps. Writes nothing; writeGenes()
	 * flushes later.
	 *
	 * @param path GFF file path relative to {source}
	 */
	public static void convertHistoneGeneBodies(String path) throws IOException, DataFormatException
	{
		logger.info("convert gene bodies " + path + " ...");

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + path));

		// GFF columns used: 0=seqname, 3=start, 4=end, 8=attributes
		for (Tuple t : in)
		{
			Gene g = new Gene();
			g.setChromosome_Name(t.getString(0));

			// attributes look like "ID=<gene>;Name=<symbol>"; substring(3)
			// strips the leading "ID="
			String[] geneNames = t.getString(8).substring(3).split(";Name=");
			g.setName(geneNames[0]);
			if (geneNames.length > 1) g.setSymbol(geneNames[1]);
			g.setBpStart(t.getLong(3));
			g.setBpEnd(t.getLong(4));

			// keep the first occurrence only
			if (!genes.containsKey(g.getName()))
			{
				genes.put(g.getName(), g);
			}
			// TODO strand / orientation

			if (chromosomes.get(g.getChromosome_Name()) == null)
			{
				Chromosome c = new Chromosome();
				c.setName(g.getChromosome_Name());
				chromosomes.put(g.getChromosome_Name(), c);
			}
		}
		logger.info("convert gene bodies complete");
	}

	/**
	 * Converts a histone data matrix (Sample x Gene or Sample x Promoter) to
	 * the target CSV format, registering each column as a Sample. Column names
	 * have '.' replaced by '_' to make valid names.
	 *
	 * @param name matrix name to register
	 * @param path source matrix file relative to {source}
	 * @param klazz target row type, Gene.class or Promoter.class
	 */
	public static void convertHistoneData(String name, String path, Class<? extends Locus> klazz) throws Exception
	{
		logger.info("convert data " + path + " ...");
		Data genotypes = new Data();
		genotypes.setName(name);
		genotypes.setFeatureType(Sample.class.getSimpleName());
		genotypes.setTargetType(klazz.getSimpleName());
		genotypes.setValueType("Decimal");
		dataMatrices.add(genotypes);

		// we need to extract individual
		// TODO: are individuals shared over these sets??? And what about the
		// 'merged'? Are these 'strain' level info?
		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + path));
		List<String> indNames = in.colnames();
		for (String iName : indNames)
		{
			iName = iName.replace(".", "_");
			// the empty header is the row-name column, not a sample
			if (!"".equals(iName) && !samples.containsKey(iName))
			{
				Sample i = new Sample();
				i.setName(iName);
				samples.put(i.getName(), i);
			}
		}

		// copy the matrix through verbatim; only the headers are renamed
		CsvWriter out = new CsvFileWriter(new File(target + "data/" + name + ".txt"));
		List<String> headers = new ArrayList<String>();
		for (String n : in.colnames())
			headers.add(n.replace(".", "_"));
		out.setHeaders(headers);
		out.writeHeader();

		for (Tuple t : in)
		{
			for (int i = 0; i < t.size(); i++)
			{
				if (i > 0) out.writeSeparator();
				out.writeValue(t.getString(i));
			}
			out.writeEndOfLine();
		}
		out.close();
		logger.info("convert data complete");
	}

	/** Writes sample.txt from the samples collected during conversion. */
	static void writeSamples() throws IOException
	{
		logger.info("write samples ...");
		CsvWriter indCsv = new CsvFileWriter(new File(target + "sample.txt"));
		indCsv.setHeaders(Arrays.asList(new String[]
		{ Sample.NAME }));
		indCsv.writeHeader();
		for (Sample s : samples.values())
		{
			indCsv.writeRow(s);
		}
		indCsv.close();
		logger.info("write samples complete ...");
	}

	/** Writes gene.txt from the genes collected during conversion. */
	static void writeGenes() throws IOException
	{
		logger.info("write genes ...");
		CsvWriter indCsv = new CsvFileWriter(new File(target + "gene.txt"));
		indCsv.setHeaders(Arrays.asList(new String[]
		{ Gene.CHROMOSOME_NAME, Gene.NAME, Gene.SYMBOL, Gene.BPSTART, Gene.BPEND }));
		indCsv.writeHeader();
		for (Gene s : genes.values())
		{
			indCsv.writeRow(s);
		}
		indCsv.close();
		logger.info("write genes complete ...");
	}

	/** Writes promoter.txt from the promoters collected during conversion. */
	static void writePromotors() throws IOException
	{
		logger.info("write promoters:");
		CsvWriter indCsv = new CsvFileWriter(new File(target + "promoter.txt"));
		indCsv.setHeaders(Arrays.asList(new String[]
		{ Promoter.CHROMOSOME_NAME, Promoter.NAME, Promoter.BPSTART, Promoter.BPEND }));
		indCsv.writeHeader();
		for (Promoter s : promoters.values())
		{
			indCsv.writeRow(s);
		}
		indCsv.close();
		logger.info("write promoters complete");
	}

	/**
	 * Writes measurement.txt from the measurements collected during conversion.
	 *
	 * <p>NOTE(review): never called from main(); the only code populating
	 * 'measurements' (convertQtlResults) is also disabled. Kept for when that
	 * path is re-enabled.
	 */
	static void writeMeasurements() throws IOException
	{
		logger.info("write measurements:");
		CsvWriter indCsv = new CsvFileWriter(new File(target + "measurement.txt"));
		indCsv.setHeaders(Arrays.asList(new String[]
		{ Measurement.NAME }));
		indCsv.writeHeader();
		for (Measurement s : measurements.values())
		{
			indCsv.writeRow(s);
		}
		indCsv.close();
		logger.info("write measurements complete");
	}

	/**
	 * Verifies a CSV matrix and converts it into the binary data-matrix format
	 * under {target}/binarydatamatrix/.
	 *
	 * @param src the CSV matrix file to convert
	 * @param d its Data descriptor (supplies name and value type)
	 */
	static void convertCsvToBin(File src, Data d) throws Exception
	{
		logger.info("converting csv to bin " + src.getPath());

		// verify the CSV file to be a correct matrix and get the dimensions
		int[] dims = VerifyCsv.verify(src, d.getValueType());
		logger.info("verified");

		// convert to binary
		File dest = new File(target + "binarydatamatrix/" + d.getName() + ".bin");
		logger.info("Starting conversion..");
		new MakeBinary().makeBinaryBackend(d, src, dest, dims[0], dims[1]);
		logger.info("..done!");
	}

	/**
	 * Converts a QTL-results GFF file into a CSV matrix: start/end come from
	 * the GFF coordinates, the remaining columns from the "key=value;..."
	 * attribute field. Rows are numbered "qtl1", "qtl2", ... and the source
	 * file is also registered for verbatim copying by writeFiles().
	 *
	 * @param dataName matrix name to register
	 * @param src GFF file path relative to {source}
	 * @throws Exception if an attribute key is not in the known label list
	 */
	static void convertGffToCsvMatrix(String dataName, String src) throws Exception
	{
		nameAndfiles.put(dataName, src);

		// TODO(review): 'in' is never closed; confirm CsvReader exposes close().
		CsvReader in = new CsvFileReader(new File(source + src));

		Data data = new Data();
		data.setName(dataName);
		data.setFeatureType(Measurement.class.getSimpleName());
		data.setTargetType(Measurement.class.getSimpleName());
		data.setValueType("Text");
		dataMatrices.add(data);

		CsvWriter out = new CsvFileWriter(new File(target + "data/" + data.getName() + ".txt"));
		List<String> labels = Arrays.asList(new String[]
		{ "", "start", "end", "peak", "minp", "trait", "distance", "cis", "BN", "SHR", "log2_BN_over_SHR" });
		out.setHeaders(labels);
		out.writeHeader();

		int i = 1;
		for (Tuple row : in)
		{
			SimpleTuple result = new SimpleTuple();
			result.set("", "qtl" + i++);
			result.set("start", row.getString(3));
			result.set("end", row.getString(4));

			// attributes field: "key=value;key=value;..."; every key must be a
			// known label or the file format has changed under us
			String[] valuePairs = row.getString(8).split(";");
			for (String vp : valuePairs)
			{
				String[] labelValue = vp.split("=");
				if (!labels.contains(labelValue[0])) throw new Exception("Label=" + labelValue[0] + " not known");
				result.set(labelValue[0], labelValue[1]);
			}
			out.writeRow(result);
		}
		out.close();
	}
}