/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package edu.mayo.bior.pipeline.Treat;
/**
*
* @author dquest
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.tinkerpop.pipes.Pipe;
import com.tinkerpop.pipes.transform.IdentityPipe;
import com.tinkerpop.pipes.transform.TransformFunctionPipe;
import com.tinkerpop.pipes.util.Pipeline;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.pipes.JSON.DrillPipe;
import edu.mayo.pipes.JSON.RemoveAllJSONPipe;
import edu.mayo.pipes.JSON.tabix.SameVariantPipe;
import edu.mayo.pipes.bioinformatics.VCF2VariantPipe;
import edu.mayo.pipes.history.ColumnMetaData;
import edu.mayo.pipes.history.History;
import edu.mayo.pipes.history.HistoryMetaData;
/**
*
* @author m102417
*/
@SuppressWarnings ({"rawtypes", "unchecked"})
public class AlleleFrequenciesPipeline extends Pipeline implements Cleaner
{
private BiorProperties biorProps;
private String baseDir;
private boolean header = true;
private int deleteColCount = 0;
private int numCols = 0;
private int firstCol = 0;
private int refPos = 0;
private int altPos = 0;
private static final int kBGIMajorAllele = 0;
private static final int kBGIMinorAllele = kBGIMajorAllele + 1;
private static final int kBGIMajorFreq = kBGIMinorAllele + 1;
private static final int kBGIMinorFreq = kBGIMajorFreq + 1;
private static final int kBGICols = kBGIMinorFreq + 1;
private static final String[] kBgiDrill = {"major_allele", "minor_allele", "estimated_major_allele_freq",
"estimated_minor_allele_freq"};
private static final int kESPCeuCounts = 0;
private static final int kESPAfrCounts = kESPCeuCounts + 1;
private static final int kESPTotalCounts = kESPAfrCounts + 1;
private static final int kESPMAF = kESPTotalCounts + 1;
private static final int kESPRefAllele = kESPMAF + 1;
private static final int kESPAltAllele = kESPRefAllele + 1;
private static final int kESPCols = kESPAltAllele + 1;
private static final String[] kEspDrill = {"INFO.EA_AC", "INFO.AA_AC", "INFO.TAC", "INFO.MAF", "_refAllele", "_altAlleles"};
private static final int kHapMapRefAllele = 0;
private static final int kHapMapAltAllele = kHapMapRefAllele + 1;
private static final int kHapMapCeuRefFreq = kHapMapAltAllele + 1;
private static final int kHapMapCeuAltFreq = kHapMapCeuRefFreq + 1;
private static final int kHapMapYriRefFreq = kHapMapCeuAltFreq + 1;
private static final int kHapMapYriAltFreq = kHapMapYriRefFreq + 1;
private static final int kHapMapJptRefCount = kHapMapYriAltFreq + 1;
private static final int kHapMapJptAltCount = kHapMapJptRefCount + 1;
private static final int kHapMapJptTotalCount = kHapMapJptAltCount + 1;
private static final int kHapMapChbRefCount = kHapMapJptTotalCount + 1;
private static final int kHapMapChbAltCount = kHapMapChbRefCount + 1;
private static final int kHapMapChbTotalCount = kHapMapChbAltCount + 1;
private static final int kHapMapCols = kHapMapChbTotalCount + 1;
private static final String[] kHapMapDrill = {"refallele", "otherallele", "CEU.refallele_freq", "CEU.otherallele_freq",
"YRI.refallele_freq", "YRI.otherallele_freq", "JPT.refallele_count",
"JPT.otherallele_count", "JPT.totalcount", "CHB.refallele_count",
"CHB.otherallele_count", "CHB.totalcount"};
private static final int k1kGenomeAll = 0;
private static final int k1kGenomeEUR = k1kGenomeAll + 1;
private static final int k1kGenomeASN = k1kGenomeEUR + 1;
private static final int k1kGenomeAFR = k1kGenomeASN + 1;
private static final int k1kGenomeAMR = k1kGenomeAFR + 1;
private static final int k1kGenomeRef = k1kGenomeAMR + 1;
private static final int k1kGenomeAlt = k1kGenomeRef + 1;
private static final int k1kGenomeCols = k1kGenomeAlt + 1;
private static final String[] k1kGenomeDrill = {"INFO.AF", "INFO.EUR_AF", "INFO.ASN_AF", "INFO.AFR_AF", "INFO.AMR_AF",
"_refAllele", "_altAlleles"};
private static final int kRefOffset = 0;
private static final int kAltOffset = kRefOffset + 1;
private static final int kTotalOffset = kAltOffset + 1;
private static final int kNumOffsets = kTotalOffset + 1;
private static final int kNumResults = kAltOffset + 1;
private static final String kBlank = ".";
private static final String[] kBaseStrs = {"CHROM", "POS", "REF", "ALT"};
private static final int kChromPos = 0;
private static final int kPositionPos = kChromPos + 1;
private static final int kRefPos = kPositionPos + 1;
private static final int kAltPos = kRefPos + 1;
private static final String kEspEurMaf = "ESP6500_EUR_maf";
private static final String kEspAfrMaf = "ESP6500_AFR_maf";
private static final String kBGIFreq = "BGI200_Danish";
private static final String[] kHapMapFreq = {"HapMap_CEU_allele_freq", "HapMap_YRI_allele_freq", "HapMap_JPT+CHB_allele_freq"};
private static final String[] k1kGenomeFreq = {"1kGenome_EUR_allele_freq", "1kGenome_ASN_allele_freq",
"1kGenome_AFR_allele_freq", "1kGenome_AMR_allele_freq"};
private static final double kPercentAdjust = 100.0;
protected static final boolean kConvertFromPercent = true;
private static final boolean kDoNotConvert = false;
private static final String kMajorSplit = ",";
private static final String kMinorSplit = "/";
/**
* Default constructor, IdentityPipe in and out, no cleaning
*
* @throws IOException
*/
public AlleleFrequenciesPipeline () throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (new IdentityPipe (), new IdentityPipe (), false);
}
/**
* Standard Constructor, gets data from input, sends it to output, doesn't do any cleaning
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @throws IOException
*/
public AlleleFrequenciesPipeline (Pipe input, Pipe output) throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (input, output, false);
}
/**
* Full constructor, gets data from input, sends it to output, does cleaning if clean is true
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @param clean If true, trims off the catalog data and adds one column per frequency source
* (i.e. 1kGenome_EUR, etc), with the information in the TREAT format
* @throws IOException
*/
public AlleleFrequenciesPipeline (Pipe input, Pipe output, boolean clean) throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (input, output, clean);
}
/**
* Full constructor, gets data from input, sends it to output, does cleaning if clean is true
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @param baseDir Base directory that holds all the catalog files, or null to use the one from properties
* @param clean If true, trims off the catalog data and adds one column per frequency source
* (i.e. 1kGenome_EUR, etc), with the information in the TREAT format
* @throws IOException
*/
public AlleleFrequenciesPipeline (Pipe input, Pipe output, String baseDir, boolean clean) throws IOException
{
biorProps = new BiorProperties ();
if (baseDir != null)
this.baseDir = baseDir;
init (input, output, clean);
}
/**
* Sets up the catalogs and drills necessary to get the data
*
* @param input assumes that somehow input is converted to a history
* @param output Where the results will go
* @param clean If true, trims off the catalog data and adds one column per frequency source
* (i.e. 1kGenome_EUR, etc), with the information in the TREAT format
* @throws IOException
*/
public void init (Pipe input, Pipe output, boolean clean) throws IOException
{
Pipe cleaner;
if (clean)
cleaner = new TransformFunctionPipe<History, History> (new TreatPipe (this));
else
cleaner = new IdentityPipe<History> ();
if (baseDir == null)
baseDir = biorProps.get ("fileBase");
if (baseDir == null)
baseDir = "";
String bgiFile = baseDir + biorProps.get ("bgiFile");
String espFile = baseDir + biorProps.get ("espFile");
String hapMapFile = baseDir + biorProps.get ("hapMapFile");
String genomeFile = baseDir + biorProps.get ("kGenomeFile"); // genomeFile
String[] bgiDrill = kBgiDrill;
String[] espDrill = kEspDrill;
String[] hapMapDrill = kHapMapDrill;
String[] genomeDrill = k1kGenomeDrill;
int posCol = -1;
// ??-history better convert input to a history coming into the pipeline e.g. HistoryInPipe
Pipeline p = new Pipeline (input,
new VCF2VariantPipe (), new SameVariantPipe (bgiFile, posCol),
new DrillPipe (false, bgiDrill),
new SameVariantPipe (espFile, posCol -= bgiDrill.length),
new DrillPipe (false, espDrill),
new SameVariantPipe (hapMapFile, posCol -= espDrill.length),
new DrillPipe (false, hapMapDrill),
new SameVariantPipe (genomeFile, posCol -= hapMapDrill.length),
new DrillPipe (false, genomeDrill),
new RemoveAllJSONPipe (), cleaner, output);
this.setPipes (p.getPipes ());
deleteColCount = genomeDrill.length - posCol;
}
/* (non-Javadoc)
* @see edu.mayo.bior.pipeline.Treat.Cleaner#doClean(edu.mayo.pipes.history.History)
*/
public History doClean (History history)
{
if (header)
{
header = false;
numCols = history.size ();
firstCol = numCols - deleteColCount + 1; // Skip the history column
int[] poses = getPositions (kBaseStrs);
refPos = poses[kRefPos];
altPos = poses[kAltPos];
// Now clean up the history metadata
HistoryMetaData metaData = History.getMetaData ();
List<ColumnMetaData> columns = metaData.getColumns ();
for (int i = numCols - 1; i >= firstCol; --i)
columns.remove (i);
addAlleleColumns (columns);
}
int startCol = firstCol;
String ref = history.get (refPos);
String alt = history.get (altPos);
String bgiMajAllele = history.get (startCol + kBGIMajorAllele);
String bgiMinAllele = history.get (startCol + kBGIMinorAllele);
double bgiMajFreq = parseDouble (history.get (startCol + kBGIMajorFreq), kDoNotConvert);
double bgiMinFreq = parseDouble (history.get (startCol + kBGIMinorFreq), kDoNotConvert);
startCol += kBGICols;
String espMafs = history.get (startCol + kESPMAF);
startCol += kESPCols;
String hapRefAllele = history.get (startCol + kHapMapRefAllele);
String hapAltAllele = history.get (startCol + kHapMapAltAllele);
double hapCeuRefFreq = parseDouble (history.get (startCol + kHapMapCeuRefFreq), kDoNotConvert);
double hapCeuAltFreq = parseDouble (history.get (startCol + kHapMapCeuAltFreq), kDoNotConvert);
double hapYriRefFreq = parseDouble (history.get (startCol + kHapMapYriRefFreq), kDoNotConvert);
double hapYriAltFreq = parseDouble (history.get (startCol + kHapMapYriAltFreq), kDoNotConvert);
double[] hapJptChbFreq = getCombinedFreq (history, startCol);
startCol += kHapMapCols;
double genomeEURFreq = parseDouble (history.get (startCol + k1kGenomeEUR), kDoNotConvert);
double genomeASNFreq = parseDouble (history.get (startCol + k1kGenomeASN), kDoNotConvert);
double genomeAFRFreq = parseDouble (history.get (startCol + k1kGenomeAFR), kDoNotConvert);
double genomeAMRFreq = parseDouble (history.get (startCol + k1kGenomeAMR), kDoNotConvert);
String genomeRefAllele = history.get (startCol + k1kGenomeRef);
String genomeAltAllele = history.get (startCol + k1kGenomeAlt);
startCol += k1kGenomeCols;
// Got the information we needed, now clear it all out
for (int i = startCol - 1; i >= firstCol; --i)
history.remove (i);
history.addAll (getESPMAFs (espMafs, ref, alt));
history.add (getBGIMAFs (bgiMajAllele, bgiMinAllele, bgiMajFreq, bgiMinFreq));
history.add (getHapMapMAFs (hapRefAllele, hapAltAllele, hapCeuRefFreq, hapCeuAltFreq));
history.add (getHapMapMAFs (hapRefAllele, hapAltAllele, hapYriRefFreq, hapYriAltFreq));
history.add (getHapMapMAFs (hapRefAllele, hapAltAllele, hapJptChbFreq[kRefOffset], hapJptChbFreq[kAltOffset]));
history.add (get1kGenomeMAFs (genomeRefAllele, genomeAltAllele, genomeEURFreq));
history.add (get1kGenomeMAFs (genomeRefAllele, genomeAltAllele, genomeASNFreq));
history.add (get1kGenomeMAFs (genomeRefAllele, genomeAltAllele, genomeAFRFreq));
history.add (get1kGenomeMAFs (genomeRefAllele, genomeAltAllele, genomeAMRFreq));
return history;
}
/**
* Add the Allele Frequency Source columns to the history metadata
*
* @param columns Where to add them
*/
private void addAlleleColumns (List<ColumnMetaData> columns)
{
columns.add (new ColumnMetaData (kEspEurMaf));
columns.add (new ColumnMetaData (kEspAfrMaf));
columns.add (new ColumnMetaData (kBGIFreq));
for (String columnName : kHapMapFreq)
columns.add (new ColumnMetaData (columnName));
for (String columnName : k1kGenomeFreq)
columns.add (new ColumnMetaData (columnName));
}
/**
* Return a list holding CEU and AFR ESP frequency results if they're there, or two empty strings if they aren't
*
* @param espMafs String holding ESP Minor allele frequencies in the format ["CEU", "AFR", "Total"]
* @param ref Reference base, will use first char, so can not be null or empty
* @param alt Alternate base, will use first char, so can not be null or empty
* @return List of two Strings, possibly with frequency data in them in the format "Alt/Ref,MinorAF/MajorAF"
*/
private List<String> getESPMAFs (String espMafs, String ref, String alt)
{
int len = espMafs.length ();
if (espMafs.equals (kBlank) || (len < 4))
return makeEmptiesList (2);
espMafs = espMafs.substring (2, len - 2); // Trim off [""]
int pos = espMafs.indexOf ('"');
if (pos < 0)
return makeEmptiesList (2);
double ceuMaf = parseDouble (espMafs.substring (0, pos), kDoNotConvert);
double afrMaf = Double.NaN;
pos = espMafs.indexOf ('"', pos + 1) + 1;
if (pos > 0)
{
int end = espMafs.indexOf ('"', pos + 1);
if (end > 0)
afrMaf = parseDouble (espMafs.substring (pos, end), kDoNotConvert);
}
// Make sure have something to report
boolean hasCEU = !((ceuMaf == 0.0) || (Double.isNaN (ceuMaf)));
boolean hasAFR = !((afrMaf == 0.0) || (Double.isNaN (afrMaf)));
if (!hasCEU && !hasAFR)
return makeEmptiesList (2);
List<String> results = new ArrayList<String> (2);
char majorBase = ref.charAt (0);
char minorBase = getFirstBase (alt);
if (hasCEU)
results.add (makeAlleleFrequency (majorBase, minorBase, ceuMaf, 1.0 - ceuMaf));
else
results.add (kBlank);
if (hasAFR)
results.add (makeAlleleFrequency (majorBase, minorBase, afrMaf, 1.0 - afrMaf));
else
results.add (kBlank);
return results;
}
/**
* Return a list holding BGI Danish frequency results if they're there, or one empty string if they aren't
*
* @param chromosome The chromosome the variant is on
* @param pos The position of the variant in the chromosome
* @param bgiMajAllele Major / Ref Allele
* @param bgiMinAllele Minor / Alt Allele
* @param bgiMajFreq Major / Ref Frequency
* @param bgiMinFreq Minor / Alt Frequency
* @return A String, possibly with frequency data in the format "Alt/Ref,MinorAF/MajorAF"
*/
private String getBGIMAFs (String bgiMajAllele, String bgiMinAllele, double bgiMajFreq, double bgiMinFreq)
{
if (bgiMajAllele.equals (kBlank) || bgiMinAllele.equals (kBlank))
return kBlank;
boolean hasRef = !((bgiMajFreq == 0.0) || (Double.isNaN (bgiMajFreq)));
boolean hasAlt = !((bgiMinFreq == 0.0) || (Double.isNaN (bgiMinFreq)));
if (!hasRef || !hasAlt)
return kBlank;
char majorBase = bgiMajAllele.charAt (0);
char minorBase = getFirstBase (bgiMinAllele);
return makeAlleleFrequency (majorBase, minorBase, bgiMinFreq, bgiMajFreq);
}
/**
* Return a list holding HapMap frequency results if they're there, or one empty string if they aren't
*
* @param hapRefAllele Major / Ref Allele
* @param hapAltAllele Minor / Alt Allele
* @param hapRefFreq Major / Ref Frequency
* @param hapAltFreq Minor / Alt Frequency
* @return A String, possibly with frequency data in the format "Alt/Ref,MinorAF/MajorAF"
*/
private String getHapMapMAFs (String hapRefAllele, String hapAltAllele, double hapRefFreq, double hapAltFreq)
{
if (hapRefAllele.equals (kBlank) || hapAltAllele.equals (kBlank))
return kBlank;
boolean hasRef = !((hapRefFreq == 0.0) || (Double.isNaN (hapRefFreq)));
boolean hasAlt = !((hapAltFreq == 0.0) || (Double.isNaN (hapAltFreq)));
if (!hasRef || !hasAlt)
return kBlank;
char majorBase = hapRefAllele.charAt (0);
char minorBase = getFirstBase (hapAltAllele);
return makeAlleleFrequency (majorBase, minorBase, hapAltFreq, hapRefFreq);
}
/**
* Return a list holding Thousand Genomes AlleleFreq objects if they're there, or null if they aren't
*
* @param chromosome The chromosome the variant is on
* @param pos The position of the variant in the chromosome
* @param pop Which population the data is for
* @param ref Reference base, will use first char, so can not be null or empty
* @param alt Alternate base, will use first char, so can not be null or empty
* @param altFreq Minor / Alt Frequency
* @return List of one AlleleFreq object, or null if there aren't any specified
*/
private String get1kGenomeMAFs (String ref, String alt, double altFreq)
{
if ((altFreq == 0.0) || (Double.isNaN (altFreq)))
return kBlank;
char majorBase = ref.charAt (0);
char minorBase = getFirstBase (alt);
return makeAlleleFrequency (majorBase, minorBase, altFreq, 1.0 - altFreq);
}
/**
* Go through the History metadata finding which columns hold the Alt and Ref positions
*
* @param baseStrs Strings to look for matches for
* @return Array of ints, same size as baseStrs, holding the matches, or -1 if not matched
*/
private static final int[] getPositions (String[] baseStrs)
{
HistoryMetaData metaData = History.getMetaData ();
List<ColumnMetaData> columns = metaData.getColumns ();
int numStrs = baseStrs.length;
int[] results = new int[numStrs];
int leftToFind = numStrs;
for (int i = 0; i < numStrs; ++i)
results[i] = -1;
int pos = 0;
for (ColumnMetaData column : columns)
{
String test = column.getColumnName ().toUpperCase ();
for (int i = 0; i < numStrs; ++i)
{
if (test.equals (baseStrs[i]))
{
results[i] = pos;
--leftToFind;
if (leftToFind == 0)
return results;
break; // Strings are different, so if found a match done looking for this test string
}
}
++pos;
}
return results;
}
/**
* Get the Ref and Alt frequencies for HapMap JPT+CHB populations
*
* @param history Object to get the relevant strings from
* @param startCol Where to start looking in the History
* @return Array of two doubles. Both might be NaN, or double between 0 and 1
*/
private static final double[] getCombinedFreq (History history, int startCol)
{
int[] jptCounts = getCounts (history, startCol + kHapMapJptRefCount);
int[] chbCounts = getCounts (history, startCol + kHapMapChbRefCount);
int ref = jptCounts[kRefOffset] + chbCounts[kRefOffset];
int alt = jptCounts[kAltOffset] + chbCounts[kAltOffset];
int total = jptCounts[kTotalOffset] + chbCounts[kTotalOffset];
double[] results = new double[kNumResults];
if ((total == 0) || (alt == 0)) // If alt is zero, there's nothing to report
results[kRefOffset] = results[kRefOffset] = Double.NaN;
else
{
results[kRefOffset] = ((double) ref) / total;
results[kAltOffset] = ((double) alt) / total;
}
return results;
}
/**
* Get the counts for a population group
*
* @param history Where to get the counts from
* @param startCol First column to look at
* @return An array of three ints. It will have all 0s if there are no value, but it will
* always exist and have three entries
*/
private static final int[] getCounts (History history, int startCol)
{
int[] results = new int[kNumOffsets];
results[kRefOffset] = parseInt (history.get (startCol + kRefOffset));
results[kAltOffset] = parseInt (history.get (startCol + kAltOffset));
results[kTotalOffset] = parseInt (history.get (startCol + kTotalOffset));
return results;
}
/**
* Look through a String for the first A, C, G, T. Failing to find any of those, look for hte first
* letter. Failing that, return 'A'
*
* @param baseStr String to look through. Must not be null
* @return A Character, guaranteed to be a capital letter
*/
private static final char getFirstBase (String baseStr)
{
baseStr = baseStr.toUpperCase (); // Make sure everything's upper case
int numBases = baseStr.length ();
char base;
for (int i = 0; i < numBases; ++i)
{
base = baseStr.charAt (i);
switch (base)
{
case 'A':
case 'C':
case 'G':
case 'T':
return base;
}
}
// Couldn't find an A, T, C, or G, go for any letter
for (int i = 0; i < numBases; ++i)
{
base = baseStr.charAt (i);
if (Character.isLetter (base))
return base;
}
// Couldn't find anything, return the default
return 'A';
}
/**
* Make an allele frequency string
*
* @param ref The reference char
* @param alt The alternate char
* @param minor The minor allele frequency, a number between 0 and 1
* @param major The major allele frequency, a number between 0 and 1
* @return String of frequency data in them in the format "Alt/Ref,MinorAF/MajorAF"
*/
private static final String makeAlleleFrequency (char ref, char alt, double minor, double major)
{
StringBuilder result = new StringBuilder ();
result.append (alt);
result.append (kMinorSplit);
result.append (ref);
result.append (kMajorSplit);
result.append (minor);
result.append (kMinorSplit);
result.append (major);
return result.toString ();
}
/**
* Make a List with the specified number of blank Strings in it
*
* @param count Number of blank strings to add to the list
* @return List with Math.max(0, count) blank strings in it
*/
private static final List<String> makeEmptiesList (int count)
{
List<String> results = new ArrayList<String> (count);
for (int i = 0; i < count; ++i)
results.add (kBlank);
return results;
}
/**
* Parse a String, returning the int represented, or 0 if not an int
*
* @param theInt String to parse. Must not be null
* @return An integer, 0 if parsing failed
*/
private static final int parseInt (String theInt)
{
int result = 0;
if (!theInt.equals (kBlank))
{
try
{
result = Integer.parseInt (theInt);
}
catch (NumberFormatException oops)
{
// Do nothing
}
}
return result;
}
/**
* Parse a String, returning the double represented, or NaN if not a double
*
* @param theDouble String to parse. Must not be null
* @return An double, NaN if parsing failed
*/
private static final double parseDouble (String theDouble, boolean convertFromPercent)
{
double result = Double.NaN;
if (!theDouble.equals (kBlank))
{
try
{
result = Double.parseDouble (theDouble);
if (convertFromPercent)
result /= kPercentAdjust;
}
catch (NumberFormatException oops)
{
// Do nothing
}
}
return result;
}
}