/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.statistics; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.gui.chart.ScatterPlotDialog; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot; import org.fhcrc.cpl.toolbox.datastructure.Pair; import java.util.Map; import java.util.HashMap; import java.util.List; import java.util.ArrayList; import java.io.IOException; import java.io.InputStream; /** * Utilities for regression analysis */ public class RegressionUtilities { protected static Logger _log = Logger.getLogger(RegressionUtilities.class); //key definitions for accessing regression results public static final String REGRESSION_SLOPE_KEY = "REGRESSION_SLOPE_KEY"; public static final String REGRESSION_INTERCEPT_KEY = "REGRESSION_INTERCEPT_KEY"; public static final String REGRESSION_SIGMA_KEY = "REGRESSION_SIGMA_KEY"; public static final int DEFAULT_MAX_MILLIS_FOR_ROBUST_REGRESSION = 180000; /** * Cover method. Inefficiently converts to doubles * @param categories these should be distinct values -- practically speaking, ints would have been better. * double[] for simplicity * @param outcomes * @return */ public static AnovaResult oneWayAnova(List<? extends Number> categories, List<? extends Number> outcomes) { double[] categoriesDouble = new double[categories.size()]; double[] outcomesDouble = new double[categories.size()]; for (int i=0; i<categories.size(); i++) { categoriesDouble[i] = categories.get(i).doubleValue(); outcomesDouble[i] = outcomes.get(i).doubleValue(); } return oneWayAnova(categoriesDouble, outcomesDouble); } /** * * @param categories these should be distinct values -- practically speaking, ints would have been better. * double[] for simplicity * @param outcomes * @return */ public static AnovaResult oneWayAnova(double[] categories, double[] outcomes) { Map<String,double[]> variableValueMap = new HashMap<String,double[]>(2); variableValueMap.put("x",categories); variableValueMap.put("y",outcomes); String rResponse = RInterface.evaluateRExpression("summary(aov(y~x))", variableValueMap, new String[] {"MASS"}); AnovaResult result = new AnovaResult(); String textLine2 = rResponse.split("\n")[1]; String[] stringChunks = textLine2.split("\\s+"); result.setFValue(Float.parseFloat(stringChunks[4])); //If there's an extremely small p-value, "<" will appear in position 5 if ("<".equals(stringChunks[5])) result.setPValue(0f); else result.setPValue(Float.parseFloat(stringChunks[5])); return result; } public static double[] robustRegression(double[] xset, double[] yset) { return robustRegression(xset, yset, DEFAULT_MAX_MILLIS_FOR_ROBUST_REGRESSION); } public static double[] robustRegression(double[] xset, double[] yset, int millis) { Map<String,double[]> variableValueMap = new HashMap<String,double[]>(2); variableValueMap.put("x",xset); variableValueMap.put("y",yset); //robust regression can be a time-consuming business, need to allow //plenty of time for R to return //TODO: parameterize time to wait? return RInterface.processRCoefficientResponse(RInterface.evaluateRExpression("coefficients(rlm((y~x)));", variableValueMap, new String[] {"MASS"}, millis)); } /** * Given a slope and intercept of a line relating x and y, predict x from y * @param slope * @param intercept * @param x * @return */ public static double predictYFromX(double slope, double intercept, double x) { return slope * x + intercept; } /** * Given a slope and intercept of a line relating x and y, predict y from x * @param slope * @param intercept * @return */ public static double predictXFromY(double slope, double intercept, double y) { return (y - intercept) / slope; } /** * Default: two coefficients * @param xset * @param yset * @return * @throws java.io.IOException */ public static double[] modalRegression(double[] xset, double[] yset) throws IOException { return modalRegression(xset, yset, 1); } public static double[] modalRegression(List<? extends Number> xset, List<? extends Number> yset, int degree) throws IOException { double[] xsetArray = new double[xset.size()]; double[] ysetArray = new double[xset.size()]; for (int i=0; i<xsetArray.length; i++) { xsetArray[i] = xset.get(i).doubleValue(); ysetArray[i] = yset.get(i).doubleValue(); } return modalRegression(xsetArray, ysetArray, degree); } public static double[] modalRegression(List<? extends Number> xset, List<? extends Number> yset, int degree, double maxLeverageNumerator, double maxStudentizedResidual) throws IOException { double[] xsetArray = new double[xset.size()]; double[] ysetArray = new double[xset.size()]; for (int i=0; i<xsetArray.length; i++) { xsetArray[i] = xset.get(i).doubleValue(); ysetArray[i] = yset.get(i).doubleValue(); } Pair<double[], double[]> passingXYValues = selectValuesWithLowLeverageAndStudentizedResidual(xsetArray, ysetArray, maxLeverageNumerator, maxStudentizedResidual, false, 1, false, true); return modalRegression(passingXYValues.first, passingXYValues.second, degree); } /** * Call Yan's Modal Regression R code. * This code is dependent on the "quantreg" library being installed. If that package * isn't installed, it will throw IOException. * * Steps to install: * * source("http://bioconductor.org/biocLite.R") * biocLite(c("quantreg")) * * todo: require some minum cardinality (5?), and throw an IllegalArg if not enough datapoints. Because R will fail * * @param xset * @param yset * @param degree The degree of the polynomial. Minimum 1 * @return * @throws IOException */ public static double[] modalRegression(double[] xset, double[] yset, int degree) throws IOException { if (degree < 1) throw new RuntimeException("Expected degree parameter >=1"); //first write the source, then run the function StringBuffer modalRegressionSourceCodeBuf = new StringBuffer(); InputStream in = RegressionUtilities.class.getResourceAsStream("modal_regression.R"); int inByte; while ((inByte = in.read()) != -1) modalRegressionSourceCodeBuf.append((char) inByte); in.close(); Map<String,double[]> variableValueMap = new HashMap<String,double[]>(2); variableValueMap.put("x",xset); variableValueMap.put("y",yset); //for higher-degree regression String expandXsetCommand = ""; if (degree > 1) { String expansionString = ""; for (int i=1; i<=(degree); i++) { expansionString = expansionString + "x^" + i; if (i < degree) expansionString = expansionString + ", "; } expandXsetCommand = "x<-matrix(c(" + expansionString + "), nrow=length(x), byrow=F);"; } String rCommand = modalRegressionSourceCodeBuf.toString() + expandXsetCommand + " modal_regress(x,y)"; //modal regression can be a time-consuming business, need to allow //plenty of time for R to return //TODO: parameterize time to wait? String rResponse = null; try { rResponse = RInterface.evaluateRExpression(rCommand, variableValueMap, new String[] {"quantreg"}, 150000); } catch (RuntimeException e) { throw new IOException("Failure running R for modal regression. Type: " + e.getClass().getName() + ", Message: " + e.getMessage() + ". Make sure quantreg package is installed"); } return RInterface.processRCoefficientResponse(rResponse); } public static int[] selectIndexesWithLowAbsoluteSomething(double[] somethings, double maxSomething) { List<Integer> resultList = new ArrayList<Integer>(); for (int i=0; i<somethings.length; i++) { if (Math.abs(somethings[i]) < maxSomething) { resultList.add(i); } } int[] result = new int[resultList.size()]; for (int i=0; i<resultList.size(); i++) result[i] = resultList.get(i); return result; } public static double mapValueUsingCoefficients(double[] coefficients, double valueToMap) { double result = 0; for (int j=0; j<coefficients.length; j++) result += coefficients[j] * Math.pow(valueToMap, j); return result; } public static List<Double> mapValuesUsingCoefficients(double[] coefficients, List<? extends Number> valuesToMap) { double[] valuesToMapArray = new double[valuesToMap.size()]; for (int i=0; i<valuesToMap.size(); i++) { valuesToMapArray[i] = valuesToMap.get(i).doubleValue(); } double[] resultArray = mapValuesUsingCoefficients(coefficients, valuesToMapArray); List<Double> result = new ArrayList<Double>(); for (double x : resultArray) result.add(x); return result; } public static double[] mapValuesUsingCoefficients(double[] coefficients, double[] valuesToMap) { double[] result = new double[valuesToMap.length]; for (int i=0; i<result.length; i++) result[i] = mapValueUsingCoefficients(coefficients, i); return result; } public static int[] selectIndexesWithLowLeverageAndStudentizedResidual( List<? extends Number> xValues, List<? extends Number> yValues, double leverageNumerator, double maxStudentizedResidual, boolean modalRegression, int degree, boolean showCharts, boolean assumePositiveCorr) { double[] xArray = new double[xValues.size()]; double[] yArray = new double[xValues.size()]; for (int i=0; i<xValues.size(); i++) { xArray[i] = xValues.get(i).doubleValue(); yArray[i] = yValues.get(i).doubleValue(); } return selectIndexesWithLowLeverageAndStudentizedResidual(xArray, yArray, leverageNumerator, maxStudentizedResidual, modalRegression, degree, showCharts, assumePositiveCorr); } public static Pair<double[], double[]> selectValuesWithLowLeverageAndStudentizedResidual( List<? extends Number> xValues, List<? extends Number> yValues, double leverageNumerator, double maxStudentizedResidual, boolean modalRegression, int degree, boolean showCharts, boolean assumePositiveCorr) { double[] xArray = new double[xValues.size()]; double[] yArray = new double[xValues.size()]; for (int i=0; i<xValues.size(); i++) { xArray[i] = xValues.get(i).doubleValue(); yArray[i] = yValues.get(i).doubleValue(); } return selectValuesWithLowLeverageAndStudentizedResidual(xArray, yArray, leverageNumerator, maxStudentizedResidual, modalRegression, degree, showCharts, assumePositiveCorr); } public static Pair<double[], double[]> selectValuesWithLowLeverageAndStudentizedResidual( double[] xValues, double[] yValues, double leverageNumerator, double maxStudentizedResidual, boolean modalRegression, int degree, boolean showCharts, boolean assumePositiveCorr) { int[] passingIndexes = selectIndexesWithLowLeverageAndStudentizedResidual(xValues, yValues, leverageNumerator, maxStudentizedResidual, modalRegression, degree, showCharts, assumePositiveCorr); double[] newX = new double[passingIndexes.length]; double[] newY = new double[passingIndexes.length]; for (int i=0; i<passingIndexes.length; i++) { newX[i] = xValues[passingIndexes[i]]; newY[i] = yValues[passingIndexes[i]]; } return new Pair<double[], double[]>(newX, newY); } /** * Does what it says on the tin * @param xValues * @param yValues * @param leverageNumerator * @param maxStudentizedResidual * @param modalRegression * @param degree * @param showCharts * @param assumePositiveCorr Assume positive correlation? If so, we can try to eliminate regression bias * @return */ public static int[] selectIndexesWithLowLeverageAndStudentizedResidual( double[] xValues, double[] yValues, double leverageNumerator, double maxStudentizedResidual, boolean modalRegression, int degree, boolean showCharts, boolean assumePositiveCorr) { int nAll = xValues.length; double maxLeverage = leverageNumerator / (double) nAll; _log.debug("selectFeaturesWithLowLeverageAndStudentizedResidual, starting with " + nAll + ", maxLeverageNumerator=" + leverageNumerator + ", maxLeverage=" + maxLeverage + ", maxStudRes=" + maxStudentizedResidual); double[] leverages = BasicStatistics.leverages(xValues); //for (double leverage : leverages) // System.err.println("" + leverage); //PanelWithHistogram pwh = new PanelWithHistogram(leverages); //pwh.displayDialog("leverages"); int[] indexesWithLowLeverage = selectIndexesWithLowAbsoluteSomething( leverages, maxLeverage); int nLowLeverage = indexesWithLowLeverage.length; double[] xValuesWithLowLeverage = new double[nLowLeverage]; double[] yValuesWithLowLeverage = new double[nLowLeverage]; for (int i=0; i<nLowLeverage; i++) { xValuesWithLowLeverage[i] = xValues[indexesWithLowLeverage[i]]; yValuesWithLowLeverage[i] = yValues[indexesWithLowLeverage[i]]; } double[] simpleRegressionResult = MatrixUtil.linearRegression(xValuesWithLowLeverage, yValuesWithLowLeverage); _log.debug("selectFeaturesWithLowLeverageAndStudentizedResidual, low leverage: " + nLowLeverage); if (showCharts && _log.isDebugEnabled()) { PanelWithScatterPlot pwsp = new PanelWithScatterPlot(xValuesWithLowLeverage, yValuesWithLowLeverage, "lowleverage"); pwsp.addLine(simpleRegressionResult[1], simpleRegressionResult[0], BasicStatistics.min(xValuesWithLowLeverage), BasicStatistics.max(xValuesWithLowLeverage)); pwsp.displayInTab(); } double[] regressionResult = null; if (!modalRegression) { //if not doing modal regression, we may not actually use simple regression, either. We can correct for //bias toward the lower right quadrant by performing the regression twice, once inverted, and averaging //the two sets of coefficients _log.debug("forward: " + simpleRegressionResult[1] + "x + " + simpleRegressionResult[0] + " = y"); if (!assumePositiveCorr) regressionResult = simpleRegressionResult; else { double[] regressionResultInverse = MatrixUtil.linearRegression(yValuesWithLowLeverage, xValuesWithLowLeverage); _log.debug("backward: " + regressionResultInverse[1] + "x + " + regressionResultInverse[0] + " = y"); regressionResult = new double[2]; double b1Inverse = 1.0 / regressionResultInverse[1]; double b0Inverse = -regressionResultInverse[0] / regressionResultInverse[1]; _log.debug("backward translated: " + b1Inverse + "x + " + b0Inverse + " = y"); regressionResult[1] = (simpleRegressionResult[1] + b1Inverse) / 2; regressionResult[0] = (simpleRegressionResult[0] + b0Inverse) / 2; _log.debug("average: " + regressionResult[1] + "x + " + regressionResult[0] + " = y"); } } else { try { regressionResult = modalRegression(xValuesWithLowLeverage, yValuesWithLowLeverage, degree); } catch (IOException e) { throw new RuntimeException(e); } } double[] residuals = new double[nLowLeverage]; for (int i=0; i<xValuesWithLowLeverage.length; i++) { double predictedValue = mapValueUsingCoefficients(regressionResult, xValuesWithLowLeverage[i]); residuals[i] = yValuesWithLowLeverage[i] - predictedValue; } double[] studentizedResiduals = BasicStatistics.studentizedResiduals(xValuesWithLowLeverage, residuals, leverages); int[] indexesWithLowStudentizedResidual = selectIndexesWithLowAbsoluteSomething(studentizedResiduals, maxStudentizedResidual); _log.debug("selectFeaturesWithLowLeverageAndStudentizedResidual, result: " + indexesWithLowStudentizedResidual.length); int nResult = indexesWithLowStudentizedResidual.length; int[] result = new int[nResult]; for (int i=0; i<nResult; i++) result[i] = indexesWithLowLeverage[indexesWithLowStudentizedResidual[i]]; if (showCharts) { int maxX = 0; int minX = Integer.MAX_VALUE; double[] xValuesWithLowStudRes = new double[indexesWithLowStudentizedResidual.length]; double[] yValuesWithLowStudRes = new double[indexesWithLowStudentizedResidual.length]; for (int i =0; i<indexesWithLowStudentizedResidual.length; i++) { xValuesWithLowStudRes[i] = xValuesWithLowLeverage[indexesWithLowStudentizedResidual[i]]; yValuesWithLowStudRes[i] = yValuesWithLowLeverage[indexesWithLowStudentizedResidual[i]]; } for (double x : xValuesWithLowStudRes) { if (x > maxX) maxX = (int) x; if (x < minX) minX = (int) x; } PanelWithScatterPlot pwsp = new PanelWithScatterPlot(); pwsp.setName("LevAndStudRes"); int numDotsOnChart = (maxX-minX+1) / 2; double[] chartXVals = new double[numDotsOnChart]; double[] chartYVals = new double[numDotsOnChart]; for (int j=0; j<numDotsOnChart; j++) { chartXVals[j] = minX + (2 * j); chartYVals[j] = mapValueUsingCoefficients(regressionResult, chartXVals[j]); } pwsp.addData(xValuesWithLowStudRes, yValuesWithLowStudRes, "Matches with low stud. res."); pwsp.addData(xValues, yValues, "all mass matches"); pwsp.addData(chartXVals, chartYVals, "regression function"); pwsp.setAxisLabels("X","Y"); pwsp.displayInTab(); } return result; } public static double[] symmetricalRegression(List<? extends Number> xset, List<? extends Number> yset) { double[] xsetArray = new double[xset.size()]; double[] ysetArray = new double[xset.size()]; for (int i=0; i<xsetArray.length; i++) { xsetArray[i] = xset.get(i).doubleValue(); ysetArray[i] = yset.get(i).doubleValue(); } return symmetricalRegression(xsetArray, ysetArray); } /** * If we assume a positive correlation between variables, we can correct for *bias toward the lower right quadrant by performing the regression twice, once inverted, and averaging *the two sets of coefficients * @param xValues * @param yValues * @return */ public static double[] symmetricalRegression(double[] xValues, double[] yValues) { double[] simpleRegressionResult = MatrixUtil.linearRegression(xValues, yValues); double[] regressionResultInverse = MatrixUtil.linearRegression(yValues, xValues); _log.debug("backward: " + regressionResultInverse[1] + "x + " + regressionResultInverse[0] + " = y"); double[] regressionResult = new double[2]; double b1Inverse = 1.0 / regressionResultInverse[1]; double b0Inverse = -regressionResultInverse[0] / regressionResultInverse[1]; _log.debug("backward translated: " + b1Inverse + "x + " + b0Inverse + " = y"); regressionResult[1] = (simpleRegressionResult[1] + b1Inverse) / 2; regressionResult[0] = (simpleRegressionResult[0] + b0Inverse) / 2; _log.debug("average: " + regressionResult[1] + "x + " + regressionResult[0] + " = y"); return regressionResult; } public static class AnovaResult { protected float fValue; protected float pValue; public float getFValue() { return fValue; } public void setFValue(float fValue) { this.fValue = fValue; } public float getPValue() { return pValue; } public void setPValue(float pValue) { this.pValue = pValue; } } }