/* * Concept profile generation and analysis for Gene-Disease paper * Copyright (C) 2015 Biosemantics Group, Leiden University Medical Center * Leiden, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package analysis; import static KnowledgeTransfer.PathConfigs.MATCH_SCORE_FILENAME; import static KnowledgeTransfer.PathConfigs.RESULTS_BASE_DIR; import java.math.BigDecimal; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.utilities.ReadCSVFile; import com.google.common.collect.Sets; public class Figure1h { private static final String OUTPUT = RESULTS_BASE_DIR + "Figure1h.txt"; private static final int BUCKET_DECIMAL_PLACE = 1; //private static final int NUM_EXPECTED_MATCHSCORES = 1899; // on inputs with 10 genes, diseases private static final int NUM_EXPECTED_MATCHSCORES = 204072353; // on full input creating ~400M pairs: also 23 negative infinite private static int lineCount = 0; public static void main(String[] args) { ReadCSVFile input = new ReadCSVFile(MATCH_SCORE_FILENAME + "-coOcc.txt"); Iterator<List<String>> it = input.iterator(); Map<Double, Integer> explicitHist = new HashMap<Double, Integer>(); Map<Double, Integer> implicitHist = new HashMap<Double, Integer>(); Integer implicitCnt = 0; Integer explicitCnt = 0; /* * Collect histogram data */ while(it.hasNext()) { List<String> fields = it.next(); Integer conceptA = Integer.parseInt(fields.get(0)); Integer conceptB = Integer.parseInt(fields.get(1)); Double matchScore = Double.parseDouble(fields.get(2)); String coOccurrences = fields.get(3); if( lineCount % 1000000 == 0 ) { System.out.println("Processed: " + lineCount); } lineCount++; if(!matchScore.isNaN()) { // null if one concept has no profile if(coOccurrences.equals("[]")) { // implicit link implicitCnt += addMatchScoreToHistogram(implicitHist, matchScore); } else { // explicit link explicitCnt += addMatchScoreToHistogram(explicitHist, matchScore); } } } /* * Print histograms */ //printHistogram(implicitHist); //printHistogram(explicitHist); relativeHistogram(explicitHist, implicitHist); System.out.println("Number of concept pairs with match score: " + (implicitCnt + explicitCnt)); System.out.println("Number of expected concept pairs used for calculating percentile (first column): " + NUM_EXPECTED_MATCHSCORES); System.out.println("If the former number does not match the latter, then percentile score is incorrect (use different constant for NUM_EXPECTED_MATCHSCORES or check for unexpected infinite valued match scores)."); } public static void printHistogram(Map<Double, Integer> histogram) { Double max = Collections.max(histogram.keySet()); Double min = Collections.min(histogram.keySet()); Integer foundInBuckets = 0; for(Double i=max; i>=min; i-=1.0/Math.pow(10,BUCKET_DECIMAL_PLACE)) { Double bucket = (new BigDecimal(i)).setScale(BUCKET_DECIMAL_PLACE, BigDecimal.ROUND_HALF_UP).doubleValue(); Integer cnt = null; if(histogram.containsKey(bucket)) { cnt = histogram.get(bucket); } else { cnt = 0; } foundInBuckets += cnt; System.out.println(bucket + "\t" + cnt + "\t" + foundInBuckets); } } /* * print the relative, normalized fraction of explicit to implicit associations */ public static Map<Double, Integer> relativeHistogram(Map<Double, Integer> explicitHist, Map<Double, Integer> implicitHist) { Map<Double, Integer> result = new HashMap<Double, Integer>(); Set<Double> allIndices = Sets.union(explicitHist.keySet(), implicitHist.keySet()); Double max = Collections.max(allIndices); Double min = Collections.min(allIndices); Integer explCumul = 0; Integer implCumul = 0; System.out.println("perc\tscore\t#impl\t#expl\t#implC\t#explC\t%impl"); for(Double i=max; i>=min; i-=1.0/Math.pow(10,BUCKET_DECIMAL_PLACE)) { Double bucket = (new BigDecimal(i)).setScale(BUCKET_DECIMAL_PLACE, BigDecimal.ROUND_HALF_UP).doubleValue(); Integer explCnt = 0; Integer implCnt = 0; if(explicitHist.containsKey(bucket)) explCnt = explicitHist.get(bucket); if(implicitHist.containsKey(bucket)) implCnt = implicitHist.get(bucket); implCumul += implCnt; explCumul += explCnt; //TODO: add %expl Double percentile = (1 - (((double) implCumul+explCumul) / NUM_EXPECTED_MATCHSCORES)) * 100; System.out.printf("%.3f\t" + bucket + "\t" + implCnt + "\t" + explCnt + "\t" + implCumul + "\t" + explCumul + "\t", percentile); if(explCumul+implCumul > 0) { System.out.printf("%.3f\n", (double)implCumul/(explCumul+implCumul)); } else { System.out.println("-"); } } return result; } public static Integer addMatchScoreToHistogram(Map<Double, Integer> histogram, Double matchScore) { assert matchScore <= 1 && matchScore >= 0 ; Double bucket = (double) Math.log10(matchScore); assert bucket <= 0; //TODO: count number of infinite values, since this influences NUM_EXPECTED_MATCHSCORES if( bucket.isNaN() || bucket.isInfinite() ) { System.out.println("NaN or inifinite matchscore in input line " + lineCount + " (skipping..)"); return 0; } BigDecimal bd = (new BigDecimal(bucket)).setScale(BUCKET_DECIMAL_PLACE, BigDecimal.ROUND_HALF_UP); bucket = bd.doubleValue(); if(!histogram.containsKey(bucket)) { histogram.put(bucket, 1); } else { histogram.put(bucket, histogram.get(bucket)+1); } return 1; } }