/* * Concept profile generation and analysis for Gene-Disease paper * Copyright (C) 2015 Biosemantics Group, Leiden University Medical Center * Leiden, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package KnowledgeTransfer; import static KnowledgeTransfer.ConceptProfileUtil.loadConceptFrequencies; import static KnowledgeTransfer.ConceptProfileUtil.readCidFile; import static KnowledgeTransfer.ConceptProfileUtil.readConceptProfilesByID; import static KnowledgeTransfer.PathConfigs.CONCEPT_FREQUENCIES_FILENAME; import static KnowledgeTransfer.PathConfigs.CONCEPT_PROFILES_DIR; import static KnowledgeTransfer.PathConfigs.CPGP_BASE_DIR; import static KnowledgeTransfer.PathConfigs.HPRD_GENE_CIDS; import static KnowledgeTransfer.PathConfigs.MATCH_SCORE_FILENAME; import static KnowledgeTransfer.PathConfigs.MEDLINE_GROUNDHOG_FOLDER_NAME; import static KnowledgeTransfer.PathConfigs.THESAURUS_DISEASE_CIDS; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import org.erasmusmc.collections.IntList; import org.erasmusmc.collections.SortedIntListSet; import org.erasmusmc.groundhog.Groundhog; import org.erasmusmc.groundhog.GroundhogManager; import org.erasmusmc.ontology.ConceptVector; import org.erasmusmc.ontology.ConceptVectorRecord; import org.erasmusmc.utilities.WriteCSVFile; import textmining.myprofiles.ContingencyTable; /** * @author * * Create concept profiles for all gene and disease concepts and calculate a match score for * each gene-disease combination (if both the gene and disease have a profile).<br><br> * * INPUT REQUIRED 1: concept frequencies * {@link KnowledgeTransfer.ConceptFrequencies}<br><br> * * INPUT REQUIRED 2: medline index * A BerkeleyDB (aka groundhog) that provides two lookup functions:<br> * 1) getRecordIDsForConcept(conceptid): return a list of all PMIDs in which this concept occurs<br> * 2) get(pmid): get a list of concepts that occur in this PMID<br><br> * * OUTPUT 1: concept profiles (each file named with conceptIdA)<br> * format: <code>conceptIdB , weight</code><br> * example row: <code>35203,4.4286979627989374E-5</code><br><br> * * OUTPUT 2: match scores<br> * format: <code>conceptIdA> , conceptIdB , match score</code><br> * example row: <code></code> * */ public class BuildConceptProfileAndMatch { static final int TOTAL_NUMBER_OF_ABSTRACTS = 17062496; static Groundhog documentProfilesGroundhog; static HashMap<Integer, Integer> conceptid2frequency = new HashMap<Integer, Integer>(); public static void main(String[] args) { conceptid2frequency = loadConceptFrequencies(CONCEPT_FREQUENCIES_FILENAME); // Declareer een medline groundhog volgens de legacy code. GroundhogManager groundhogmanager2 = new GroundhogManager(CPGP_BASE_DIR); documentProfilesGroundhog = groundhogmanager2.getGroundhog(MEDLINE_GROUNDHOG_FOLDER_NAME); List<Integer> genes = readCidFile(HPRD_GENE_CIDS); List<Integer> diseases = readCidFile(THESAURUS_DISEASE_CIDS); // System.out.println((new java.util.Date()) + // " start creating concept profiles"); List<Integer> allConceptIDS = new ArrayList<Integer>(); allConceptIDS.addAll(genes); allConceptIDS.addAll(diseases); generateAndWriteConceptProfiles(allConceptIDS, conceptid2frequency); Map<Integer, Map<Integer, Double>> conceptProfiles = readConceptProfilesByID( allConceptIDS, CONCEPT_PROFILES_DIR); writeMatchScores(genes, diseases, conceptProfiles); System.out.println((new java.util.Date()) + " Done!"); // TextFileUtilities.saveToFile(conceptprofile, fileout); } public static void writeMatchScores(List<Integer> genes, List<Integer> diseases, Map<Integer, Map<Integer, Double>> conceptProfiles) { // List<ConceptPairData> result = new ArrayList<ConceptPairData>(); int cnt = 0; System.out.println((new java.util.Date()) + " writing match scores to: " + MATCH_SCORE_FILENAME); WriteCSVFile output = new WriteCSVFile(MATCH_SCORE_FILENAME); for (Integer gene : genes) { for (Integer dis : diseases) { if ((++cnt % 10000) == 0) { System.out.println((new java.util.Date()) + " number of profiles matched: " + cnt); } // //3063788,1834821,NaN // if (gene == 3063788 && dis == 1834821) // { // System.out.println("Stop! Hammer time!"); // } Map<Integer, Double> cp1 = conceptProfiles.get(gene); Map<Integer, Double> cp2 = conceptProfiles.get(dis); double ip = SingleMatchscore.InnerProduct(cp1, cp2); output.write(Arrays.asList(String.valueOf(gene), String.valueOf(dis), String.valueOf(ip))); // result.add(new ConceptPairData(gene, dis, ip)); } } output.close(); // return result; } public static Map<Integer, Map<Integer, Double>> generateAndWriteConceptProfiles( List<Integer> cids, HashMap<Integer, Integer> cid2count) { Map<Integer, Map<Integer, Double>> result = new HashMap<Integer, Map<Integer, Double>>(); int cnt = 0; for (Integer cid : cids) { if ((++cnt % 10) == 0) { System.out.println((new java.util.Date()) + " number of profiles created: " + cnt); } Map<Integer, Double> cp = generateConceptProfile(cid, cid2count); if (cp != null) { result.put(cid, cp); } } return result; } public static Map<Integer, Double> generateConceptProfile( Integer conceptid, HashMap<Integer, Integer> cid2count) { HashMap<Integer, Double> result = new HashMap<Integer, Double>(); SortedIntListSet pmids = documentProfilesGroundhog .getRecordIDsForConcept(conceptid); if (pmids.size() < 6 || pmids.size() > 140000) return null; HashMap<Integer, Integer> concept2frequency = new HashMap<Integer, Integer>(); for (Integer pmid : pmids) { ConceptVectorRecord cvr = documentProfilesGroundhog.get(pmid); ConceptVector cv = cvr.getConceptVector(); IntList w = cv.values.keys(); for (Integer concept : w) { Integer frequency = concept2frequency.get(concept); if (frequency == null) frequency = 0; frequency++; concept2frequency.put(concept, frequency); } } int frequencyA = pmids.size(); for (Integer key : concept2frequency.keySet()) { int frequencyB = cid2count.get(key); double M11 = concept2frequency.get(key); double M01 = frequencyB - M11; double M10 = frequencyA - M11; double M00 = TOTAL_NUMBER_OF_ABSTRACTS - M11 - M01 - M10; double uc = ContingencyTable.UncertaintyCoefficient(M11, M10, M01, M00); result.put(key, uc); } WriteCSVFile output = new WriteCSVFile(CONCEPT_PROFILES_DIR + Integer.toString(conceptid)); for (Integer concept : result.keySet()) { output.write(Arrays.asList(Integer.toString(concept), Double.toString(result.get(concept)))); } output.close(); return result; } }