AddCoOccToMatchScore.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation and analysis for Gene-Disease paper
 * Copyright (C) 2015 Biosemantics Group, Leiden University Medical Center
 *  Leiden, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package analysis;

import static KnowledgeTransfer.PathConfigs.CPGP_BASE_DIR;
import static KnowledgeTransfer.PathConfigs.MATCH_SCORE_FILENAME;
import static KnowledgeTransfer.PathConfigs.MEDLINE_GROUNDHOG_FOLDER_NAME;
import static com.google.common.collect.Sets.intersection;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.erasmusmc.groundhog.Groundhog;
import org.erasmusmc.groundhog.GroundhogManager;
import org.erasmusmc.utilities.ReadCSVFile;
import org.erasmusmc.utilities.WriteCSVFile;

import KnowledgeTransfer.PathConfigs;

/**
 * Command-line tool that reads the match score file and, for every concept
 * pair with a usable (non-NaN) match score, writes a row to {@link #OUTPUT}
 * that additionally contains the number of record IDs the two concepts have
 * in common in the Medline groundhog.
 */
public class AddCoOccToMatchScore {

	/**
	 * Path of the file written by {@link #main(String[])}. Each row holds:
	 * concept A, concept B, co-occurrence count, match score.
	 */
	public static final String OUTPUT = PathConfigs.RESULTS_BASE_DIR + "concept_pair_data.txt";

	/** Groundhog used to look up record IDs per concept; assigned in {@link #main(String[])}. */
	public static Groundhog documentProfilesGroundhog;

	/** A progress line is printed each time this many concept pairs have been written. */
	private static final int PROGRESS_INTERVAL = 1000;

	/**
	 * Reads every row of the match score file (fields: concept A, concept B,
	 * match score), skips rows whose match score is NaN, and writes the pair
	 * together with the size of the intersection of both concepts' record ID
	 * sets to {@link #OUTPUT}.
	 *
	 * @param args not used
	 * @throws NumberFormatException if a concept ID or match score field
	 *         cannot be parsed
	 */
	public static void main(String[] args) {
		// Record ID sets are cached per concept so each concept is looked up
		// in the groundhog at most once, however many pairs it appears in.
		Map<Integer, Set<Integer>> pubMedIdsPerConcept = new HashMap<Integer, Set<Integer>>();

		GroundhogManager groundhogmanager = new GroundhogManager(CPGP_BASE_DIR);
		documentProfilesGroundhog = groundhogmanager.getGroundhog(MEDLINE_GROUNDHOG_FOLDER_NAME);

		WriteCSVFile output = new WriteCSVFile(OUTPUT);
		try {
			ReadCSVFile input = new ReadCSVFile(MATCH_SCORE_FILENAME);
			Iterator<List<String>> rows = input.iterator();

			// Number of pairs actually written; rows skipped for NaN are not counted.
			int processed = 0;

			System.out.println("Start processing match score file.");

			while (rows.hasNext()) {
				List<String> fields = rows.next();

				Integer conceptA = Integer.valueOf(fields.get(0));
				Integer conceptB = Integer.valueOf(fields.get(1));
				double matchScore = Double.parseDouble(fields.get(2));

				if (Double.isNaN(matchScore)) {
					continue;
				}

				Set<Integer> pmidsA = getRecordIds(pubMedIdsPerConcept, conceptA);
				Set<Integer> pmidsB = getRecordIds(pubMedIdsPerConcept, conceptB);

				Set<Integer> shared = intersection(pmidsA, pmidsB);
				// The match score is passed through as the original text, not re-formatted.
				output.write(Arrays.asList(fields.get(0), fields.get(1),
						String.valueOf(shared.size()), fields.get(2)));

				// Increment first so the message reports the true number of pairs handled.
				processed++;
				if (processed % PROGRESS_INTERVAL == 0) {
					System.out.println("Processed " + processed + " concept pairs.");
				}
			}
		} finally {
			// Close the writer even when a malformed row aborts processing, so
			// the rows written so far are not left in an unclosed file.
			output.close();
		}
		//TODO: collecting statistics

		System.out.println("Done!");
	}

	/**
	 * Returns the record IDs for a concept, consulting the cache first and
	 * falling back to {@link #documentProfilesGroundhog} on a miss. The result
	 * of a groundhog lookup is stored in the cache as-is.
	 *
	 * @param cache   record ID sets already fetched, keyed by concept ID
	 * @param concept concept ID to look up
	 * @return the cached or freshly fetched record ID set for {@code concept}
	 */
	private static Set<Integer> getRecordIds(Map<Integer, Set<Integer>> cache, Integer concept) {
		if (cache.containsKey(concept)) {
			return cache.get(concept);
		}
		Set<Integer> recordIds = documentProfilesGroundhog.getRecordIDsForConcept(concept);
		cache.put(concept, recordIds);
		return recordIds;
	}
}