/*
* Concept profile generation and analysis for Gene-Disease paper
* Copyright (C) 2015 Biosemantics Group, Leiden University Medical Center
* Leiden, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package KnowledgeTransfer;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import org.erasmusmc.collections.IntList;
import org.erasmusmc.collections.SortedIntList2FloatMap;
import org.erasmusmc.groundhog.Groundhog;
import org.erasmusmc.groundhog.GroundhogManager;
import org.erasmusmc.ontology.ConceptVector;
import org.erasmusmc.ontology.ConceptVectorRecord;
import static KnowledgeTransfer.PathConfigs.CONCEPT_FREQUENCIES_FILENAME;
import static KnowledgeTransfer.PathConfigs.MEDLINE_GROUNDHOG_FOLDER_NAME;
import static KnowledgeTransfer.PathConfigs.CPGP_BASE_DIR;
public class ConceptFrequencies {
public static void main(String[] args) {
//This script is designed to parse the old format of the medline groundhog to
// a flat text (.csv) file with the following format:
// 12345 825
// where the first column is the CID and the second column are the number of articles that concept
// occurs is.
// This flat file is imported into SQLite3
System.out.println("dir " + CPGP_BASE_DIR);
GroundhogManager groundhogmanager2 = new GroundhogManager(CPGP_BASE_DIR);
Groundhog documentProfilesGroundhog = groundhogmanager2.getGroundhog(MEDLINE_GROUNDHOG_FOLDER_NAME);
Iterator<ConceptVectorRecord> iter = documentProfilesGroundhog.getIterator();
HashMap<Integer,Integer> cid2pmidcount = new HashMap<>();
int count = 0;
while(iter.hasNext()){
ConceptVectorRecord cvr = iter.next();
if(cvr!=null){
ConceptVector cv = cvr.getConceptVector();
SortedIntList2FloatMap silt = cv.values;
IntList keys = silt.keys();
for(Integer cid:keys){
Integer freq = cid2pmidcount.get(cid);
if(freq==null)
freq = 0;
freq++;
cid2pmidcount.put(cid, freq);
}
count++;
if(count%10000==0){
System.out.println(count);
}
}
}
try{
FileOutputStream output = new FileOutputStream(CONCEPT_FREQUENCIES_FILENAME);
PrintStream printer = new PrintStream(output);
for(Integer cid:cid2pmidcount.keySet()){
int freq = cid2pmidcount.get(cid);
printer.println(cid+"\t"+freq);
}
printer.flush();
}catch(Exception e){
e.printStackTrace();
}
}
}