/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.conceptprofilegenerator; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.util.Iterator; import java.util.Set; import org.erasmusmc.applications.conceptprofileevaluator.LoadDataFiles; import org.erasmusmc.applications.conceptprofileevaluator.SubGroundhogStatistics; import org.erasmusmc.collections.IntList; import org.erasmusmc.conceptprofilegenerator.generators.CPGeneratorUncertaintyCoefficientConceptFrequencies; import org.erasmusmc.databases.integersetstore.Integer2IntegerSet; import org.erasmusmc.databases.integersetstore.IntegerSetStore; import org.erasmusmc.groundhog.Groundhog; import org.erasmusmc.groundhog.GroundhogStatistics; import org.erasmusmc.ontology.ConceptProfile; import org.erasmusmc.ontology.ConceptVectorRecord; import com.sleepycat.je.DatabaseException; public class GenerateConceptProfilesFromIntegerSetStore { // parameters public String integerSetStoreFilename; public String sourceGroundhogName; public String targetGroundhogName; public String groundhogRoot = ""; public String groundhogStatisticsFilename; public String conceptsToBeFilteredFileName; public int minNumberOfPmidsForCp = 5; public Integer maximumNumberOfPmidsForCP = 10000; public Integer maximumNumberOfConceptsPerCP = 50000; public double cutoff = 10E-8; public boolean assumeEmptyGroundhog = true; //If false, will check if profile exists in target groundhog public static void main(String[] args) throws Exception { GenerateConceptProfilesFromIntegerSetStore scriptObject = new GenerateConceptProfilesFromIntegerSetStore(); scriptObject.integerSetStoreFilename = "/home/jelier/data/Projects/weighted_Globaltest/RandomIntSetStore/"; scriptObject.sourceGroundhogName = "Groundhog_Medline"; scriptObject.targetGroundhogName = "RandomCPs"; scriptObject.groundhogRoot = "/home/jelier/data/"; scriptObject.groundhogStatisticsFilename = "/home/jelier/data/Groundhog_Medline/GroundhogStatistics.txt"; scriptObject.conceptsToBeFilteredFileName = "/home/jelier/Projects/weighted_Globaltest/Excl. filter for genes.conceptset"; scriptObject.minNumberOfPmidsForCp = 5; scriptObject.maximumNumberOfPmidsForCP = 300000; scriptObject.maximumNumberOfConceptsPerCP = 50000; scriptObject.cutoff = 10E-8; scriptObject.run(); } private void initialize() { store = new IntegerSetStore(new File(integerSetStoreFilename)); try { sourceGroundhog = new Groundhog(new File(groundhogRoot+ sourceGroundhogName),9000000); File targetFile = new File(groundhogRoot+ targetGroundhogName); if (!targetFile.exists()) targetFile.mkdir(); targetGroundhog = new Groundhog(targetFile,1000000); } catch (DatabaseException e1) { e1.printStackTrace(); } wholeGroundhogStatistics = new GroundhogStatistics(); FileInputStream wholeCollexionStatisticsFileStream; try { wholeCollexionStatisticsFileStream = new FileInputStream(new File(groundhogStatisticsFilename)); wholeGroundhogStatistics.loadGroundhogStatisticsFromFile(wholeCollexionStatisticsFileStream); if (conceptsToBeFilteredFileName != null) { FileInputStream conceptsToBeFilteredFile = new FileInputStream(new File(conceptsToBeFilteredFileName)); conceptsToBeFiltered = LoadDataFiles.loadIDs(conceptsToBeFilteredFile); } conceptProfileGenerator = new CPGeneratorUncertaintyCoefficientConceptFrequencies(wholeGroundhogStatistics, conceptsToBeFiltered); conceptProfileGenerator.maxNumberOfConceptsPerProfile = maximumNumberOfConceptsPerCP; conceptProfileGenerator.cutoff = cutoff; } catch (FileNotFoundException e) { e.printStackTrace(); } } public void run() { initialize(); targetGroundhog.setBulkImportMode(true); Iterator<Integer2IntegerSet> iterator = store.iterator(); int count = 0; while (iterator.hasNext()) { Integer2IntegerSet entry = iterator.next(); if (assumeEmptyGroundhog || !targetGroundhog.hasEntry(entry.id)) { IntList pmids = entry.setofIntegers.getSortedList(); if (pmids.size() >= minNumberOfPmidsForCp) { if (pmids.size() > maximumNumberOfPmidsForCP) { pmids = pmids.subList(pmids.size() - maximumNumberOfPmidsForCP, pmids.size()); } //System.out.println(entry.id + "\t" + pmids.size() + "\t" + entry.setofIntegers.size()); SubGroundhogStatistics subGroundhogStatistics = new SubGroundhogStatistics(sourceGroundhog, pmids); ConceptProfile conceptProfile = conceptProfileGenerator.generateConceptProfile(subGroundhogStatistics, entry.id); ConceptVectorRecord record = conceptProfile.conceptProfileToRecord(); targetGroundhog.saveConceptVectorRecord(record); //target.saveEntryNoCaching(record); count++; if (count % 1000 == 0) System.out.println("Created " + count + " concept profiles"); } } } System.out.println("Created " + count + " concept profiles"); //targetGroundhog.setReindexBatchSize(1000); //targetGroundhog.setBulkImportMode(false); } private IntegerSetStore store; private Groundhog targetGroundhog; private Groundhog sourceGroundhog; private GroundhogStatistics wholeGroundhogStatistics; private Set<Integer> conceptsToBeFiltered = null; private CPGeneratorUncertaintyCoefficientConceptFrequencies conceptProfileGenerator; }