/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package Anni; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.util.List; import org.erasmusmc.applications.indexer.IndexerMainForGroundhog; import org.erasmusmc.groundhog.Groundhog; import org.erasmusmc.groundhog.GroundhogManager; import org.erasmusmc.groundhog.GroundhogStatistics; import org.erasmusmc.medline.FetchPMIDsFromOnlinePubmed; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.OntologyManager; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; public class LiteratureUpdateMasterScript { //Settings: // public static String beginning = "1980/01/01"; //yyyy/mm/dd public static String beginning = "1990/07/16"; //yyyy/mm/dd public static String now = "1990/07/17"; //yyyy/mm/dd public static int geneCUIcutoff = 3000000; //Gene conceptIDs start above this number public static int chemCUIcutoff = 4000000; //Chemical conceptIDs start above this number public static String temp = "/tmp/"; //Temp folder. Make sure it is empty before proceeding // public static Set<Integer> removeSemTypes = getRemoveSemTypes(); public static int maxPMIDsPerConcept = 10000; //Truncate lists of PMIDs per concept to this number //Sources: public static String ontologyName = "UMLS2010ABHomologeneJochemToxV1_6"; public static String geneOntologyFile = "/home/reinout/AnniUpdate/GO/go_20120714-assocdb.rdf-xml"; //Download latest file from http://archive.geneontology.org/latest-lite/ //Targets: public static String groundhogRoot = "/tmp/"; public static String groundhogName = "Medline1980till17Jul2012_UMLS2010ABHomologeneJochemToxV1_6-test29"; public static String genegroundhogName = "GeneGroundhog_Anni_July2012"; public static String integerSetStore = "/home/reinout/AnniUpdate/IntegerSetStores/IntegerSetStore_Anni_July2012/"; public static String conceptProfilesGroundhog = "ConceptProfiles_Anni_July2012"; public static String cooccurrenceDBName = "CoOccurrence_Anni_July2012"; //Stored in groundhog folder public static String titlesDatabase = "MedlineTitles_Anni_Jxuly2012"; //Stored on MySQL server public static void main(String[] args){ System.out.println("start"); OntologyManager ontologyManager = new OntologyManager(); Ontology ontology = ontologyManager.fetchClient(ontologyName); System.out.println(StringUtilities.now() + "\tCreating complete groundhog"); //for toxicology subset only: // String query = beginning+"[PDat] : "+now+"[PDat] AND tox[sb] NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; //for human, mouse and rat: // String query = beginning+"[PDat] : "+now+"[PDat] ((humans[MeSH Terms]) OR (rats[MeSH Terms]) OR (mice[MeSH Terms])) NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; //for the whole Medline: (commented out to time thesaurus loading) String query = beginning+"[PDat] : "+now+"[PDat] NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; saveRestrictedPMIDsToFile(query, temp+"Restricted1980tillnow.PMIDs"); indexMedline(groundhogName, ontology, temp+"Restricted1980tillnow.PMIDs"); // //System.out.println(StringUtilities.now() + "\tCreating gene groundhog"); // // //for all species: // //query = "(protein OR gene) AND (mammal OR melanogaster OR gallus OR elegans OR rerio OR cerevisiae OR coli) AND " +beginning+"[PDat] : "+now+"[PDat] NOT plant NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; // // //for mammals only: // //query = "(protein OR gene) AND mammals AND " +beginning+"[PDat] : "+now+"[PDat] NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; // // //for human, mouse and rat: //// query = "(protein OR gene) AND ((humans[MeSH Terms]) OR (rats[MeSH Terms]) OR (mice[MeSH Terms])) AND " +beginning+"[PDat] : "+now+"[PDat] NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; // //// for mammals in toxicology only: //// query = "(protein OR gene) AND mammals AND " +beginning+"[PDat] : "+now+"[PDat] AND tox[sb] NOT Comment[ptyp] NOT Editorial[ptyp] NOT News[ptyp] NOT Historical Article[ptyp] NOT Congresses[ptyp] NOT Biography[ptyp] NOT Newspaper Article[ptyp] NOT Guideline[ptyp] NOT Practice Guideline[ptyp] NOT Interview[ptyp] NOT Bibliography[ptyp] NOT Legal Cases[ptyp] NOT Lectures[ptyp] NOT Consensus Development Conference[ptyp] NOT Addresses[ptyp] NOT Clinical Conference[ptyp] NOT Patient Education Handout[ptyp] NOT Directory[ptyp] NOT Technical Report[ptyp] NOT Festschrift[ptyp] NOT Retraction of Publication[ptyp] NOT Retracted Publication[ptyp] NOT Duplicate Publication[ptyp] NOT Scientific Integrity Review[ptyp] NOT Published Erratum[ptyp] NOT Consensus Development Conference, NIH[ptyp] NOT Periodical Index[ptyp] NOT Dictionary[ptyp] NOT Legislation[ptyp] NOT Government Publications[ptyp]"; // // //saveGenePMIDsToFile(query, temp+"RestrictedGene1980tillnow.PMIDs"); // //indexMedline(genegroundhogName, ontology,temp+"RestrictedGene1980tillnow.PMIDs"); // // //System.out.println(StringUtilities.now() + "\tCreating gene integersetstore"); // //saveGeneCUIsToFile(ontology, temp+"Genes.CUIs", temp+"NonGenes.CUIs"); // //CreateIntegerSetStoreFromGroundhog.create(groundhogRoot+genegroundhogName, temp+"IntegerSetStoreGene/", temp+"Genes.CUIs"); // // //System.out.println(StringUtilities.now() + "\tCreating GO integersetstore"); // //GeneOntology geneOntology = new GeneOntology(geneOntologyFile); // //geneOntology.dumpPMIDs(ontology, temp+"IntegerSetStoreGO/"); // // //code for creating UMLS integersetstore gives an error message that the store can't be closed. Need to stop and run from this point. // //System.out.println(StringUtilities.now() + "\tCreating UMLS integersetstore"); // //filterCUIs(temp+"NonGenes.CUIs", temp+"NonGenesFiltered.CUIs", ontology); // //CreateIntegerSetStoreFromGroundhog.create(groundhogRoot + groundhogName, temp+"IntegerSetStoreUMLS/", temp+"NonGenesFiltered.CUIs"); // // //System.out.println(StringUtilities.now() + "\tCombining integersetstores"); // //MergeIntegerSetStores mergeIntegerSetStores = new MergeIntegerSetStores(integerSetStore); // //mergeIntegerSetStores.sizeUpperCutoff = maxPMIDsPerConcept; // //mergeIntegerSetStores.addStore(temp+"IntegerSetStoreGene/"); // //mergeIntegerSetStores.addStore(temp+"IntegerSetStoreGO/"); // //mergeIntegerSetStores.addStore(temp+"IntegerSetStoreUMLS/"); // // //System.out.println(StringUtilities.now() + "\tGenerating concept profiles"); // //GenerateConceptProfilesFromIntegerSetStore generator = new GenerateConceptProfilesFromIntegerSetStore(); // //generator.groundhogRoot = groundhogRoot; // //generator.sourceGroundhogName = groundhogName; // //generator.targetGroundhogName = conceptProfilesGroundhog; // //generator.integerSetStoreFilename = integerSetStore; // //generator.groundhogStatisticsFilename = groundhogRoot + groundhogName + "/GroundhogStatistics.txt"; // //generator.maximumNumberOfPmidsForCP = maxPMIDsPerConcept; // //generator.run(); // // //System.out.println(StringUtilities.now() + "\tCreate cooccurrence database"); // //CooccurrenceDatabase cooccurrenceDB = new CooccurrenceDatabase(groundhogRoot + cooccurrenceDBName); // //GroundhogManager groundhogManager = new GroundhogManager(groundhogRoot); // //Groundhog groundhog = groundhogManager.getGroundhog(groundhogName); // //cooccurrenceDB.makeFromGroundhog(groundhog); // // //System.out.println(StringUtilities.now() + "\tCreate medline titles database"); // //CreateTitlesDatabase.create(titlesDatabase, temp+"Restricted1980tillnow.PMIDs"); System.out.println(StringUtilities.now() + "\tDone!"); } // private static void filterCUIs(String sourceCUIfile, String targetCUIfile, Ontology ontology) { // Set<Integer> semFilter = OntologyUtilities.getSemanticFilter(ontology, removeSemTypes); // ReadTextFile in = new ReadTextFile(sourceCUIfile); // WriteTextFile out = new WriteTextFile(targetCUIfile); // for (String line : in){ // Integer cui = Integer.parseInt(line); // if (!semFilter.contains(cui)) // out.writeln(cui.toString()); // } // out.close(); // } // private static void saveGeneCUIsToFile(Ontology ontology, String geneFilename, String nongeneFilename) { // Iterator<Concept> iterator = ontology.getConceptIterator(); // WriteTextFile geneout = new WriteTextFile(geneFilename); // WriteTextFile nongeneout = new WriteTextFile(nongeneFilename); // while (iterator.hasNext()){ // int cui = iterator.next().getID(); // if (cui > geneCUIcutoff && cui < chemCUIcutoff){ // geneout.writeln(Integer.toString(cui)); // } else // nongeneout.writeln(Integer.toString(cui)); // } // geneout.close(); // nongeneout.close(); // } protected static void indexMedline(String groundhogFolder, Ontology ontology, String pmidFile){ System.out.println(StringUtilities.now() + "\tIndexing Medline"); GroundhogManager groundhogManager = new GroundhogManager(groundhogRoot); Groundhog groundhog = null; try { groundhog = groundhogManager.createNewGroundhog(groundhogFolder); } catch (Exception e) { e.printStackTrace(); } groundhog.setBulkImportMode(true); IndexerMainForGroundhog indexer = new IndexerMainForGroundhog(); indexer.medlineIterator.fetchSubstances = true; indexer.disambiguate = true; indexer.medlineIterator.pmidsFile = pmidFile; indexer.groundhog = groundhog; indexer.ontology = ontology; indexer.start(); groundhog.setBulkImportMode(false); System.out.println(StringUtilities.now() + "\tSaving statistics "); GroundhogStatistics wholeGroundhogStatistics = groundhog.getGroundhogStatistics(); File file = new File(groundhogRoot+groundhogFolder+"/GroundhogStatistics.txt"); try { wholeGroundhogStatistics.saveGroundhogStatisticsToFile(new FileOutputStream(file)); } catch (FileNotFoundException e) { e.printStackTrace(); } groundhog.close(); } private static void saveRestrictedPMIDsToFile(String query, String filename) { List<Integer> pmids = FetchPMIDsFromOnlinePubmed.getPMIDs(query, "e.vanmulligen@erasmsumc.nl"); WriteTextFile file = new WriteTextFile(filename); for (Integer pmid : pmids) file.writeln(pmid.toString()); file.close(); System.out.println("Found " + pmids.size() + " PMIDS"); } // private static void saveGenePMIDsToFile(String query, String filename) { // List<Integer> pmids = FetchPMIDsFromOnlinePubmed.getPMIDs(query, "erasmsumc@erasmsumc.nl"); // WriteTextFile file = new WriteTextFile(filename); // for (Integer pmid : pmids) // file.writeln(pmid.toString()); // file.close(); // System.out.println("Found " + pmids.size() + " gene PMIDS"); // } // private static Set<Integer> getRemoveSemTypes(){ // Set<Integer> result = new HashSet<Integer>(); // BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(LiteratureUpdateMasterScript.class.getResourceAsStream("RemoveSemTypes.txt"))); // try { // while (bufferedReader.ready()){ // String line = bufferedReader.readLine(); // String[] cols = line.split("\t"); // result.add(Integer.parseInt("-"+cols[0])); // } // } catch (IOException e) { // e.printStackTrace(); // } // return result; // } }