/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.ontology.ontologyutilities;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
//import org.erasmusmc.utilities.LVGNormaliser;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Parses a pipe-separated curation instruction file and exposes the parsed
 * instructions through the public collections of this class.
 */
public class CurationFileParser {

  /** Normalised suppressed terms per vocabulary name (third cell of {@code SUPPRESS|VOC} lines). */
  public Map<String, Set<String>> suppressedTermsPerVoc = new HashMap<String, Set<String>>();

  /** Normalised terms from {@code SUPPRESS|VOC|ALL} lines. */
  public Set<String> suppressedTermsAllVocs = new HashSet<String>();

  /** Normalised suppressed terms per database identifier ({@code SUPPRESS|DBLINK} lines). */
  public Map<DatabaseID, Set<String>> suppressedTermsPerDatabaseID = new HashMap<DatabaseID, Set<String>>();

  /** Identifiers from {@code SUPPRESS_WHOLE_UMLSCONCEPT|DBLINK} lines whose database is {@code UMLS}. */
  public List<DatabaseID> suppressedWholeUMLSConcepts = new ArrayList<DatabaseID>();

  /** Terms (stored exactly as written in the file) per database identifier ({@code ADD|DBLINK} lines). */
  public Map<DatabaseID, Set<String>> addedTermsPerDatabaseID = new HashMap<DatabaseID, Set<String>>();

  /** From/to identifier pairs from {@code MAP|DBLINK} lines, in file order. */
  public List<DatabaseIDmap> mappingsFromToDBID = new ArrayList<DatabaseIDmap>();

  /** Normaliser used to produce the second, LVG-normalised form of suppressed terms. */
  public NormaliseUsingLVG lvg = new NormaliseUsingLVG();

  /**
   * Reads a curation file and stores the information in the public
   * variables of this class.
   *
   * <p>Lines starting with {@code #} are treated as comments and skipped, as
   * are lines that split into fewer than two cells. The recognised
   * instructions are:
   * <pre>
   * ADD|DBLINK|db|entry|term[|term...]
   * SUPPRESS|VOC|vocabulary-or-ALL|term
   * SUPPRESS|DBLINK|db|entry|term[|term...]
   * MAP|DBLINK|dbFrom|entryFrom|dbTo|entryTo
   * SUPPRESS_WHOLE_UMLSCONCEPT|DBLINK|UMLS|entry
   * </pre>
   * Any other line is silently ignored. The number of cells of a recognised
   * instruction is not validated: a line that lacks a required cell fails in
   * {@code List.get} (an {@code IndexOutOfBoundsException} for the standard
   * list implementations).
   *
   * @param curationInstructionsFile name of the curation file, passed to {@code ReadTextFile}
   */
  public CurationFileParser(String curationInstructionsFile) {
    for (String line : new ReadTextFile(curationInstructionsFile)) {
      if (line.startsWith("#"))
        continue; // comment line
      List<String> cells = StringUtilities.safeSplit(line, '|');
      if (cells.size() > 1)
        processInstruction(cells);
    }
  }

  /** Dispatches one split line (at least two cells) to the handler for its instruction. */
  private void processInstruction(List<String> cells) {
    String action = cells.get(0);
    String scope = cells.get(1);
    boolean isDbLink = scope.equals("DBLINK");

    if (action.equals("ADD") && isDbLink) {
      addTermsForDatabaseID(cells);
    } else if (action.equals("SUPPRESS")) {
      if (scope.equals("VOC"))
        suppressTermForVocabulary(cells);
      else if (isDbLink)
        suppressTermsForDatabaseID(cells);
    } else if (action.equals("MAP")) {
      if (isDbLink) {
        DatabaseID from = new DatabaseID(cells.get(2), cells.get(3));
        DatabaseID to = new DatabaseID(cells.get(4), cells.get(5));
        mappingsFromToDBID.add(new DatabaseIDmap(from, to));
      }
    } else if (action.equals("SUPPRESS_WHOLE_UMLSCONCEPT")) {
      // Only UMLS identifiers are recorded; other databases are ignored.
      if (isDbLink && cells.get(2).equals("UMLS"))
        suppressedWholeUMLSConcepts.add(new DatabaseID(cells.get(2), cells.get(3)));
    }
  }

  /**
   * Handles {@code ADD|DBLINK|db|entry|term...}: records the terms unmodified.
   * Terms of several lines naming the same identifier are merged, in the same
   * way as for {@code SUPPRESS|DBLINK}, instead of the last line replacing
   * the earlier ones.
   */
  private void addTermsForDatabaseID(List<String> cells) {
    DatabaseID databaseID = new DatabaseID(cells.get(2), cells.get(3));
    Set<String> addedTerms = addedTermsPerDatabaseID.get(databaseID);
    if (addedTerms == null) {
      addedTerms = new TreeSet<String>();
      addedTermsPerDatabaseID.put(databaseID, addedTerms);
    }
    for (int i = 4; i < cells.size(); i++)
      addedTerms.add(cells.get(i));
  }

  /** Handles {@code SUPPRESS|VOC|voc|term}; the vocabulary {@code ALL} targets every vocabulary. */
  private void suppressTermForVocabulary(List<String> cells) {
    String voc = cells.get(2);
    String term = cells.get(3);
    if (voc.equals("ALL")) {
      addSuppressedForms(suppressedTermsAllVocs, term);
      return;
    }
    Set<String> suppressedTerms = suppressedTermsPerVoc.get(voc);
    if (suppressedTerms == null) {
      suppressedTerms = new HashSet<String>();
      suppressedTermsPerVoc.put(voc, suppressedTerms);
    }
    addSuppressedForms(suppressedTerms, term);
  }

  /** Handles {@code SUPPRESS|DBLINK|db|entry|term...}; terms of repeated identifiers are merged. */
  private void suppressTermsForDatabaseID(List<String> cells) {
    DatabaseID databaseID = new DatabaseID(cells.get(2), cells.get(3));
    Set<String> suppressedTerms = suppressedTermsPerDatabaseID.get(databaseID);
    if (suppressedTerms == null) {
      suppressedTerms = new TreeSet<String>();
      suppressedTermsPerDatabaseID.put(databaseID, suppressedTerms);
    }
    // The gene-symbol test inside addSuppressedForms is applied to each term
    // itself, not to the database entry in cell 3.
    for (int i = 4; i < cells.size(); i++)
      addSuppressedForms(suppressedTerms, cells.get(i));
  }

  /**
   * Adds the {@link #normalizeTerm(String)} form of the term to the target set
   * and, unless {@code OntologyUtilities.isGeneSymbol} accepts the term, its
   * LVG-normalised form as well.
   */
  private void addSuppressedForms(Set<String> target, String term) {
    target.add(normalizeTerm(term));
    if (!OntologyUtilities.isGeneSymbol(term))
      target.add(lvg.lvgnormalise(term));
  }

  /**
   * Normalizes the input string so that it can be compared with the terms
   * found in the curation file: the string is tokenized with
   * {@code SimpleTokenizer}, each token is passed through
   * {@code StringUtilities.firstLetterToLowerCase}, tokens contained in
   * {@code OntologyUtilities.stopwordsForIndexing} are dropped and the
   * remaining tokens are joined with single spaces.
   *
   * @param string the term to normalize
   * @return normalized string (empty when no token remains)
   */
  public static String normalizeTerm(String string) {
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    tokenizer.tokenize(string);
    StringBuilder sb = new StringBuilder();
    for (String token : tokenizer.tokens) {
      // NOTE(review): an earlier variant lower-cased every token that is not
      // an abbreviation; only firstLetterToLowerCase is applied now.
      token = StringUtilities.firstLetterToLowerCase(token);
      if (!OntologyUtilities.stopwordsForIndexing.contains(token)) {
        if (sb.length() != 0)
          sb.append(' ');
        sb.append(token);
      }
    }
    return sb.toString();
  }

  /** A pair of database identifiers read from a {@code MAP|DBLINK} line. */
  public class DatabaseIDmap {
    DatabaseID from;
    DatabaseID to;

    public DatabaseIDmap(DatabaseID from, DatabaseID to) {
      this.from = from;
      this.to = to;
    }
  }

  /** Thin subclass that makes {@code LVGNormaliser.externalnormalise} callable from this class. */
  private class NormaliseUsingLVG extends LVGNormaliser {
    public String lvgnormalise(String string) {
      return externalnormalise(string);
    }
  }
}