/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package casperSoftwareCode; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; public class CasperMainScript { public static boolean dosagesRule; //= true; public static boolean atsignRule; //= true; public static boolean shortTokenRule; //= true; public static boolean necRule; //= true; public static boolean nosRule; //= true; public static boolean ecNumbersRule; //= true; public static boolean miscRule; //= true; public static boolean wordsMoreThanFiveRule; //= false; String mrconsoPath; String mrstyPath; String logfilePath; String ontologyPath; boolean suppressRules; //= true; boolean rewriteRules; //= true; boolean syntacticInvRule; //= true; boolean possessivesRule; //= true; boolean shortformlongformRule; //= true; boolean angularBracketsRule; //= true; boolean semanticTypesRule; //= true; boolean beginParenthesesRule; //= false; boolean endParenthesesRule; //= false; boolean beginBracketsRule; //= false; boolean endBracketsRule; //= false; public static boolean chemical = false; public void run(String filename) { System.out.println("Starting script: "+StringUtilities.now()); /** Set program variables */ String iniFilePath = filename; ReadTextFile iniFile = new ReadTextFile(iniFilePath); Iterator<String> iniFileIterator = iniFile.getIterator(); while (iniFileIterator.hasNext()) { String line = iniFileIterator.next(); if (line.length() != 0 && !line.startsWith("#")) { String[] columns = line.split("="); String variable = columns[0].trim(); /** Set paths */ if(variable.equals("MRCONSO")) mrconsoPath = columns[1].trim(); if(variable.equals("MRSTY")) mrstyPath = columns[1].trim(); if(variable.equals("log")) logfilePath = columns[1].trim(); if(variable.equals("MRCONSO rewritten and suppressed")) ontologyPath = columns[1].trim(); /** Set variables */ if(variable.equals("Apply suppressrules")){ String value = columns[1].trim(); if (value.equals("on")) suppressRules = true; else suppressRules = false; } if(variable.equals("Apply rewriterules")){ String value = columns[1].trim(); if (value.equals("on")) rewriteRules = true; else rewriteRules = false; } if(variable.equals("Syntactic inversion")){ String value = columns[1].trim(); if (value.equals("on")) syntacticInvRule = true; else syntacticInvRule = false; } if(variable.equals("Possessives")){ String value = columns[1].trim(); if (value.equals("on")) possessivesRule = true; else possessivesRule = false; } if(variable.equals("Short Form And Long Form")){ String value = columns[1].trim(); if (value.equals("on")) shortformlongformRule = true; else shortformlongformRule = false; } if(variable.equals("Angular Brackets")){ String value = columns[1].trim(); if (value.equals("on")) angularBracketsRule = true; else angularBracketsRule = false; } if(variable.equals("Semantic Type")){ String value = columns[1].trim(); if (value.equals("on")) semanticTypesRule = true; else semanticTypesRule = false; } if(variable.equals("Begin Parenthesis")){ String value = columns[1].trim(); if (value.equals("on")) beginParenthesesRule = true; else beginParenthesesRule = false; } if(variable.equals("End Parenthesis")){ String value = columns[1].trim(); if (value.equals("on")) endParenthesesRule = true; else endParenthesesRule = false; } if(variable.equals("Begin Brackets")){ String value = columns[1].trim(); if (value.equals("on")) beginBracketsRule = true; else beginBracketsRule = false; } if(variable.equals("End Brackets")){ String value = columns[1].trim(); if (value.equals("on")) endBracketsRule = true; else endBracketsRule = false; } if(variable.equals("Dosages")){ String value = columns[1].trim(); if (value.equals("on")) dosagesRule = true; else dosagesRule = false; } if(variable.equals("At-sign")){ String value = columns[1].trim(); if (value.equals("on")) atsignRule = true; else atsignRule = false; } if(variable.equals("Short token")){ String value = columns[1].trim(); if (value.equals("on")) shortTokenRule = true; else shortTokenRule = false; } if(variable.equals("Any classification")){ String value = columns[1].trim(); if (value.equals("on")) necRule = true; else necRule = false; } if(variable.equals("Any underspecification")){ String value = columns[1].trim(); if (value.equals("on")) nosRule = true; else nosRule = false; } if(variable.equals("EC numbers")){ String value = columns[1].trim(); if (value.equals("on")) ecNumbersRule = true; else ecNumbersRule = false; } if(variable.equals("Miscellaneous")){ String value = columns[1].trim(); if (value.equals("on")) miscRule = true; else miscRule = false; } if(variable.equals("More than five words in term")){ String value = columns[1].trim(); if (value.equals("on")) wordsMoreThanFiveRule = true; else wordsMoreThanFiveRule = false; } } } /** Create log */ WriteTextFile logFile = new WriteTextFile(logfilePath); /** Create datatypes for homonym checks */ System.out.println("Creating datatypes for homonym checks..."); Map<Integer, Set<String>> cuisWithTerms = ExtractCUIsAndTermsFromMRCONSO.extractCuisAndTermsAsMap(mrconsoPath); Collection<Set<String>> allTermsInUMLS = cuisWithTerms.values(); Set<String> allTerms = new HashSet<String>(); for (Set<String> valueSet: allTermsInUMLS){ for(String value: valueSet){ allTerms.add(value); } } allTermsInUMLS = null; /** Add semantic types from MRSTY */ System.out.println("Adding semantic types to concepts..."); Map<Integer, Set<Integer>> conceptsWithSemTypes = AssignSemanticTypeToCui.getCuisWithSemanticTypes(mrstyPath); /** Set ontology variables*/ WriteTextFile newOntologyFile = new WriteTextFile(ontologyPath); RulesCombination rulesClass = new RulesCombination(cuisWithTerms, allTerms); int cuiCol = 0; int suiCol = 5; int termTextCol = 14; int cui = -1; int prevCui =-1; int sui = -1; int prevSui = -1; int moreThan255count = 0; int nonEnglishTermsCount = 0; int suppressableTermsCount = 0; int rewrittenTermsCount = 0; int suppressedTermsCount = 0; CasperConcept concept = null; /** Read from MRCONSO */ System.out.println("Reading from MRCONSO.RRF... "); ReadTextFile textFile = new ReadTextFile(mrconsoPath); Iterator<String> fileIterator = textFile.getIterator(); int lineCount = 0; while (fileIterator.hasNext()) { lineCount++; if (lineCount % 100000 == 0) System.out.println(lineCount+" lines processed from MRCONSO.RRF"); String line = fileIterator.next(); if (line.length() != 0) { String[] columns = line.split("\\|"); if (CasperFilters.isMoreThan255(columns)) { logFile.writeln("TERM FIELD MORE THAN 255 CHARACTERS|" + line); moreThan255count++; } else if (CasperFilters.notRightLanguage(columns)) { logFile.writeln("TERMS NOT ENGLISH LANGUAGE|" + line); nonEnglishTermsCount++; } else if (CasperFilters.isSuppressable(columns)) { logFile.writeln("TERMS MARKED AS SUPPRESSABLE BY NLM|" + line); suppressableTermsCount++; } else { cui = Integer.parseInt(columns[cuiCol].trim().substring(1, columns[cuiCol].length())); sui = Integer.parseInt(columns[suiCol].trim().substring(1, columns[suiCol].length())); String term = columns[termTextCol].trim(); if (prevCui != cui) { RulesCombination.cuisWithRuleNo.clear(); } prevCui = cui; if (prevSui != sui) { concept = new CasperConcept(); concept.setCUI(cui); concept.setSUI(sui); concept.setTermText(term); concept.setSemType(conceptsWithSemTypes.get(concept.getCUI())); boolean suppressed = false; if (rulesClass.conceptHasChemicalSemanticType(concept)) chemical = true; if(suppressRules){ if(applySuppressRules(concept)){ logFile.writeln("TERM REMOVED DUE TO SUPPRESS RULE|" + line); suppressed = true; suppressedTermsCount++; } } if (rewriteRules){ if (!suppressed){ if(syntacticInvRule){ CasperConcept rewrittenConcept = rulesClass.applySyntacticInversionRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SYN"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SYN"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(possessivesRule){ CasperConcept rewrittenConcept = rulesClass.applyPossessiveRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+POS"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+POS"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(shortformlongformRule){ List<CasperConcept> rewrittenConceptList = rulesClass.applyShortformLongformRule(concept); if(!rewrittenConceptList.isEmpty()){ for(CasperConcept rewrittenConcept: rewrittenConceptList){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SFLF"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SFLF"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } } if(angularBracketsRule){ CasperConcept rewrittenConcept = rulesClass.applyAngluarBracketsRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+ANG"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+ANG"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(semanticTypesRule){ CasperConcept rewrittenConcept = rulesClass.applySemanticTypesRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SEM"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+SEM"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(beginParenthesesRule && !chemical){ CasperConcept rewrittenConcept = rulesClass.applyLeftSideParenthesesRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+BPA"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+BPA"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(endParenthesesRule && !chemical){ CasperConcept rewrittenConcept = rulesClass.applyRightSideParenthesesRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+EPA"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+EPA"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(beginBracketsRule && !chemical){ CasperConcept rewrittenConcept = rulesClass.applyLeftSideBracketsRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+BBR"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+BBR"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } if(endBracketsRule && !chemical){ CasperConcept rewrittenConcept = rulesClass.applyRightSideBracketsRule(concept); if(rewrittenConcept!=null){ if(suppressRules){ if(!applySuppressRules(rewrittenConcept)){ rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+EBR"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } }else { rewrittenTermsCount++; newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"+EBR"+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+rewrittenConcept.getTermText()+"|"+columns[15]+"|"+columns[16]+"|"); } } } } } if (!suppressed){ newOntologyFile.writeln(columns[0]+"|"+columns[1]+"|"+columns[2]+"|"+columns[3]+"|"+columns[4]+"|"+columns[5]+"|"+columns[6]+"|"+columns[7]+"|"+columns[8]+"|"+columns[9]+"|"+columns[10]+"|"+columns[11]+"|"+columns[12]+"|"+columns[13]+"|"+columns[14]+"|"+columns[15]+"|"+columns[16]+"|"); } prevSui = sui; chemical = false; } } } } /** Save to ontologyfile and log */ System.out.println("Closing logfile and saving to new MRCONSO file: "+StringUtilities.now()); logFile.close(); newOntologyFile.close(); System.out.println(moreThan255count+ " terms were removed due to length > 255 characters"); System.out.println(nonEnglishTermsCount+ " non-english terms were removed"); System.out.println(suppressableTermsCount+ " terms marked as suppressable by NLM were removed"); System.out.println(suppressedTermsCount+ " terms were removed by suppress rules"); System.out.println(rewrittenTermsCount+ " terms were rewritten and added"); } public static boolean applySuppressRules(CasperConcept concept){ if(dosagesRule){ if(RulesCombination.applyDosagesRule(concept)) return true; } if(atsignRule){ if(RulesCombination.applyAtSignRule(concept)) return true; } if(shortTokenRule && !chemical){ if(RulesCombination.applyMartijnsRule(concept)) return true; } if(ecNumbersRule){ if(RulesCombination.applyECrule(concept)) return true; } if(necRule){ if(RulesCombination.applyNECrule(concept)) return true; } if(nosRule){ if(RulesCombination.applyNOSrule(concept)) return true; } if(miscRule){ if(RulesCombination.applyMiscRule(concept)) return true; } if(wordsMoreThanFiveRule && !chemical){ if(RulesCombination.applyNoOfWordsMoreThanFiveRule(concept)) return true; } return false; } }