/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.umlsChem;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
import casperSoftwareCode.AssignSemanticTypeToCui;
import casperSoftwareCode.CasperFilters;
public class FilterMRCONSOforChem {
//public static boolean chemical = false;
public static Set<Integer> chemicalSemanticTypes = getAllChemicalSemanticTypes();
public void getChemicalsFromMRCONSO(String mrconsoPath, String mrstyPath, String ontologyPath, String logfile) {
System.out.println("Starting script: "+StringUtilities.now());
WriteTextFile logFile = new WriteTextFile(logfile);
/** Add semantic types from MRSTY */
System.out.println("Adding semantic types to concepts...");
Map<Integer, Set<Integer>> conceptsWithSemTypes = AssignSemanticTypeToCui.getCuisWithSemanticTypes(mrstyPath);
/** Set ontology variables*/
WriteTextFile newOntologyFile = new WriteTextFile(ontologyPath);
int cuiCol = 0;
int termTextCol = 14;
int cui = -1;
int chemicalTermsCount = 0;
int moreThan255count = 0;
int nonEnglishTermsCount = 0;
int suppressableTermsCount = 0;
DummyChemConcept concept = null;
/** Filter MRCONSO*/
System.out.println("Filtering MRCONSO file... ");
ReadTextFile textFile = new ReadTextFile(mrconsoPath);
Iterator<String> fileIterator = textFile.getIterator();
int lineCount = 0;
while (fileIterator.hasNext()) {
lineCount++;
if (lineCount % 100000 == 0)
System.out.println(lineCount+" lines processed from MRCONSO.RRF");
String line = fileIterator.next();
if (line.length() != 0) {
String[] columns = line.split("\\|");
cui = Integer.parseInt(columns[cuiCol].trim().substring(1, columns[cuiCol].length()));
String term = columns[termTextCol].trim();
concept = new DummyChemConcept();
concept.setCUI(cui);
concept.setTermText(term);
concept.setSemType(conceptsWithSemTypes.get(concept.getCUI()));
if (conceptHasChemicalSemanticType(concept)){
if (CasperFilters.isMoreThan255(columns)) {
logFile.writeln("TERM FIELD MORE THAN 255 CHARACTERS|" + line);
moreThan255count++;
}
else if (CasperFilters.notRightLanguage(columns)) {
logFile.writeln("TERMS NOT ENGLISH LANGUAGE|" + line);
nonEnglishTermsCount++;
}
else if (CasperFilters.isSuppressable(columns)) {
logFile.writeln("TERMS MARKED AS SUPPRESSABLE BY NLM|" + line);
suppressableTermsCount++;
}else{
chemicalTermsCount++;
newOntologyFile.writeln(line);
}
}
}
}
newOntologyFile.close();
logFile.close();
System.out.println(chemicalTermsCount+ " chemical terms were added");
System.out.println(moreThan255count+ " terms were removed due to >255 characters");
System.out.println(nonEnglishTermsCount+ " terms were removed due to non-English language");
System.out.println(suppressableTermsCount+ " terms were removed due to suppressability by NLM");
}
public static boolean conceptHasChemicalSemanticType(DummyChemConcept concept){
Set<Integer> semsForConcept = concept.getSemType();
if(semsForConcept==null){
return false;
}
Iterator<Integer> semIterator = semsForConcept.iterator();
while (semIterator.hasNext()){
Integer semID = semIterator.next();
if (chemicalSemanticTypes.contains(-semID)){
return true;
}
}
return false;
}
private static Set<Integer> getAllChemicalSemanticTypes() {
Set<Integer> result = new TreeSet<Integer>();
result.add(-103);
result.add(-104);
result.add(-109);
result.add(-114);
result.add(-115);
result.add(-116);
result.add(-118);
result.add(-119);
result.add(-110);
result.add(-111);
result.add(-196);
result.add(-197);
result.add(-120);
result.add(-121);
result.add(-195);
result.add(-122);
result.add(-123);
result.add(-124);
result.add(-125);
result.add(-126);
result.add(-127);
result.add(-129);
result.add(-192);
result.add(-130);
result.add(-131);
result.add(-200);
return result;
}
}