RewrittenMRCONSOLoader.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.dataimport.UMLS;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.utilities.ReadTextFile;

/**
 * Loads a pipe-delimited "rewritten MRCONSO" text file into an {@link Ontology}.
 *
 * <p>Each non-empty line is split on {@code |}. Consecutive lines that carry
 * the same concept identifier are collected into one {@link Concept}; a new
 * {@link Concept} is started whenever the identifier differs from the one on
 * the previous line, so rows belonging to one concept are expected to be
 * contiguous in the file.
 */
public class RewrittenMRCONSOLoader {

  /** Zero-based column of the concept identifier (one prefix character followed by digits). */
  private static final int CUI_COL = 0;
  /** Zero-based column of the term identifier (one prefix character followed by the id). */
  private static final int TERM_ID_COL = 5;
  /** Zero-based column of the term text. */
  private static final int TERM_COL = 14;
  /** Zero-based column of the source vocabulary name. */
  private static final int VOC_COL = 11;
  /** Zero-based column read as the GO identifier when the vocabulary is "GO". */
  private static final int GO_COL = 13;

  /** Progress is printed to standard output every this many lines. */
  private static final int PROGRESS_INTERVAL = 100000;

  /**
   * Reads {@code filename} line by line and adds the concepts, terms, GO
   * database identifiers and vocabulary relations found there to
   * {@code ontology}.
   *
   * <p>For every distinct vocabulary name a synthetic vocabulary concept is
   * registered in the ontology. Its id is {@code -1000} for the first
   * vocabulary seen and decreases by one for each further vocabulary. Each
   * concept is linked to its vocabularies through a
   * {@link DefaultTypes#fromVocabulary} relation, at most once per vocabulary.
   *
   * <p>A concept's name is set to the text of its first term (if it has any)
   * at the moment the concept is stored in the ontology.
   *
   * @param ontology          the ontology that receives concepts, relations and database ids
   * @param filename          path of the pipe-delimited input file
   * @param log_output        receives one entry, prefixed with
   *                          {@code "TERM HAS BEEN CONVERTED TO LOWERCASE IN ONTOLOGY|"},
   *                          for every line whose term was changed by the case filter
   * @param abbreviationsFile file handed to
   *                          {@code UMLSFiltersBeforeOntologyCreation.getAbbreviationsAndAcronyms}
   * @throws NumberFormatException          if the concept identifier without its first character
   *                                        is not a valid integer
   * @throws ArrayIndexOutOfBoundsException if a non-empty line has fewer columns than required
   */
  public static void loadFromRewrittenMRCONSO(Ontology ontology, String filename, List<String> log_output, String abbreviationsFile) {
    int prevCui = -1;
    // null means "no term seen yet for the current concept"
    String prevSui = null;
    Map<String, Integer> vocabularies = new HashMap<String, Integer>();
    Set<Integer> foundVocsForConcept = null;
    Concept concept = null;

    // Get list of abbreviations and acronyms
    List<String> listOfAbbreviationsOrAcronyms = UMLSFiltersBeforeOntologyCreation.getAbbreviationsAndAcronyms(abbreviationsFile);

    ReadTextFile textFile = new ReadTextFile(filename);
    Iterator<String> fileIterator = textFile.getIterator();
    int lineCount = 0;
    while (fileIterator.hasNext()) {
      lineCount++;
      if (lineCount % PROGRESS_INTERVAL == 0) {
        System.out.println(lineCount);
      }
      String line = fileIterator.next();
      if (line.length() == 0) {
        continue;
      }
      String[] columns = line.split("\\|");

      // The concept id is kept as an int, the term id as a String; both lose
      // their leading prefix character.
      int cui = Integer.parseInt(stripFirstCharacter(columns[CUI_COL]));
      String sui = stripFirstCharacter(columns[TERM_ID_COL]);

      // A different concept identifier starts a new concept; the finished one
      // is stored first. Checking for null as well keeps the very first row
      // safe even if its identifier happens to equal the initial sentinel.
      if (concept == null || cui != prevCui) {
        storeConcept(ontology, concept);
        concept = new Concept(cui);
        foundVocsForConcept = new TreeSet<Integer>();
        prevCui = cui;
        // Term de-duplication is per concept: without this reset a term id
        // shared by the last row of one concept and the first row of the
        // next would be dropped from the second concept.
        prevSui = null;
      }

      // Add GO-identifier
      String voc = columns[VOC_COL].trim();
      if (voc.equals("GO")) {
        DatabaseID databaseID = new DatabaseID("GO", columns[GO_COL].trim());
        ontology.setDatabaseIDForConcept(concept.getID(), databaseID);
      }

      // Only rows whose term id differs from the previous row's add a term.
      // NOTE(review): rows repeating the previous term id are skipped
      // entirely, including their vocabulary relation - confirm this is
      // intended.
      if (sui.equals(prevSui)) {
        continue;
      }
      prevSui = sui;

      String term = columns[TERM_COL].trim();
      // The filter may return a lower-cased variant of the term; whatever it
      // returns is what gets stored, and any change is logged.
      String checkedTerm = UMLSFiltersBeforeOntologyCreation.convertToLowerCaseIfWordsMoreThan2AndCharactersMoreThan10AndNotAbbreviationOrAcronym(term, voc, listOfAbbreviationsOrAcronyms);
      if (!checkedTerm.equals(term)) {
        log_output.add("TERM HAS BEEN CONVERTED TO LOWERCASE IN ONTOLOGY|" + line);
      }
      List<TermStore> terms = concept.getTerms();
      terms.add(new TermStore(checkedTerm));
      concept.setTerms(terms);

      // Link the concept to the vocabulary, once per concept and vocabulary
      Integer vocID = vocabularyIdFor(ontology, vocabularies, voc);
      if (!foundVocsForConcept.contains(vocID)) {
        Relation relation = new Relation(concept.getID(), DefaultTypes.fromVocabulary, vocID);
        ontology.setRelation(relation);
        foundVocsForConcept.add(vocID);
      }
    }
    // Store the concept that was still being built when the file ended
    storeConcept(ontology, concept);
  }

  /**
   * Trims {@code field} and drops its first character. The trimmed length is
   * used, so surrounding whitespace in the field does not cause an
   * out-of-range substring.
   */
  private static String stripFirstCharacter(String field) {
    return field.trim().substring(1);
  }

  /**
   * Names {@code concept} after its first term (when it has one) and stores
   * it in {@code ontology}; does nothing when {@code concept} is null.
   */
  private static void storeConcept(Ontology ontology, Concept concept) {
    if (concept == null) {
      return;
    }
    if (concept.getTerms().size() != 0) {
      concept.setName(concept.getTerms().get(0).text);
    }
    ontology.setConcept(concept);
  }

  /**
   * Returns the id of the vocabulary concept for {@code vocName}, creating
   * and registering that concept in {@code ontology} on first use.
   */
  private static Integer vocabularyIdFor(Ontology ontology, Map<String, Integer> vocabularies, String vocName) {
    Integer vocID = vocabularies.get(vocName);
    if (vocID == null) {
      vocID = -1000 - vocabularies.size();
      vocabularies.put(vocName, vocID);
      Concept vocabulary = new Concept(vocID);
      vocabulary.setName(vocName);
      ontology.setConcept(vocabulary);
    }
    return vocID;
  }
}