/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.ontology;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Use this class to load an OntologyStore from a PSF file.
 *
 * @author Schuemie
 */
public class OntologyPSFLoader {

  /**
   * After loading, the OntologyStore can be found here. Before saving, set the
   * OntologyStore here.
   */
  public OntologyStore ontology;

  /**
   * Determines whether the definitions are loaded from the PSF file (uses more
   * memory). <br>
   * <br>
   * The default value is true.
   */
  public boolean loadDefinitions = true;

  /**
   * Determines whether the concept hierarchy is loaded from the PSF file (uses
   * more memory). <br>
   * <br>
   * The default value is false.
   */
  public boolean loadHierarchy = false;

  /**
   * Start reading the PSF file at this line. <br>
   * <br>
   * The default value is 0.
   */
  public int offset = 0;

  /**
   * Formalizes the assumption that semantic types have negative concept IDs.
   * Note that this is only a partial solution: it does not take into account
   * that vocabularies must have IDs below -1000.
   */
  public static boolean semanticTypesNegative = true;

  /**
   * Read this number of lines from the PSF file. A value of -1 indicates all
   * lines will be read. <br>
   * <br>
   * The default value is -1.
   */
  public int length = -1;

  /**
   * Load the ontology from the given file. After this method, the
   * OntologyStore can be retrieved from the ontology property.
   *
   * @param filename
   *          The complete path and filename of the file
   */
  public void loadFromPSF(String filename) {
    try {
      loadFromPSF(new FileInputStream(filename));
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    }
  }
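  /*
   * Example usage (a minimal sketch; the file path below is hypothetical):
   *
   *   OntologyPSFLoader loader = new OntologyPSFLoader();
   *   loader.loadHierarchy = true;                // also read "H|...|..." lines
   *   loader.loadFromPSF("/data/thesaurus.psf");  // hypothetical path
   *   OntologyStore ontology = loader.ontology;   // result is available here
   */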
  /**
   * Load the ontology from the given input stream. After this method, the
   * OntologyStore can be retrieved from the ontology property.
   *
   * @param is
   *          The input stream containing the PSF data
   */
  public void loadFromPSF(InputStream is) {
    ontology = new OntologyStore();
    hasHierarchy = false;
    level = -1;
    sem = -1;
    terms = -1;
    voc = -1;
    cui = -1;
    autoCID = 0;
    vocabularies.clear();
    int count = -1;
    boolean first = true;
    BufferedReader bufferedReader = null;
    try {
      bufferedReader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1000000);
      String line;
      while ((line = bufferedReader.readLine()) != null) {
        if (!line.equals("")) {
          count++;
          if (first) {
            first = false;
            if (line.length() >= 5 && line.substring(0, 5).equalsIgnoreCase("level")) {
              extractColumns(line);
            } else { // assume default columns:
              level = 0;
              terms = 1;
              cui = 2;
            }
          } else if (count > offset && (length == -1 || count <= offset + length)) {
            addToOntology(line);
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (bufferedReader != null)
        try {
          bufferedReader.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
    }
  }
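  /*
   * The offset and length fields allow reading a large PSF file in chunks. A
   * minimal sketch (the chunk size of 100000 lines is an arbitrary example;
   * note that each call creates a fresh OntologyStore, so each chunk ends up
   * in its own store):
   *
   *   OntologyPSFLoader loader = new OntologyPSFLoader();
   *   loader.offset = 0;
   *   loader.length = 100000;        // read the first 100000 data lines
   *   loader.loadFromPSF("big.psf"); // hypothetical file
   */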
  /**
   * Save the OntologyStore defined in the ontology property to the specified
   * file location.
   *
   * @param filename
   *          The complete path and filename of the file
   */
  public void saveToPSF(String filename) {
    boolean doVoc = hasVoc();
    boolean doSem = hasSem();
    try {
      FileOutputStream PSFFile = new FileOutputStream(filename);
      BufferedWriter bufferedWrite = new BufferedWriter(new OutputStreamWriter(PSFFile, "UTF-8"), 1000000);
      StringBuffer firstline = new StringBuffer();
      firstline.append("LEVEL|");
      if (doVoc)
        firstline.append("VOC|");
      if (doSem)
        firstline.append("SEM|");
      firstline.append("DEFAULT|0");
      bufferedWrite.write(firstline.toString());
      bufferedWrite.newLine();

      // Add concepts:
      Iterator<Concept> values = ontology.getConceptIterator();
      while (values.hasNext()) {
        Concept concept = values.next();
        StringBuffer line = new StringBuffer();
        line.append("0|");
        if (doVoc)
          line.append(relatedToString(concept.getID(), DefaultTypes.fromVocabulary) + "|");
        if (doSem)
          line.append(semrelatedToString(concept.getID(), DefaultTypes.isOfSemanticType) + "|");
        if (concept.terms != null) {
          if (concept.terms.size() != 0)
            line.append(StringUtilities.escape(concept.terms.get(0).text));
          for (int j = 1; j < concept.terms.size(); j++)
            line.append(";" + StringUtilities.escape(concept.terms.get(j).text));
        } else {
          line.append(StringUtilities.escape(concept.getName()));
        }
        boolean hasDef = false;
        if (!(concept.definition == null || concept.definition.equals(""))) {
          line.append("?" + StringUtilities.escape(concept.definition));
          hasDef = true;
        }

        // Append the database identifiers (if any):
        List<DatabaseID> databaseIDs = ontology.getDatabaseIDsForConcept(concept.getID());
        if (databaseIDs != null) {
          boolean first = true;
          for (DatabaseID databaseID : databaseIDs) {
            if (first && !hasDef) {
              first = false;
              line.append("?");
            } else
              line.append("\\;");
            line.append(databaseID.database);
            line.append("_");
            line.append(StringUtilities.escape(databaseID.ID));
          }
        }
        line.append("|" + Integer.toString(concept.ID));
        bufferedWrite.write(line.toString());
        bufferedWrite.newLine();
      }

      // Add hierarchy:
      Iterator<Concept> conceptIterator = ontology.getConceptIterator();
      while (conceptIterator.hasNext()) {
        Concept concept = conceptIterator.next();
        List<Relation> relations = ontology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.isParentOf);
        if (relations != null && relations.size() != 0) {
          StringBuffer line = new StringBuffer();
          line.append("H|");
          Iterator<Relation> relationIterator = relations.iterator();
          while (relationIterator.hasNext()) {
            Relation relation = relationIterator.next();
            line.append(relation.object);
            if (relationIterator.hasNext())
              line.append(";");
          }
          line.append("|");
          line.append(concept.getID());
          bufferedWrite.write(line.toString());
          bufferedWrite.newLine();
        }
      }
      bufferedWrite.flush();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  private String relatedToString(int conceptID, int relationType) {
    List<Relation> vocs = ontology.getRelationsForConceptAsSubject(conceptID, relationType);
    StringBuffer buffer = new StringBuffer();
    if (vocs.size() != 0)
      buffer.append(ontology.getConcept(vocs.get(0).object).getName());
    for (int i = 1; i < vocs.size(); i++) {
      buffer.append(";");
      buffer.append(ontology.getConcept(vocs.get(i).object).getName());
    }
    return buffer.toString();
  }

  private String semrelatedToString(int conceptID, int relationType) {
    List<Relation> sems = ontology.getRelationsForConceptAsSubject(conceptID, relationType);
    StringBuffer buffer = new StringBuffer();
    // Diagnostic: report a semantic type that is missing from the ontology:
    if (sems.size() != 0 && ontology.getConcept(sems.get(0).object) == null)
      System.out.println(sems.get(0).toString());
    if (sems.size() != 0) {
      Integer id = ontology.getConcept(sems.get(0).object).getID();
      if (semanticTypesNegative)
        id = -id;
      buffer.append(id);
    }
    for (int i = 1; i < sems.size(); i++) {
      buffer.append(";");
      Integer id = ontology.getConcept(sems.get(i).object).getID();
      if (semanticTypesNegative)
        id = -id;
      buffer.append(id);
    }
    return buffer.toString();
  }

  private int level = -1;
  private int sem = -1;
  private int terms = -1;
  private int voc = -1;
  private int cui = -1;
  private int autoCID = 0;
  private boolean hasHierarchy = false;
  private Map<String, Integer> vocabularies = new HashMap<String, Integer>();

  // Finds which columns in the file contain what type of info:
  private void extractColumns(String line) {
    String[] columns = line.split("\\|");
    for (int i = 0; i < columns.length; i++) {
      if (columns[i].equalsIgnoreCase("level"))
        level = i;
      if (columns[i].equalsIgnoreCase("sem"))
        sem = i;
      if (columns[i].equalsIgnoreCase("voc"))
        voc = i;
      if (columns[i].equalsIgnoreCase("default"))
        terms = i;
      if (columns[i].equalsIgnoreCase("cui") || StringUtilities.isNumber(columns[i].trim()))
        cui = i;
    }
  }
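  /*
   * PSF line grammar, as inferred from the parser and writer in this class:
   *
   *   header:    LEVEL|[VOC|][SEM|]DEFAULT|0
   *   concept:   level|[vocs|][semtypes|]term1;term2;...[?definition]|cui
   *   hierarchy: H|childCui;childCui;...|parentCui
   *
   * Terms are separated by ';', the optional definition follows '?', and
   * database identifiers may be embedded in the definition (see
   * addDatabaseIDs below). The header's final numeric column ("0", or the
   * literal "CUI") marks the position of the CUI column. Negative CUIs
   * denote semantic types; CUIs at or below -1000 denote vocabularies.
   */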
  // Parses the main lines and inserts the info into the thesaurus:
  private void addToOntology(String line) {
    List<String> columns = StringUtilities.safeSplit(line, '|');
    if (columns.get(0).equals("H")) {
      if (loadHierarchy) {
        hasHierarchy = true;
        int parent = Integer.parseInt(columns.get(2));
        List<String> childrenStrings = StringUtilities.safeSplit(columns.get(1), ';');
        for (String childString : childrenStrings) {
          Relation relation = new Relation(parent, DefaultTypes.isParentOf, Integer.parseInt(childString));
          ontology.setRelation(relation);
        }
      }
    } else {
      // Process a concept line. Note that this text format cannot reliably
      // represent a CUI of -1; a missing CUI is handled by generating one:
      Concept newConcept;
      if (columns.size() > cui && cui != -1) {
        if (columns.get(cui).trim().equals("")) {
          System.err.println("Missing cui in PSF file line: \"" + line + "\"");
          newConcept = new Concept(autoCID++);
        } else {
          newConcept = new Concept(Integer.parseInt(columns.get(cui).trim()));
          autoCID = newConcept.getID() + 1;
        }
      } else {
        newConcept = new Concept(autoCID++);
      }
      if (newConcept.getID() < 0) { // semantic types and vocabularies
        List<String> subs = StringUtilities.safeSplit(columns.get(terms), '?');
        List<String> termlist = StringUtilities.safeSplit(subs.get(0).trim(), ';');
        if (newConcept.getID() <= -1000) { // it's a vocabulary
          vocabularies.put(termlist.get(0), newConcept.getID());
          newConcept.setName(termlist.get(0));
          ontology.setConcept(newConcept);
        } else { // it's a semantic type
          newConcept.setName(termlist.get(0));
          if (loadDefinitions && subs.size() == 2)
            newConcept.setDefinition(StringUtilities.unescape(subs.get(1).trim()));
          ontology.setConcept(newConcept);
        }
      } else {
        for (int column = 0; column < columns.size(); column++) {
          if (column == level) {
            // Ignored
          } else if (column == sem) {
            List<String> semtypes = StringUtilities.safeSplit(columns.get(column), ';');
            for (String type : semtypes) {
              if (!type.equals("")) {
                int id = Integer.parseInt(type);
                if (semanticTypesNegative)
                  id = -id;
                // Check if already in the semantic network of the thesaurus:
                Concept semtype = ontology.getConcept(id);
                if (semtype == null) {
                  semtype = new Concept(id);
                  semtype.setName(type);
                  ontology.setConcept(semtype);
                }
                Relation relation = new Relation(newConcept.getID(), DefaultTypes.isOfSemanticType, semtype.getID());
                ontology.setRelation(relation);
              }
            }
          } else if (column == voc) {
            List<String> vocs = StringUtilities.safeSplit(columns.get(column), ';');
            for (String vocstring : vocs) {
              if (!vocstring.equals("")) {
                Integer vocID = vocabularies.get(vocstring);
                if (vocID == null) {
                  if (StringUtilities.isInteger(vocstring)) {
                    vocID = Integer.parseInt(vocstring);
                  } else {
                    vocID = makeVocID();
                    Concept vocabulary = new Concept(vocID);
                    vocabulary.setName(vocstring);
                    ontology.setConcept(vocabulary);
                  }
                  vocabularies.put(vocstring, vocID);
                }
                Relation relation = new Relation(newConcept.getID(), DefaultTypes.fromVocabulary, vocID);
                ontology.setRelation(relation);
              }
            }
          } else if (column == terms) {
            List<String> subs = StringUtilities.safeSplit(columns.get(column), '?');
            List<String> terms = StringUtilities.safeSplit(subs.get(0).trim(), ';');
            if (terms.size() != 0)
              newConcept.terms = new ArrayList<TermStore>(terms.size());
            for (int i = 0; i < terms.size(); i++) {
              String text = StringUtilities.unescape(terms.get(i));
              if (text.length() < 256)
                newConcept.terms.add(new TermStore(text));
            }
            if (loadDefinitions && subs.size() == 2) {
              String definition = StringUtilities.unescape(subs.get(1).trim());
              definition = addDatabaseIDs(definition, newConcept);
              newConcept.definition = definition;
            }
          }
        }
        if (ontology.getConcept(newConcept.ID) == null) // not already in thesaurus
          ontology.setConcept(newConcept);
      }
    }
  }

  private Integer makeVocID() {
    if (vocabularies.size() > 0) {
      ArrayList<Integer> list = new ArrayList<Integer>(vocabularies.values());
      Collections.sort(list);
      return list.get(0) - 1;
    } else {
      return -1000;
    }
  }
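  /*
   * Embedded database identifiers take the form "<DATABASE>_<id>" and run
   * until ';', '!', or the end of the definition. A hypothetical example
   * (assuming "OMIM" and "MESH" are keys in DatabaseID.enumerateDatabases()):
   *
   *   "A heritable disorder.OMIM_123456;MESH_D000001"
   *
   * would yield two DatabaseID entries for the concept, and the identifier
   * span would be stripped from the stored definition, leaving
   * "A heritable disorder.".
   */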
  // Checks whether there are database identifiers in the definition. If so,
  // they are added to the ontology and removed from the definition:
  private String addDatabaseIDs(String definition, Concept concept) {
    if (definition.contains("_")) {
      boolean hasIDs = false;
      int minStart = 9999;
      int maxEnd = 0;
      for (String databaseID : DatabaseID.enumerateDatabases().keySet()) {
        String prefix = databaseID + "_";
        int start = 0;
        while (start != -1) {
          start = definition.indexOf(prefix, start);
          // If there is a letter directly before the prefix, it is part of a
          // longer word: search for the next occurrence.
          while (start > 0 && Character.isLetter(definition.charAt(start - 1)))
            start = definition.indexOf(prefix, start + 1);
          if (start != -1) {
            minStart = Math.min(start, minStart);
            int end = definition.indexOf(';', start);
            int endExclamation = definition.indexOf('!', start);
            if (endExclamation != -1 && (endExclamation < end || end == -1))
              end = endExclamation;
            if (end == -1)
              end = definition.length();
            maxEnd = Math.max(end, maxEnd);
            ontology.setDatabaseIDForConcept(concept.getID(),
                new DatabaseID(databaseID, definition.substring(start + prefix.length(), end)));
            start = end;
            hasIDs = true;
          }
        }
      }
      if (hasIDs)
        return definition.substring(0, minStart) + definition.substring(maxEnd);
    }
    return definition;
  }

  private boolean hasVoc() {
    for (Relation relation : ontology.getRelations())
      if (relation.predicate == DefaultTypes.fromVocabulary)
        return true;
    return false;
  }

  private boolean hasSem() {
    for (Relation relation : ontology.getRelations())
      if (relation.predicate == DefaultTypes.isOfSemanticType)
        return true;
    return false;
  }
}