/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.utilities.StringUtilities;
/**
* Use this class to load an OntologyStore from a PSF file, or to save an
* OntologyStore to a PSF file.
*
* @author Schuemie
*
*/
public class OntologyPSFLoader {
/**
* After loading, the OntologyStore can be found here. Before saving, set the
* OntologyStore here. Every call to loadFromPSF(InputStream) replaces this
* value with a newly created OntologyStore.
*/
public OntologyStore ontology;
/**
* Determines whether the definitions are loaded from the PSF file (uses more
* memory). Database identifiers are parsed out of the definition text, so
* they are only loaded when this is true as well. <br>
* <br>
* The default value is true.
*/
public boolean loadDefinitions = true;
/**
* Determines whether the concept hierarchy (the lines starting with "H") is
* loaded from the PSF file (uses more memory). <br>
* <br>
* The default value is false.
*/
public boolean loadHierarchy = false;
/**
* Number of non-empty lines following the first line of the PSF file that
* are skipped before loading starts. Empty lines are not counted. <br>
* <br>
* The default value is 0.
*/
public int offset = 0; // start reading at this line
/**
* This variable formalizes the assumption that semantic types have negative
* concept ids: when true, the ids in the SEM column are negated both when
* loading and when saving. NOTE this is a partial solution, as it does not
* take into account that vocabularies must have an id of -1000 or lower.
*/
public static boolean semanticTypesNegative = true;
/**
* Read this number of non-empty lines (counted after the skipped offset
* lines) from the PSF file. A value of -1 indicates all remaining lines will
* be read. <br>
* <br>
* The default value is -1.
*/
public int length = -1; // read this many lines (-1 indicates all remaining)
/**
 * Opens the named PSF file and loads the ontology stored in it. When the
 * file cannot be opened the stack trace is printed and nothing is loaded.
 * After a successful load the OntologyStore is available in the ontology
 * property.
 *
 * @param filename
 *          The complete path and filename of the file
 */
public void loadFromPSF(String filename) {
  InputStream stream;
  try {
    stream = new FileInputStream(filename);
  } catch (FileNotFoundException notFound) {
    notFound.printStackTrace();
    return;
  }
  loadFromPSF(stream);
}
/**
 * Load the ontology from the given stream. After this method, the
 * OntologyStore can be retrieved from the ontology property.
 * <p>
 * The stream is decoded as UTF-8 and is closed when loading ends. Empty lines
 * are skipped and not counted. The first non-empty line is used to determine
 * the column layout: when it starts with "level" (case-insensitive) it is
 * parsed as a header, otherwise the default layout (level, terms, concept id)
 * is assumed. The remaining lines are loaded subject to the offset and length
 * properties. I/O errors are printed to standard error and end the load; what
 * was read up to that point stays in the ontology.
 *
 * @param is
 *          The stream to read the PSF data from
 */
public void loadFromPSF(InputStream is) {
  // Reset all parser state, so that a loader instance can be reused.
  ontology = new OntologyStore();
  hasHierarchy = false;
  level = -1;
  sem = -1;
  terms = -1;
  voc = -1;
  cui = -1;
  autoCID = 0;
  vocabularies.clear();

  int count = -1;
  boolean firstLine = true;
  BufferedReader bufferedReader = null;
  try {
    bufferedReader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1000000);
    String line;
    // readLine() returning null is the only reliable end-of-stream signal;
    // ready() merely tells whether a read would block.
    while ((line = bufferedReader.readLine()) != null) {
      if (line.length() == 0) {
        continue;
      }
      count++;
      if (firstLine) {
        firstLine = false;
        // regionMatches does not throw when the line is shorter than 5 characters.
        if (line.regionMatches(true, 0, "level", 0, 5)) {
          extractColumns(line);
        } else {
          // Assume default columns.
          // NOTE(review): this first line is not added to the ontology even
          // though it is not a header - confirm whether that is intended.
          level = 0;
          terms = 1;
          cui = 2;
        }
      } else if (count > offset && (length == -1 || count <= offset + length)) {
        addToOntology(line);
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (bufferedReader != null) {
        bufferedReader.close();
      } else if (is != null) {
        // The reader could not be created; still release the caller's stream.
        is.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
/**
 * Save the OntologyStore defined in the ontology property to the specified
 * file location, encoded as UTF-8.
 * <p>
 * The file starts with a header line, followed by one line per concept and
 * then one "H" line for every concept that has isParentOf relations. The VOC
 * and SEM columns are only written when the ontology contains at least one
 * relation of the corresponding type. I/O errors are printed to standard
 * error; the file is closed in all cases.
 *
 * @param filename
 *          The complete path and filename of the file to write
 */
public void saveToPSF(String filename) {
  boolean doVoc = hasVoc();
  boolean doSem = hasSem();
  FileOutputStream psfFile = null;
  BufferedWriter bufferedWrite = null;
  try {
    psfFile = new FileOutputStream(filename);
    bufferedWrite = new BufferedWriter(new OutputStreamWriter(psfFile, "UTF-8"), 1000000);

    // Header line; the trailing "0" marks the concept id column.
    StringBuilder firstline = new StringBuilder("LEVEL|");
    if (doVoc) {
      firstline.append("VOC|");
    }
    if (doSem) {
      firstline.append("SEM|");
    }
    firstline.append("DEFAULT|0");
    bufferedWrite.write(firstline.toString());
    bufferedWrite.newLine();

    // Add concepts
    Iterator<Concept> values = ontology.getConceptIterator();
    while (values.hasNext()) {
      bufferedWrite.write(buildConceptLine(values.next(), doVoc, doSem));
      bufferedWrite.newLine();
    }

    // Add hierarchy
    Iterator<Concept> conceptIterator = ontology.getConceptIterator();
    while (conceptIterator.hasNext()) {
      Concept concept = conceptIterator.next();
      List<Relation> relations = ontology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.isParentOf);
      if (relations != null && relations.size() != 0) {
        bufferedWrite.write(buildHierarchyLine(concept, relations));
        bufferedWrite.newLine();
      }
    }
    bufferedWrite.flush();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Always release the file handle, also after a failure half-way through.
    try {
      if (bufferedWrite != null) {
        bufferedWrite.close();
      } else if (psfFile != null) {
        psfFile.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

// Builds the PSF line (level, optional voc and sem, terms?definition and database ids, id) of one concept.
private String buildConceptLine(Concept concept, boolean doVoc, boolean doSem) {
  StringBuilder line = new StringBuilder();
  line.append("0|");
  if (doVoc) {
    line.append(relatedToString(concept.getID(), DefaultTypes.fromVocabulary)).append("|");
  }
  if (doSem) {
    line.append(semrelatedToString(concept.getID(), DefaultTypes.isOfSemanticType)).append("|");
  }

  if (concept.terms != null) {
    for (int j = 0; j < concept.terms.size(); j++) {
      if (j != 0) {
        line.append(";");
      }
      line.append(StringUtilities.escape(concept.terms.get(j).text));
    }
  } else {
    line.append(StringUtilities.escape(concept.getName()));
  }

  boolean hasDef = false;
  if (concept.definition != null && !concept.definition.equals("")) {
    line.append("?").append(StringUtilities.escape(concept.definition));
    hasDef = true;
  }

  // Append the database identifiers (if any). Without a definition the first
  // identifier opens the definition part with '?'; every other identifier is
  // preceded by an escaped ';'.
  List<DatabaseID> databaseIDs = ontology.getDatabaseIDsForConcept(concept.getID());
  if (databaseIDs != null) {
    boolean first = true;
    for (DatabaseID databaseID: databaseIDs) {
      if (first && !hasDef) {
        first = false;
        line.append("?");
      } else {
        line.append("\\;");
      }
      line.append(databaseID.database);
      line.append("_");
      line.append(StringUtilities.escape(databaseID.ID));
    }
  }
  line.append("|").append(Integer.toString(concept.ID));
  return line.toString();
}

// Builds the "H|child;child;...|parent" line for the given isParentOf relations of a concept.
private String buildHierarchyLine(Concept concept, List<Relation> relations) {
  StringBuilder line = new StringBuilder("H|");
  Iterator<Relation> relationIterator = relations.iterator();
  while (relationIterator.hasNext()) {
    line.append(relationIterator.next().object);
    if (relationIterator.hasNext()) {
      line.append(";");
    }
  }
  line.append("|");
  line.append(concept.getID());
  return line.toString();
}
/**
 * Builds the ';'-separated list of the names of all concepts that the given
 * concept is related to through relations of the given type (used for the
 * VOC column).
 * <p>
 * When the related concept is not present in the ontology its id is written
 * instead of a name. This can happen for vocabularies that were loaded from
 * an integer value in the VOC column, for which no concept is created.
 *
 * @param conceptID
 *          id of the subject concept
 * @param relationType
 *          type of the relations to follow
 * @return the joined names; empty when there are no such relations
 */
private String relatedToString(int conceptID, int relationType) {
  List<Relation> vocs = ontology.getRelationsForConceptAsSubject(conceptID, relationType);
  StringBuilder buffer = new StringBuilder();
  if (vocs == null) {
    // saveToPSF guards against a null relation list as well.
    return buffer.toString();
  }
  boolean first = true;
  for (Relation relation: vocs) {
    if (!first) {
      buffer.append(";");
    }
    first = false;
    Concept vocabulary = ontology.getConcept(relation.object);
    if (vocabulary != null) {
      buffer.append(vocabulary.getName());
    } else {
      buffer.append(relation.object);
    }
  }
  return buffer.toString();
}
/**
 * Builds the ';'-separated list of the ids of all concepts that the given
 * concept is related to through relations of the given type (used for the
 * SEM column). The ids are negated when semanticTypesNegative is set.
 * <p>
 * When a related concept is not present in the ontology the relation is
 * printed to standard output and the id stored in the relation is written,
 * instead of failing with a NullPointerException.
 *
 * @param conceptID
 *          id of the subject concept
 * @param relationType
 *          type of the relations to follow
 * @return the joined ids; empty when there are no such relations
 */
private String semrelatedToString(int conceptID, int relationType) {
  List<Relation> sems = ontology.getRelationsForConceptAsSubject(conceptID, relationType);
  StringBuilder buffer = new StringBuilder();
  if (sems == null) {
    // saveToPSF guards against a null relation list as well.
    return buffer.toString();
  }
  boolean first = true;
  for (Relation relation: sems) {
    if (!first) {
      buffer.append(";");
    }
    first = false;
    Concept semanticType = ontology.getConcept(relation.object);
    int id;
    if (semanticType == null) {
      System.out.println(relation.toString());
      // The concept was looked up by this id, so fall back to it.
      id = relation.object;
    } else {
      id = semanticType.getID();
    }
    if (semanticTypesNegative) {
      id = -id;
    }
    buffer.append(id);
  }
  return buffer.toString();
}
// Parser state, reset at the start of every loadFromPSF(InputStream) call.
// The first five fields hold the zero-based position of a column in the PSF
// file; -1 means that the column is not present.
// Position of the "level" column; its content is ignored while loading.
private int level = -1;
// Position of the "sem" column (';'-separated semantic type ids).
private int sem = -1;
// Position of the "default" column (';'-separated terms, optionally followed
// by '?' and the definition).
private int terms = -1;
// Position of the "voc" column (';'-separated vocabularies).
private int voc = -1;
// Position of the concept id column.
private int cui = -1;
// Id given to the next concept whose line has no concept id; set to the last
// explicitly read id plus one.
private int autoCID = 0;
// Set to true once a hierarchy ("H") line has been loaded.
private boolean hasHierarchy = false;
// Maps the vocabulary string found in the file to the id of that vocabulary.
private Map<String, Integer> vocabularies = new HashMap<String, Integer>();
// Determines from the header line which column of the file holds which type
// of information. Column names are matched case-insensitively; the concept
// id column is recognized by the name "cui" or by a numeric header.
private void extractColumns(String line) {
  int position = 0;
  for (String header: line.split("\\|")) {
    if ("level".equalsIgnoreCase(header)) {
      level = position;
    }
    if ("sem".equalsIgnoreCase(header)) {
      sem = position;
    }
    if ("voc".equalsIgnoreCase(header)) {
      voc = position;
    }
    if ("default".equalsIgnoreCase(header)) {
      terms = position;
    }
    if ("cui".equalsIgnoreCase(header) || StringUtilities.isNumber(header.trim())) {
      cui = position;
    }
    position++;
  }
}
/**
 * Parses one line following the first line of the PSF file and stores its
 * content in the ontology.
 * <p>
 * A line whose first column is "H" is a hierarchy line; it is only processed
 * when loadHierarchy is set. Every other line describes a concept. Concepts
 * with a negative id are stored as a vocabulary (id of -1000 or lower) or as
 * a semantic type (all other negative ids). For all other concepts the
 * semantic type, vocabulary and term columns are processed, and the concept
 * is stored unless a concept with the same id is already present.
 *
 * @param line
 *          the non-empty line to parse
 */
private void addToOntology(String line) {
  List<String> columns = StringUtilities.safeSplit(line, '|');
  if (columns.get(0).equals("H")) {
    if (loadHierarchy) {
      addHierarchyRelations(columns);
    }
    return;
  }

  Concept newConcept = createConcept(columns, line);
  if (newConcept.getID() < 0) {
    addSemanticTypeOrVocabulary(newConcept, columns);
    return;
  }

  for (int column = 0; column < columns.size(); column++) {
    if (column == level) {
      // Ignored
    } else if (column == sem) {
      linkSemanticTypes(newConcept, columns.get(column));
    } else if (column == voc) {
      linkVocabularies(newConcept, columns.get(column));
    } else if (column == terms) {
      readTermsAndDefinition(newConcept, columns.get(column));
    }
  }
  if (ontology.getConcept(newConcept.ID) == null) { // not already in thesaurus
    ontology.setConcept(newConcept);
  }
}

// Stores an isParentOf relation from the parent (third column) to every child (second column, ';'-separated).
private void addHierarchyRelations(List<String> columns) {
  hasHierarchy = true;
  int parent = Integer.parseInt(columns.get(2));
  for (String childString: StringUtilities.safeSplit(columns.get(1), ';')) {
    ontology.setRelation(new Relation(parent, DefaultTypes.isParentOf, Integer.parseInt(childString)));
  }
}

// Creates the concept for a line, using the id in the concept id column or else the next automatic id.
private Concept createConcept(List<String> columns, String line) {
  if (cui != -1 && columns.size() > cui) {
    String id = columns.get(cui).trim();
    if (id.equals("")) {
      System.err.println("Missing cui in PSF file line: \"" + line + "\"");
      return new Concept(autoCID++);
    }
    Concept concept = new Concept(Integer.parseInt(id));
    // Lines without an id that follow continue numbering from here.
    autoCID = concept.getID() + 1;
    return concept;
  }
  return new Concept(autoCID++);
}

// Stores a concept with a negative id: a vocabulary (id <= -1000) or else a semantic type.
private void addSemanticTypeOrVocabulary(Concept concept, List<String> columns) {
  List<String> subs = StringUtilities.safeSplit(columns.get(terms), '?');
  List<String> termlist = StringUtilities.safeSplit(subs.get(0).trim(), ';');
  String name = termlist.get(0);
  if (concept.getID() <= -1000) { // It's a vocabulary
    vocabularies.put(name, concept.getID());
    concept.setName(name);
  } else { // It's a semantic type
    concept.setName(name);
    if (loadDefinitions && subs.size() == 2) {
      concept.setDefinition(StringUtilities.unescape(subs.get(1).trim()));
    }
  }
  ontology.setConcept(concept);
}

// Relates the concept to every semantic type in the column, creating semantic types that are not known yet.
private void linkSemanticTypes(Concept concept, String columnText) {
  for (String type: StringUtilities.safeSplit(columnText, ';')) {
    if (type.equals("")) {
      continue;
    }
    int id = Integer.parseInt(type);
    if (semanticTypesNegative) {
      id = -id;
    }
    // check if already in semantic network of thesaurus:
    Concept semtype = ontology.getConcept(id);
    if (semtype == null) {
      semtype = new Concept(id);
      semtype.setName(type);
      ontology.setConcept(semtype);
    }
    ontology.setRelation(new Relation(concept.getID(), DefaultTypes.isOfSemanticType, semtype.getID()));
  }
}

// Relates the concept to every vocabulary in the column, registering vocabularies that are not known yet.
private void linkVocabularies(Concept concept, String columnText) {
  for (String vocstring: StringUtilities.safeSplit(columnText, ';')) {
    // Skip empty entries. (This test used to compare the whole list with "",
    // which is never equal, so empty entries became a nameless vocabulary.)
    if (vocstring.equals("")) {
      continue;
    }
    Integer vocID = vocabularies.get(vocstring);
    if (vocID == null) {
      if (StringUtilities.isInteger(vocstring)) {
        // The vocabulary is referred to by id; no concept is created for it here.
        vocID = Integer.parseInt(vocstring);
      } else {
        vocID = makeVocID();
        Concept vocabulary = new Concept(vocID);
        vocabulary.setName(vocstring);
        ontology.setConcept(vocabulary);
      }
      vocabularies.put(vocstring, vocID);
    }
    ontology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, vocID));
  }
}

// Reads the ';'-separated terms and, when requested, the definition that follows the '?'.
private void readTermsAndDefinition(Concept concept, String columnText) {
  List<String> subs = StringUtilities.safeSplit(columnText, '?');
  List<String> termTexts = StringUtilities.safeSplit(subs.get(0).trim(), ';');
  if (termTexts.size() != 0) {
    concept.terms = new ArrayList<TermStore>(termTexts.size());
  }
  for (int i = 0; i < termTexts.size(); i++) {
    String text = StringUtilities.unescape(termTexts.get(i));
    // Terms of 256 characters or more are dropped.
    if (text.length() < 256) {
      concept.terms.add(new TermStore(text));
    }
  }
  if (loadDefinitions && subs.size() == 2) {
    String definition = StringUtilities.unescape(subs.get(1).trim());
    // Database identifiers are stored inside the definition text; move them to the ontology.
    concept.definition = addDatabaseIDs(definition, concept);
  }
}
/**
 * Generates the id for a new vocabulary: one below the lowest vocabulary id
 * registered so far, and never higher than -1000.
 * <p>
 * The upper bound matters because the vocabularies map can also hold ids that
 * were read as plain integers from the VOC column. Without the bound such an
 * id would make this method hand out an id that is not recognized as a
 * vocabulary (which requires an id of -1000 or lower) and that may collide
 * with the id of an ordinary concept.
 *
 * @return the id to use for the next new vocabulary
 */
private Integer makeVocID() {
  int lowest = -999;
  if (!vocabularies.isEmpty()) {
    lowest = Math.min(lowest, Collections.min(vocabularies.values()));
  }
  return lowest - 1;
}
/**
 * Checks whether there are database identifiers in the definition. If so, it
 * adds them to the ontology and removes them from the definition.
 * <p>
 * An identifier has the form <code>DATABASE_id</code>, where DATABASE is one
 * of the keys of DatabaseID.enumerateDatabases() and is not directly preceded
 * by a letter. The id runs up to the next ';' or '!', or else to the end of
 * the definition. Everything from the start of the first identifier to the
 * end of the last one is cut out of the returned text, including any text in
 * between.
 *
 * @param definition
 *          the unescaped definition text
 * @param concept
 *          the concept the identifiers belong to
 * @return the definition without the identifier section, or the unchanged
 *         definition when no identifier was found
 */
private String addDatabaseIDs(String definition, Concept concept) {
  if (!definition.contains("_")) {
    return definition;
  }
  boolean hasIDs = false;
  // Start from the largest possible value, so that an identifier section
  // starting at any position is found (a fixed 9999 broke long definitions).
  int minStart = Integer.MAX_VALUE;
  int maxEnd = 0;
  for (String databaseID: DatabaseID.enumerateDatabases().keySet()) {
    String prefix = databaseID + "_";
    int start = 0;
    while (start != -1) {
      start = definition.indexOf(prefix, start);
      // Check whether there is a letter directly before the prefix. If so,
      // search for next:
      while (start > 0 && Character.isLetter(definition.charAt(start - 1))) {
        start = definition.indexOf(prefix, start + 1);
      }
      if (start != -1) {
        minStart = Math.min(start, minStart);
        // The id ends at the first ';' or '!', whichever comes first.
        int end = definition.indexOf(';', start);
        int endExclamation = definition.indexOf('!', start);
        if (endExclamation != -1 && (endExclamation < end || end == -1)) {
          end = endExclamation;
        }
        if (end == -1) {
          end = definition.length();
        }
        maxEnd = Math.max(end, maxEnd);
        ontology.setDatabaseIDForConcept(concept.getID(), new DatabaseID(databaseID, definition.substring(start + prefix.length(), end)));
        // Continue searching for this database after the identifier just read.
        start = end;
        hasIDs = true;
      }
    }
  }
  if (hasIDs) {
    return definition.substring(0, minStart) + definition.substring(maxEnd, definition.length());
  }
  return definition;
}
// Tells whether the ontology contains at least one fromVocabulary relation.
private boolean hasVoc() {
  boolean found = false;
  for (Relation candidate: ontology.getRelations()) {
    if (candidate.predicate == DefaultTypes.fromVocabulary) {
      found = true;
      break;
    }
  }
  return found;
}
// Tells whether the ontology contains at least one isOfSemanticType relation.
private boolean hasSem() {
  boolean found = false;
  for (Relation candidate: ontology.getRelations()) {
    if (candidate.predicate == DefaultTypes.isOfSemanticType) {
      found = true;
      break;
    }
  }
  return found;
}
}