CurationFileParser.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.ontology.ontologyutilities;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
//import org.erasmusmc.utilities.LVGNormaliser;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Parses a curation instruction file and exposes the result through the public
 * collections of this class.
 *
 * <p>The file is read line by line. Lines starting with {@code #} are comments.
 * Every other line is split on {@code |}; lines yielding fewer than two fields
 * are ignored. The instructions recognised by the parser are:
 * <ul>
 * <li>{@code ADD|DBLINK|db|id|term...} - terms to add for a database ID</li>
 * <li>{@code SUPPRESS|VOC|ALL|term} - term suppressed in every vocabulary</li>
 * <li>{@code SUPPRESS|VOC|voc|term} - term suppressed in one vocabulary</li>
 * <li>{@code SUPPRESS|DBLINK|db|id|term...} - terms suppressed for a database ID</li>
 * <li>{@code MAP|DBLINK|dbFrom|idFrom|dbTo|idTo} - mapping between two database IDs</li>
 * <li>{@code SUPPRESS_WHOLE_UMLSCONCEPT|DBLINK|UMLS|id} - concept suppressed as a whole</li>
 * </ul>
 * Lines with any other leading fields are silently skipped.
 *
 * <p>Suppressed terms are stored in the form produced by
 * {@link #normalizeTerm(String)}; unless {@code OntologyUtilities.isGeneSymbol}
 * reports the term as a gene symbol, its LVG-normalised form is stored as well.
 * Added terms are stored exactly as they appear in the file.
 */
public class CurationFileParser {

	/** Suppressed terms per vocabulary name, from {@code SUPPRESS|VOC|voc} lines. */
	public Map<String, Set<String>> suppressedTermsPerVoc = new HashMap<String, Set<String>>();
	/** Terms suppressed for all vocabularies, from {@code SUPPRESS|VOC|ALL} lines. */
	public Set<String> suppressedTermsAllVocs = new HashSet<String>();
	/** Suppressed terms per database ID, from {@code SUPPRESS|DBLINK} lines. */
	public Map<DatabaseID, Set<String>> suppressedTermsPerDatabaseID = new HashMap<DatabaseID, Set<String>>();
	/** UMLS database IDs listed on {@code SUPPRESS_WHOLE_UMLSCONCEPT|DBLINK|UMLS} lines. */
	public List<DatabaseID> suppressedWholeUMLSConcepts = new ArrayList<DatabaseID>();
	/** Terms to add per database ID, from {@code ADD|DBLINK} lines (stored unnormalised). */
	public Map<DatabaseID, Set<String>> addedTermsPerDatabaseID = new HashMap<DatabaseID, Set<String>>();
	/** Database ID mappings, from {@code MAP|DBLINK} lines, in file order. */
	public List<DatabaseIDmap> mappingsFromToDBID = new ArrayList<DatabaseIDmap>();
	/** Normaliser used to produce the LVG form of suppressed terms. */
	public NormaliseUsingLVG lvg = new NormaliseUsingLVG();

	/**
	 * Reads a curation file and stores the information in the public variables of this class.
	 *
	 * <p>An instruction line that carries fewer fields than its instruction type
	 * reads is not handled specially: the field access fails (with a standard
	 * {@code List} this is an {@code IndexOutOfBoundsException}).
	 *
	 * @param curationInstructionsFile name of the curation file, passed to {@code ReadTextFile}
	 */
	public CurationFileParser(String curationInstructionsFile) {
		for (String line : new ReadTextFile(curationInstructionsFile)) {
			if (line.startsWith("#")) {
				continue; // comment line
			}
			List<String> cells = StringUtilities.safeSplit(line, '|');
			if (cells.size() < 2) {
				continue; // no instruction/scope pair on this line
			}
			String instruction = cells.get(0);
			String scope = cells.get(1);
			if (instruction.equals("ADD") && scope.equals("DBLINK")) {
				readAddedTerms(cells);
			} else if (instruction.equals("SUPPRESS")) {
				if (scope.equals("VOC")) {
					readVocabularySuppression(cells);
				} else if (scope.equals("DBLINK")) {
					readDatabaseIDSuppression(cells);
				}
			} else if (instruction.equals("MAP")) {
				if (scope.equals("DBLINK")) {
					readMapping(cells);
				}
			} else if (instruction.equals("SUPPRESS_WHOLE_UMLSCONCEPT")) {
				if (scope.equals("DBLINK")) {
					readWholeConceptSuppression(cells);
				}
			}
		}
	}

	/** Handles {@code ADD|DBLINK|db|id|term...}: records the terms for the database ID. */
	private void readAddedTerms(List<String> cells) {
		DatabaseID databaseID = new DatabaseID(cells.get(2), cells.get(3));
		Set<String> addedTerms = new TreeSet<String>();
		for (int i = 4; i < cells.size(); i++) {
			addedTerms.add(cells.get(i));
		}
		// Merge with terms from earlier ADD lines for the same ID instead of
		// replacing them, mirroring how SUPPRESS|DBLINK lines are accumulated.
		Set<String> known = addedTermsPerDatabaseID.get(databaseID);
		if (known == null) {
			addedTermsPerDatabaseID.put(databaseID, addedTerms);
		} else {
			known.addAll(addedTerms);
		}
	}

	/**
	 * Handles {@code SUPPRESS|VOC|voc|term}: only the single term in the fourth
	 * field is read; {@code ALL} as vocabulary targets the all-vocabularies set.
	 */
	private void readVocabularySuppression(List<String> cells) {
		String voc = cells.get(2);
		Set<String> target;
		if (voc.equals("ALL")) {
			target = suppressedTermsAllVocs;
		} else {
			target = suppressedTermsPerVoc.get(voc);
			if (target == null) {
				target = new HashSet<String>();
				suppressedTermsPerVoc.put(voc, target);
			}
		}
		addSuppressedForms(target, cells.get(3));
	}

	/** Handles {@code SUPPRESS|DBLINK|db|id|term...}: accumulates suppressed terms for the database ID. */
	private void readDatabaseIDSuppression(List<String> cells) {
		DatabaseID databaseID = new DatabaseID(cells.get(2), cells.get(3));
		Set<String> suppressedTerms = new TreeSet<String>();
		for (int i = 4; i < cells.size(); i++) {
			// The gene-symbol test is applied to the term itself (field i),
			// not to the database entry id in field 3.
			addSuppressedForms(suppressedTerms, cells.get(i));
		}
		Set<String> known = suppressedTermsPerDatabaseID.get(databaseID);
		if (known == null) {
			suppressedTermsPerDatabaseID.put(databaseID, suppressedTerms);
		} else {
			known.addAll(suppressedTerms);
		}
	}

	/** Handles {@code MAP|DBLINK|dbFrom|idFrom|dbTo|idTo}: appends one from/to mapping. */
	private void readMapping(List<String> cells) {
		DatabaseID from = new DatabaseID(cells.get(2), cells.get(3));
		DatabaseID to = new DatabaseID(cells.get(4), cells.get(5));
		mappingsFromToDBID.add(new DatabaseIDmap(from, to));
	}

	/** Handles {@code SUPPRESS_WHOLE_UMLSCONCEPT|DBLINK|db|id}: recorded only when db is {@code UMLS}. */
	private void readWholeConceptSuppression(List<String> cells) {
		String dbID = cells.get(2);
		String dbEntry = cells.get(3);
		if (dbID.equals("UMLS")) {
			suppressedWholeUMLSConcepts.add(new DatabaseID(dbID, dbEntry));
		}
	}

	/**
	 * Adds the stored forms of one suppressed term to the target set: always the
	 * {@link #normalizeTerm(String)} form, plus the LVG-normalised form unless
	 * the term is reported as a gene symbol.
	 */
	private void addSuppressedForms(Set<String> target, String term) {
		target.add(normalizeTerm(term));
		if (!OntologyUtilities.isGeneSymbol(term)) {
			target.add(lvg.lvgnormalise(term));
		}
	}

	/**
	 * Normalizes the input string so it can be compared with the suppressed terms
	 * read from the curation file. The string is tokenized with
	 * {@code SimpleTokenizer}, each token is passed through
	 * {@code StringUtilities.firstLetterToLowerCase}, tokens contained in
	 * {@code OntologyUtilities.stopwordsForIndexing} are dropped, and the
	 * remaining tokens are joined with single spaces.
	 *
	 * @param string the term to normalize
	 * @return	normalized string; empty when no token remains
	 */
	public static String normalizeTerm(String string){
		SimpleTokenizer tokenizer = new SimpleTokenizer();
		tokenizer.tokenize(string);
		StringBuilder sb = new StringBuilder();
		for (String token : tokenizer.tokens) {
			token = StringUtilities.firstLetterToLowerCase(token);
			// The stopword test is done on the token after its first letter was lower-cased.
			if (!OntologyUtilities.stopwordsForIndexing.contains(token)) {
				if (sb.length() != 0) {
					sb.append(' ');
				}
				sb.append(token);
			}
		}

		return sb.toString();
	}

	/** A mapping from one database ID to another, as read from a {@code MAP|DBLINK} line. */
	public class DatabaseIDmap {
		/** Source database ID of the mapping. */
		DatabaseID from;
		/** Target database ID of the mapping. */
		DatabaseID to;

		/**
		 * @param from source database ID
		 * @param to target database ID
		 */
		public DatabaseIDmap(DatabaseID from, DatabaseID to) {
			this.from = from;
			this.to = to;
		}
	}

	/** Thin subclass exposing {@code externalnormalise} of {@code LVGNormaliser} under a public method. */
	private class NormaliseUsingLVG extends LVGNormaliser{
		/**
		 * @param string the term to normalise
		 * @return the result of {@code externalnormalise(string)}
		 */
		public String lvgnormalise(String string){
			return externalnormalise(string);
		}
	}

}