// JochemCurator.java example (Explorer: GeneDiseasePaper-master - java)
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package JochemBuilder.SharedCurationScripts;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.ChemicalSBDtokenizer;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Static filter and rewrite rules applied to chemical terms and concepts.
 * <p>
 * The class is a collection of stateless helpers plus a handful of static
 * look-up sets. Three of those sets ({@link #dictionaries},
 * {@link #miscConcepts} and {@link #pharmas}) are populated while the class is
 * being initialised; the latter two are read from list files in the curation
 * folder (see {@link #FOLDER_PROPERTY}).
 * <p>
 * Note that all {@code rewrite...} methods return the empty string, not the
 * input, when they decide that nothing has to be rewritten.
 */
public class JochemCurator {

	/**
	 * Optional system property naming the folder that holds the curation list
	 * files. It is read once, when this class is initialised, so it has to be
	 * set before the class is first used.
	 */
	private static final String FOLDER_PROPERTY = "jochem.curation.folder";

	/** Folder used when {@link #FOLDER_PROPERTY} is not set (the historic hard-coded location). */
	private static final String DEFAULT_FOLDER_PATH = "/home/bhsingh/Code/workspace/erasmus/trunk/src/DataImport/src/JochemBuilder/SharedCurationScripts/";

	/** Folder the curation list files are loaded from. */
	private static final String FOLDER_PATH = resolveCurationFolder();

	/** Minimum length a tokenized term must have to survive {@link #kristinasChemicalShortTokenFilterRule}. */
	public static int minWordSize = 2;

	/**
	 * Picks the curation folder: the value of {@link #FOLDER_PROPERTY} when it
	 * is set to a non-blank value, otherwise {@link #DEFAULT_FOLDER_PATH}.
	 */
	private static String resolveCurationFolder() {
		String configured = System.getProperty(FOLDER_PROPERTY);
		if (configured == null || configured.trim().length() == 0) {
			return DEFAULT_FOLDER_PATH;
		}
		return configured;
	}

	/** Resolves a list file name against the curation folder. */
	private static File curationFile(String filename) {
		// File(parent, child) inserts the separator itself, so an overriding
		// folder does not need a trailing slash.
		return new File(FOLDER_PATH, filename);
	}

	/**
	 * Decides whether a term should be filtered out because, once tokenized, it
	 * is too short or not informative.
	 * <p>
	 * The term is lower-cased, tokenized with a {@link ChemicalSBDtokenizer} and
	 * the tokens are concatenated without separator. The term is rejected when
	 * that concatenation is shorter than {@link #minWordSize}, is a number
	 * according to {@code StringUtilities.isNumber}, is a roman numeral
	 * according to {@code StringUtilities.isRomanNumeral} (tested on the
	 * upper-cased form), or is contained in {@code stopwordsForFiltering}.
	 *
	 * @param term                  the term to test
	 * @param stopwordsForFiltering stop words, compared against the lower-cased
	 *                              concatenated tokens
	 * @return {@code true} if the term should be filtered out
	 */
	public static boolean kristinasChemicalShortTokenFilterRule(String term, Set<String> stopwordsForFiltering) {
		ChemicalSBDtokenizer tokenizer = new ChemicalSBDtokenizer();
		tokenizer.tokenize(term.toLowerCase());

		// StringBuilder avoids re-copying the growing string for every token.
		StringBuilder joined = new StringBuilder();
		for (String token : tokenizer.tokens) {
			joined.append(token);
		}
		String tokenizedTerm = joined.toString();

		return tokenizedTerm.length() < minWordSize
				|| StringUtilities.isNumber(tokenizedTerm)
				|| StringUtilities.isRomanNumeral(tokenizedTerm.toUpperCase())
				|| stopwordsForFiltering.contains(tokenizedTerm);
	}

	/** Matches a literal {@code #} anywhere in a term. */
	public static Pattern signpattern = Pattern.compile("#");

	/**
	 * Tells whether a term contains a {@code #} character.
	 *
	 * @param term the term to test
	 * @return {@code true} if {@link #signpattern} is found in the term
	 */
	public static boolean signfilter(String term) {
		return signpattern.matcher(term).find();
	}

	/**
	 * Matches a parenthesised group of two to five colon-separated numbers,
	 * e.g. {@code (1:2)} or {@code (1:2:3)}, or of one to four comma-separated
	 * {@code <digits>CI} items, e.g. {@code (9CI)} or {@code (8CI,9CI)}.
	 */
	public static Pattern mixturePattern = Pattern.compile(
			"\\((\\d+):(\\d+)\\)"
			+ "|\\((\\d+):(\\d+):(\\d+)\\)"
			+ "|\\((\\d+):(\\d+):(\\d+):(\\d+)\\)"
			+ "|\\((\\d+):(\\d+):(\\d+):(\\d+):(\\d+)\\)"
			+ "|\\((\\d+)CI\\)"
			+ "|\\((\\d+)CI,(\\d+)CI\\)"
			+ "|\\((\\d+)CI,(\\d+)CI,(\\d+)CI\\)"
			+ "|\\((\\d+)CI,(\\d+)CI,(\\d+)CI,(\\d+)CI\\)");

	/**
	 * Tells whether a term contains a group matched by {@link #mixturePattern}.
	 *
	 * @param term the term to test
	 * @return {@code true} if the pattern is found anywhere in the term
	 */
	public static boolean mixturefilter(String term) {
		return mixturePattern.matcher(term).find();
	}

	/**
	 * Removes, in place, every term whose {@code text} equals the text of an
	 * earlier term in the list. The first occurrence is kept; comparison is
	 * exact (case-sensitive).
	 *
	 * @param terms the list to de-duplicate; must support {@code Iterator.remove}
	 */
	public static void removeDuplicateTerms(List<TermStore> terms) {
		Set<String> seenTexts = new HashSet<String>();
		Iterator<TermStore> iterator = terms.iterator();
		while (iterator.hasNext()) {
			// Set.add returns false when the text was already present.
			if (!seenTexts.add(iterator.next().text)) {
				iterator.remove();
			}
		}
	}

	/**
	 * Regular expression for a whitespace character followed by a
	 * {@code [...]} or {@code (...)} group that ends the term.
	 */
	public static final String allEndBracketsOrParenthesisNotGreedyPattern = "(\\s\\[[^]]*\\]$)|(\\s\\([^)]*\\)$)";

	/** Compiled form of {@link #allEndBracketsOrParenthesisNotGreedyPattern}. */
	public static Pattern allEndBracketsOrParenthesisNotGreedyPatternExp = Pattern
			.compile(allEndBracketsOrParenthesisNotGreedyPattern);

	/** Dictionary abbreviations recognised by {@link #rewriteNameForDictionaries}. */
	public static Set<String> dictionaries = getDictionaryNamesForChemicals();

	/**
	 * Returns the text between the brackets of a trailing-group match,
	 * lower-cased and trimmed.
	 *
	 * @param matched a match of {@link #allEndBracketsOrParenthesisNotGreedyPatternExp},
	 *                i.e. one whitespace character, an opening bracket, the
	 *                content and a closing bracket
	 */
	private static String trailingGroupContent(String matched) {
		// Skip the leading whitespace and opening bracket, drop the closing one.
		return matched.substring(2, matched.length() - 1).toLowerCase().trim();
	}

	/**
	 * Strips a trailing {@code [...]} or {@code (...)} group when it names one
	 * of the {@link #dictionaries}.
	 * <p>
	 * The group qualifies when its lower-cased content equals a dictionary name,
	 * or contains a dictionary name as a substring together with a {@code :} or
	 * a {@code /}.
	 *
	 * @param term the term to inspect
	 * @return the trimmed term without the trailing group, or the empty string
	 *         when the term has no such group or the group does not qualify
	 */
	public static String rewriteNameForDictionaries(String term) {
		Matcher m = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
		// The pattern is anchored at the end of the term, so there is at most
		// one match; no loop is needed.
		if (!m.find()) {
			return "";
		}
		String match = trailingGroupContent(m.group());
		boolean hasSeparator = match.contains(":") || match.contains("/");
		for (String dict : dictionaries) {
			String lcDict = dict.toLowerCase();
			if (match.equals(lcDict) || (hasSeparator && match.contains(lcDict))) {
				return m.replaceAll("").trim();
			}
		}
		return "";
	}

	/**
	 * Strips a trailing {@code [...]} or {@code (...)} group from a term,
	 * whatever its content.
	 *
	 * @param term the term to inspect
	 * @return the trimmed term without the trailing group, or the empty string
	 *         when the term does not end in such a group
	 */
	public static String findAndRewriteParenthesesAndBracketsAtEndOfTermRule(String term) {
		Matcher m = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
		if (m.find()) {
			return m.replaceAll("").trim();
		}
		return "";
	}

	/**
	 * Regular expression for the phrase "Beilstein Handbook Reference". The
	 * parentheses form a capturing group; they are not matched literally.
	 */
	public static final String BeilsteinPatternString = "(Beilstein Handbook Reference)";

	/** Case-insensitive compiled form of {@link #BeilsteinPatternString}. */
	public static Pattern BeilsteinPattern = Pattern.compile(BeilsteinPatternString, Pattern.CASE_INSENSITIVE);

	/**
	 * Tells whether a term contains the phrase matched by
	 * {@link #BeilsteinPattern}, ignoring case.
	 *
	 * @param term the term to test
	 * @return {@code true} if the phrase occurs anywhere in the term
	 */
	public static boolean filterNameForBeilsteinPattern(String term) {
		return BeilsteinPattern.matcher(term).find();
	}

	/**
	 * Tells whether the lower-cased term is listed in the given list file.
	 * <p>
	 * The file is loaded through {@link #getUndesiredTermsToFilterOut(String)}
	 * on every call; callers that test many terms against the same file may
	 * prefer to load the set once and test membership themselves.
	 *
	 * @param term          the term to test
	 * @param termsToRemove name of the list file inside the curation folder
	 * @return {@code true} if the file lists the term
	 */
	public static boolean findAndSuppressChemicalMisc(String term, String termsToRemove) {
		Set<String> miscTerms = getUndesiredTermsToFilterOut(termsToRemove);
		return miscTerms.contains(term.toLowerCase());
	}

	/** Concept identifiers loaded from {@code conceptsToRemove.txt} at class initialisation. */
	public static Set<Integer> miscConcepts = getUndesiredConceptsToFilterOut();

	/**
	 * Tells whether the concept's identifier is contained in
	 * {@link #miscConcepts}.
	 *
	 * @param concept the concept to test
	 * @return {@code true} if the concept is listed for suppression
	 */
	public static boolean findAndSuppressChemicalMiscConcept(Concept concept) {
		return miscConcepts.contains(concept.getID());
	}

	/** Names loaded from {@code pharmaceuticalCompanies.txt} at class initialisation (trimmed, lower-cased). */
	public static Set<String> pharmas = getPharmaceuticalCompanies();

	/**
	 * Strips a trailing {@code [...]} or {@code (...)} group when its
	 * lower-cased content equals, ignoring the case of the entry, one of the
	 * names in {@link #pharmas}.
	 *
	 * @param term the term to inspect
	 * @return the trimmed term without the trailing group, or the empty string
	 *         when the term has no such group or the group is not a listed name
	 */
	public static String rewriteNameForPharmas(String term) {
		Matcher m = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
		// End-anchored pattern: at most one match per term.
		if (!m.find()) {
			return "";
		}
		String match = trailingGroupContent(m.group());
		for (String pharma : pharmas) {
			if (match.equals(pharma.toLowerCase())) {
				return m.replaceAll("").trim();
			}
		}
		return "";
	}

	/**
	 * Cuts a term at its first {@code |} character.
	 *
	 * @param term the term to inspect
	 * @return the part before the first {@code |}, or the empty string when the
	 *         term contains no {@code |}
	 */
	public static String rewriteNameForPattern(String term) {
		int bar = term.indexOf("|");
		if (bar < 0) {
			return "";
		}
		return term.substring(0, bar);
	}

	/**
	 * Builds the set of dictionary abbreviations that may appear in a trailing
	 * bracket group of a term.
	 *
	 * @return a new, modifiable set on every call
	 */
	public static Set<String> getDictionaryNamesForChemicals() {
		String[] names = { "BAN", "JAN", "INN", "USAN", "USP", "USP X", "USP XXI", "NF", "NF X", "NF XII",
				"NF XIII", "NF XIV", "ISO", "BSI", "NND", "ANSI", "UN", "RN", "DCIT", "DCF", "IUPAC", "ESA", "JP",
				"VAN", "TN", "JP15" };
		Set<String> dictionaryNames = new HashSet<String>();
		for (String name : names) {
			dictionaryNames.add(name);
		}
		return dictionaryNames;
	}

	/**
	 * Loads the class-path resource {@code termsWithinParentesesToRemove.txt}
	 * (resolved relative to this class) as a set of trimmed, lower-cased lines.
	 * <p>
	 * If the resource is missing a message is printed to standard error and an
	 * empty set is returned; if reading fails the stack trace is printed and
	 * the lines read so far are returned.
	 *
	 * @return a new, modifiable set of the lines read
	 */
	public static Set<String> getUndesiredTermPartsToFilterOut() {
		Set<String> result = new HashSet<String>();
		java.io.InputStream stream = JochemCurator.class.getResourceAsStream("termsWithinParentesesToRemove.txt");
		if (stream == null) {
			// Same best-effort outcome as the file-based loaders on a missing file.
			System.err.println("Resource termsWithinParentesesToRemove.txt not found for "
					+ JochemCurator.class.getName());
			return result;
		}
		BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
		try {
			// readLine() == null is the end-of-stream signal; ready() only says
			// whether a read would block and must not be used for that.
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				result.add(line.trim().toLowerCase());
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				bufferedReader.close();
			} catch (IOException ignored) {
				// Nothing useful can be done when closing fails.
			}
		}
		return result;
	}

	/**
	 * Reads a text file into a set of trimmed, lower-cased lines. On an
	 * {@link IOException} the stack trace is printed and the lines read so far
	 * (possibly none) are returned.
	 */
	private static Set<String> readLowerCasedLines(File file) {
		Set<String> lines = new HashSet<String>();
		LineIterator it = null;
		try {
			it = FileUtils.lineIterator(file);
			while (it.hasNext()) {
				lines.add(it.next().trim().toLowerCase());
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			LineIterator.closeQuietly(it);
		}
		return lines;
	}

	/**
	 * Loads a list file from the curation folder as a set of trimmed,
	 * lower-cased lines.
	 *
	 * @param filename name of the file inside the curation folder
	 * @return a new, modifiable set; empty or partial if the file could not be
	 *         read (the stack trace is printed in that case)
	 */
	public static Set<String> getUndesiredTermsToFilterOut(String filename) {
		return readLowerCasedLines(curationFile(filename));
	}

	/**
	 * Loads the concept identifiers listed in {@code conceptsToRemove.txt} in
	 * the curation folder. Every line holds integers separated by {@code ;};
	 * whitespace around an identifier and empty entries are ignored.
	 *
	 * @return a new, modifiable set; empty or partial if the file could not be
	 *         read (the stack trace is printed in that case)
	 * @throws NumberFormatException if an entry is not an integer
	 */
	public static Set<Integer> getUndesiredConceptsToFilterOut() {
		Set<Integer> things = new HashSet<Integer>();
		LineIterator it = null;
		try {
			it = FileUtils.lineIterator(curationFile("conceptsToRemove.txt"));
			while (it.hasNext()) {
				for (String conceptNumber : it.next().split(";")) {
					// Trim each entry so "12; 13" parses instead of failing on " 13".
					String trimmed = conceptNumber.trim();
					if (trimmed.length() != 0) {
						things.add(Integer.parseInt(trimmed));
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			LineIterator.closeQuietly(it);
		}
		return things;
	}

	/**
	 * Removes from the ontology every concept whose identifier is listed in
	 * {@code conceptsToRemove.txt}. The file is re-read on each call;
	 * identifiers unknown to the ontology are skipped.
	 *
	 * @param ontology the ontology to modify
	 */
	public static void removeSuppressedConcepts(Ontology ontology) {
		for (Integer id : getUndesiredConceptsToFilterOut()) {
			if (ontology.getConcept(id) != null) {
				ontology.removeConcept(id);
			}
		}
	}

	/**
	 * Loads {@code pharmaceuticalCompanies.txt} from the curation folder as a
	 * set of trimmed, lower-cased lines.
	 *
	 * @return a new, modifiable set; empty or partial if the file could not be
	 *         read (the stack trace is printed in that case)
	 */
	public static Set<String> getPharmaceuticalCompanies() {
		return readLowerCasedLines(curationFile("pharmaceuticalCompanies.txt"));
	}

	/**
	 * Builds the set of semantic type identifiers treated as chemical.
	 *
	 * @return a new, modifiable sorted set on every call
	 */
	public static Set<Integer> getAllChemicalSemanticTypes() {
		int[] ids = { -103, -104, -109, -114, -115, -116, -118, -119, -110, -111, -196, -197, -120, -121, -195,
				-122, -123, -124, -125, -126, -127, -129, -192, -130, -131, -200 };
		Set<Integer> result = new TreeSet<Integer>();
		for (int id : ids) {
			result.add(id);
		}
		return result;
	}

	/**
	 * Builds the set of semantic type identifiers that are not wanted, from the
	 * current values of the static type fields of this class.
	 * <p>
	 * {@code immunologicFactor} (-129) and {@code chemicalViewedStructually}
	 * (-104) are currently disabled and therefore not part of the set.
	 *
	 * @return a new, modifiable sorted set on every call
	 */
	public static Set<Integer> getUndesiredSemanticTypes() {
		// Built at call time because the type fields are not final.
		int[] ids = { aminoacidPeptideOrProtein, enzyme, receptor, chemicalViewedFunctionally,
				biomedOrDentalMaterial, virus, plant, chemical, food, cell, geneOrGenome, spatialConcept,
				environmentalEffectOfHumans, bodySubstance, clinicalDrug, medicalDevice, cellComponent,
				nucleotideSequence, biomedicalOccupationOrdiscipline, manufacturedObject,
				bodyPartOrganOrOrganComponent, aminoAcidSequence, classification, drugDeliveryDevice, tissue,
				bacterium, fungus, molecularFunction };
		Set<Integer> result = new TreeSet<Integer>();
		for (int id : ids) {
			result.add(id);
		}
		return result;
	}

	// Semantic type identifiers used by getUndesiredSemanticTypes().
	static int aminoacidPeptideOrProtein = -116;
	static int enzyme = -126;
	static int receptor = -192;
	// immunologicFactor (-129) is currently disabled.
	static int chemicalViewedFunctionally = -120;
	// chemicalViewedStructually (-104) is currently disabled.
	static int biomedOrDentalMaterial = -122;
	static int virus = -5;
	static int plant = -2;
	static int chemical = -103;
	static int food = -168;
	static int cell = -25;
	static int geneOrGenome = -28;
	static int spatialConcept = -82;
	static int environmentalEffectOfHumans = -69;
	static int bodySubstance = -31;
	static int clinicalDrug = -200;
	static int medicalDevice = -74;
	static int cellComponent = -26;
	static int nucleotideSequence = -86;
	static int biomedicalOccupationOrdiscipline = -91;
	static int manufacturedObject = -73;
	static int bodyPartOrganOrOrganComponent = -23;
	static int aminoAcidSequence = -87;
	static int classification = -185;
	static int drugDeliveryDevice = -203;
	static int tissue = -24;
	static int bacterium = -7;
	static int fungus = -4;
	static int molecularFunction = -44;

}