/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.SharedCurationScripts;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.ChemicalSBDtokenizer;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Collection of static curation rules used while building the Jochem chemical
 * dictionary: term filters (short tokens, '#' signs, mixture ratios, Beilstein
 * references), term rewrite rules (stripping trailing bracketed annotations),
 * loaders for the curation word lists, and sets of semantic type identifiers.
 *
 * <p>All members are static; the class is not meant to be instantiated. Note
 * that loading the class reads {@code conceptsToRemove.txt} and
 * {@code pharmaceuticalCompanies.txt} from the curation folder (see
 * {@link #getUndesiredConceptsToFilterOut()} and
 * {@link #getPharmaceuticalCompanies()}).
 */
public class JochemCurator {

    /** Name of the system property that overrides the curation folder. */
    private static final String FOLDER_PATH_PROPERTY = "jochem.curation.folder";

    /** Curation folder used when {@link #FOLDER_PATH_PROPERTY} is not set. */
    private static final String DEFAULT_FOLDER_PATH = "/home/bhsingh/Code/workspace/erasmus/trunk/src/DataImport/src/JochemBuilder/SharedCurationScripts/";

    /**
     * Folder holding the curation text files. File names are appended
     * directly to this value, so an override supplied through the
     * {@code jochem.curation.folder} system property must end with a path
     * separator. Declared before the static sets below because their
     * initializers read files from this folder.
     */
    private static final String FOLDER_PATH = System.getProperty(FOLDER_PATH_PROPERTY, DEFAULT_FOLDER_PATH);

    /** Minimum length (in characters) a tokenized term must have to be kept. */
    public static int minWordSize = 2;

    /** Matches any term containing a '#' character. */
    public static Pattern signpattern = Pattern.compile("#");

    /**
     * Matches parenthesized ratio annotations: two to five colon-separated
     * integers such as {@code (1:1)} or {@code (1:2:1)}, or one to four
     * comma-separated {@code <digits>CI} items such as {@code (9CI)} or
     * {@code (8CI,9CI)}.
     */
    public static Pattern mixturePattern = Pattern
            .compile("\\((\\d+):(\\d+)\\)|\\((\\d+):(\\d+):(\\d+)\\)|\\((\\d+):(\\d+):(\\d+):(\\d+)\\)|\\((\\d+):(\\d+):(\\d+):(\\d+):(\\d+)\\)|\\((\\d+)CI\\)|\\((\\d+)CI,(\\d+)CI\\)|\\((\\d+)CI,(\\d+)CI,(\\d+)CI\\)|\\((\\d+)CI,(\\d+)CI,(\\d+)CI,(\\d+)CI\\)");

    /**
     * Regex for a single whitespace character followed by a {@code [...]} or
     * {@code (...)} group that ends the term. The group content may not
     * contain the closing delimiter, so only the last group is matched.
     */
    public static final String allEndBracketsOrParenthesisNotGreedyPattern = "(\\s\\[[^]]*\\]$)|(\\s\\([^)]*\\)$)";

    /** Compiled form of {@link #allEndBracketsOrParenthesisNotGreedyPattern}. */
    public static Pattern allEndBracketsOrParenthesisNotGreedyPatternExp = Pattern
            .compile(allEndBracketsOrParenthesisNotGreedyPattern);

    /** Dictionary abbreviations recognized by {@link #rewriteNameForDictionaries(String)}. */
    public static Set<String> dictionaries = getDictionaryNamesForChemicals();

    /** Regex source for the Beilstein reference filter. */
    public static final String BeilsteinPatternString = "(Beilstein Handbook Reference)";

    /** Case-insensitive compiled form of {@link #BeilsteinPatternString}. */
    public static Pattern BeilsteinPattern = Pattern.compile(BeilsteinPatternString, Pattern.CASE_INSENSITIVE);

    /** Concept IDs to suppress, loaded once from {@code conceptsToRemove.txt}. */
    public static Set<Integer> miscConcepts = getUndesiredConceptsToFilterOut();

    /** Lower-cased company names, loaded once from {@code pharmaceuticalCompanies.txt}. */
    public static Set<String> pharmas = getPharmaceuticalCompanies();

    /**
     * Decides whether a term should be filtered out because, once tokenized,
     * it is too short or uninformative.
     *
     * <p>The term is lower-cased, tokenized with {@link ChemicalSBDtokenizer}
     * and the tokens are concatenated without separators. The result is
     * rejected when it is shorter than {@link #minWordSize}, when
     * {@code StringUtilities.isNumber} accepts it, when its upper-cased form
     * is accepted by {@code StringUtilities.isRomanNumeral}, or when it is
     * contained in {@code stopwordsForFiltering}.
     *
     * @param term the term to test
     * @param stopwordsForFiltering stop words, compared against the
     *        concatenated lower-case tokens
     * @return {@code true} if the term should be filtered out
     */
    public static boolean kristinasChemicalShortTokenFilterRule(String term, Set<String> stopwordsForFiltering) {
        ChemicalSBDtokenizer tokenizer = new ChemicalSBDtokenizer();
        tokenizer.tokenize(term.toLowerCase());

        // Glue the tokens back together; a builder avoids quadratic copying.
        StringBuilder joined = new StringBuilder();
        for (String token : tokenizer.tokens) {
            joined.append(token);
        }
        String tokenizedTerm = joined.toString();

        return tokenizedTerm.length() < minWordSize
                || StringUtilities.isNumber(tokenizedTerm)
                || StringUtilities.isRomanNumeral(tokenizedTerm.toUpperCase())
                || stopwordsForFiltering.contains(tokenizedTerm);
    }

    /**
     * Tests whether the term contains a '#' character.
     *
     * @param term the term to test
     * @return {@code true} if {@link #signpattern} is found in the term
     */
    public static boolean signfilter(String term) {
        return signpattern.matcher(term).find();
    }

    /**
     * Tests whether the term contains a ratio annotation.
     *
     * @param term the term to test
     * @return {@code true} if {@link #mixturePattern} is found in the term
     */
    public static boolean mixturefilter(String term) {
        return mixturePattern.matcher(term).find();
    }

    /**
     * Removes, in place, every term whose text equals the text of an earlier
     * term in the list. The first occurrence is kept; comparison is exact
     * (case-sensitive).
     *
     * @param terms the list to de-duplicate; must support
     *        {@link Iterator#remove()}
     */
    public static void removeDuplicateTerms(List<TermStore> terms) {
        Set<String> seenTexts = new HashSet<String>();
        Iterator<TermStore> iterator = terms.iterator();
        while (iterator.hasNext()) {
            // add() returns false when the text was already present.
            if (!seenTexts.add(iterator.next().text)) {
                iterator.remove();
            }
        }
    }

    /**
     * Strips a trailing bracketed or parenthesized annotation when it names
     * one of the known {@link #dictionaries}.
     *
     * <p>The annotation qualifies when, compared in lower case, it equals a
     * dictionary name, or contains a dictionary name together with a ':' or
     * a '/'.
     *
     * @param term the term to rewrite
     * @return the term without the trailing annotation, trimmed; the empty
     *         string when nothing was rewritten
     */
    public static String rewriteNameForDictionaries(String term) {
        Matcher matcher = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
        // The pattern is anchored at the end of the term, so it matches at
        // most once; a single find() is sufficient.
        if (!matcher.find()) {
            return "";
        }
        String annotation = trailingAnnotation(matcher);
        boolean hasSeparator = annotation.contains(":") || annotation.contains("/");
        for (String dict : dictionaries) {
            String name = dict.toLowerCase();
            if (annotation.equals(name) || (hasSeparator && annotation.contains(name))) {
                return matcher.replaceAll("").trim();
            }
        }
        return "";
    }

    /**
     * Strips any trailing bracketed or parenthesized group from the term.
     *
     * @param term the term to rewrite
     * @return the term without the trailing group, trimmed; the empty string
     *         when the term has no such group
     */
    public static String findAndRewriteParenthesesAndBracketsAtEndOfTermRule(String term) {
        Matcher matcher = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
        if (matcher.find()) {
            return matcher.replaceAll("").trim();
        }
        return "";
    }

    /**
     * Tests whether the term mentions a Beilstein handbook reference.
     *
     * @param term the term to test
     * @return {@code true} if {@link #BeilsteinPattern} is found in the term
     */
    public static boolean filterNameForBeilsteinPattern(String term) {
        return BeilsteinPattern.matcher(term).find();
    }

    /**
     * Tests whether the lower-cased term is listed in the given curation file.
     *
     * <p>The file is re-read on every call through
     * {@link #getUndesiredTermsToFilterOut(String)}; callers checking many
     * terms may prefer to load the set once and query it directly. A former
     * additional rule (terms containing " venom ") is disabled.
     *
     * @param term the term to test
     * @param termsToRemove file name, relative to the curation folder
     * @return {@code true} if the term should be suppressed
     */
    public static boolean findAndSuppressChemicalMisc(String term, String termsToRemove) {
        return getUndesiredTermsToFilterOut(termsToRemove).contains(term.toLowerCase());
    }

    /**
     * Tests whether the concept is in the suppression list.
     *
     * @param concept the concept to test
     * @return {@code true} if the concept's ID is in {@link #miscConcepts}
     */
    public static boolean findAndSuppressChemicalMiscConcept(Concept concept) {
        return miscConcepts.contains(concept.getID());
    }

    /**
     * Strips a trailing bracketed or parenthesized annotation when, compared
     * in lower case, it equals one of the names in {@link #pharmas}.
     *
     * @param term the term to rewrite
     * @return the term without the trailing annotation, trimmed; the empty
     *         string when nothing was rewritten
     */
    public static String rewriteNameForPharmas(String term) {
        Matcher matcher = allEndBracketsOrParenthesisNotGreedyPatternExp.matcher(term);
        // End-anchored pattern: at most one match, so no loop is needed.
        if (!matcher.find()) {
            return "";
        }
        String annotation = trailingAnnotation(matcher);
        for (String company : pharmas) {
            if (annotation.equals(company.toLowerCase())) {
                return matcher.replaceAll("").trim();
            }
        }
        return "";
    }

    /**
     * Cuts the term at its first '|' character.
     *
     * @param term the term to rewrite
     * @return the text before the first '|'; the empty string when the term
     *         contains no '|' (and also when the term starts with '|')
     */
    public static String rewriteNameForPattern(String term) {
        int barIndex = term.indexOf("|");
        if (barIndex < 0) {
            return "";
        }
        return term.substring(0, barIndex);
    }

    /**
     * Builds the set of dictionary abbreviations that may appear as a
     * trailing annotation of a chemical name.
     *
     * @return a new, modifiable set of abbreviations
     */
    public static Set<String> getDictionaryNamesForChemicals() {
        String[] names = {
            "BAN", "JAN", "INN", "USAN", "USP", "USP X", "USP XXI",
            "NF", "NF X", "NF XII", "NF XIII", "NF XIV",
            "ISO", "BSI", "NND", "ANSI", "UN", "RN", "DCIT", "DCF",
            "IUPAC", "ESA", "JP", "VAN", "TN", "JP15"
        };
        Set<String> dictionaryNames = new HashSet<String>();
        for (String name : names) {
            dictionaryNames.add(name);
        }
        return dictionaryNames;
    }

    /**
     * Loads the classpath resource {@code termsWithinParentesesToRemove.txt}
     * (resolved relative to this class) as a set of trimmed, lower-cased
     * lines.
     *
     * <p>A missing resource is reported on standard error and an I/O failure
     * prints its stack trace; in both cases the lines read so far (possibly
     * none) are returned.
     *
     * @return a new, modifiable set with one entry per distinct line
     */
    public static Set<String> getUndesiredTermPartsToFilterOut() {
        Set<String> result = new HashSet<String>();
        java.io.InputStream stream = JochemCurator.class.getResourceAsStream("termsWithinParentesesToRemove.txt");
        if (stream == null) {
            // getResourceAsStream signals "not found" with null, not an exception.
            System.err.println("JochemCurator: resource termsWithinParentesesToRemove.txt not found");
            return result;
        }
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
        try {
            // readLine() returning null is the reliable end-of-stream signal;
            // ready() only says whether a read would block.
            String line = bufferedReader.readLine();
            while (line != null) {
                result.add(line.trim().toLowerCase());
                line = bufferedReader.readLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                bufferedReader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails after reading.
            }
        }
        return result;
    }

    /**
     * Loads a file from the curation folder as a set of trimmed, lower-cased
     * lines. An I/O failure prints its stack trace and the lines read so far
     * (possibly none) are returned.
     *
     * @param filename file name, appended directly to the curation folder path
     * @return a new, modifiable set with one entry per distinct line
     */
    public static Set<String> getUndesiredTermsToFilterOut(String filename) {
        Set<String> result = new HashSet<String>();
        LineIterator lines = null;
        try {
            lines = FileUtils.lineIterator(new File(FOLDER_PATH + filename));
            while (lines.hasNext()) {
                result.add(lines.next().trim().toLowerCase());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            LineIterator.closeQuietly(lines);
        }
        return result;
    }

    /**
     * Loads the concept IDs listed in {@code conceptsToRemove.txt} in the
     * curation folder. Each line holds one or more integers separated by
     * ';'; blank entries and surrounding whitespace are ignored. An I/O
     * failure prints its stack trace and the IDs read so far are returned.
     *
     * @return a new, modifiable set of concept IDs
     * @throws NumberFormatException if an entry is not a valid integer
     */
    public static Set<Integer> getUndesiredConceptsToFilterOut() {
        Set<Integer> conceptIds = new HashSet<Integer>();
        LineIterator lines = null;
        try {
            lines = FileUtils.lineIterator(new File(FOLDER_PATH + "conceptsToRemove.txt"));
            while (lines.hasNext()) {
                for (String entry : lines.next().trim().split(";")) {
                    // Trim each entry so "12; 13" parses instead of failing on " 13".
                    String conceptNumber = entry.trim();
                    if (conceptNumber.length() != 0) {
                        conceptIds.add(Integer.parseInt(conceptNumber));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            LineIterator.closeQuietly(lines);
        }
        return conceptIds;
    }

    /**
     * Removes from the ontology every concept listed in
     * {@code conceptsToRemove.txt}. The file is read afresh for this call;
     * IDs that are not present in the ontology are skipped.
     *
     * @param ontology the ontology to modify in place
     */
    public static void removeSuppressedConcepts(Ontology ontology) {
        for (Integer id : getUndesiredConceptsToFilterOut()) {
            if (ontology.getConcept(id) != null) {
                ontology.removeConcept(id);
            }
        }
    }

    /**
     * Loads {@code pharmaceuticalCompanies.txt} from the curation folder as
     * a set of trimmed, lower-cased lines.
     *
     * @return a new, modifiable set of company names
     */
    public static Set<String> getPharmaceuticalCompanies() {
        return getUndesiredTermsToFilterOut("pharmaceuticalCompanies.txt");
    }

    /**
     * Builds the set of semantic type identifiers treated as chemical.
     *
     * @return a new, modifiable sorted set of identifiers
     */
    public static Set<Integer> getAllChemicalSemanticTypes() {
        int[] ids = {
            -103, -104, -109, -114, -115, -116, -118, -119, -110, -111,
            -196, -197, -120, -121, -195, -122, -123, -124, -125, -126,
            -127, -129, -192, -130, -131, -200
        };
        Set<Integer> result = new TreeSet<Integer>();
        for (int id : ids) {
            result.add(id);
        }
        return result;
    }

    /**
     * Builds the set of semantic type identifiers that are not wanted in the
     * dictionary, from the current values of the identifier fields declared
     * at the end of this class.
     *
     * @return a new, modifiable sorted set of identifiers
     */
    public static Set<Integer> getUndesiredSemanticTypes() {
        // immunologicFactor and chemicalViewedStructually are deliberately
        // left out (their declarations are disabled below).
        int[] ids = {
            aminoacidPeptideOrProtein, enzyme, receptor,
            chemicalViewedFunctionally, biomedOrDentalMaterial, virus, plant,
            chemical, food, cell, geneOrGenome, spatialConcept,
            environmentalEffectOfHumans, bodySubstance, clinicalDrug,
            medicalDevice, cellComponent, nucleotideSequence,
            biomedicalOccupationOrdiscipline, manufacturedObject,
            bodyPartOrganOrOrganComponent, aminoAcidSequence, classification,
            drugDeliveryDevice, tissue, bacterium, fungus, molecularFunction
        };
        Set<Integer> result = new TreeSet<Integer>();
        for (int id : ids) {
            result.add(id);
        }
        return result;
    }

    /**
     * Extracts the content of the trailing group found by the matcher: the
     * leading whitespace character, the opening delimiter and the closing
     * delimiter are dropped, and the rest is lower-cased and trimmed.
     */
    private static String trailingAnnotation(Matcher matcher) {
        String group = matcher.group();
        return group.substring(2, group.length() - 1).toLowerCase().trim();
    }

    // Semantic type identifiers used by getUndesiredSemanticTypes().
    // NOTE(review): presumably negated UMLS semantic type numbers - confirm.
    static int aminoacidPeptideOrProtein = -116;
    static int enzyme = -126;
    static int receptor = -192;
    // static int immunologicFactor = -129;
    static int chemicalViewedFunctionally = -120;
    // static int chemicalViewedStructually = -104;
    static int biomedOrDentalMaterial = -122;
    static int virus = -5;
    static int plant = -2;
    static int chemical = -103;
    static int food = -168;
    static int cell = -25;
    static int geneOrGenome = -28;
    static int spatialConcept = -82;
    static int environmentalEffectOfHumans = -69;
    static int bodySubstance = -31;
    static int clinicalDrug = -200;
    static int medicalDevice = -74;
    static int cellComponent = -26;
    static int nucleotideSequence = -86;
    static int biomedicalOccupationOrdiscipline = -91;
    static int manufacturedObject = -73;
    static int bodyPartOrganOrOrganComponent = -23;
    static int aminoAcidSequence = -87;
    static int classification = -185;
    static int drugDeliveryDevice = -203;
    static int tissue = -24;
    static int bacterium = -7;
    static int fungus = -4;
    static int molecularFunction = -44;
}