/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.SharedCurationScripts;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.erasmusmc.collections.Pair;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
import casperSoftwareCode.Rules;
public class CasperForJochem {
public static Set<String> stopwordsForFiltering = OntologyUtilities.stopwordsForFiltering;
public static boolean chemicalShortTokenRule = true;
public static boolean shortTokenRule = false;
public static boolean dosagesRule = true;
public static boolean atsignRule = true;
public static boolean necRule = true;
public static boolean nosRule = true;
public static boolean ecNumbersRule = true;
public static boolean miscRule = true;
boolean suppressRules = true;
boolean rewriteRules = true;
boolean semanticTypesRule = true;
boolean syntacticInvRule = true;
boolean possessivesRule = true;
boolean shortformlongformRule = true;
public OntologyStore run(OntologyStore originalOntology, String logfilePath) {
System.out.println("Starting script: "+StringUtilities.now());
/** Create log */
WriteTextFile logFile = new WriteTextFile(logfilePath);
OntologyStore newOntology = new OntologyStore();
Set<Integer> includedCUIs = new HashSet<Integer>();
/** Create datatypes for homonym checks */
System.out.println("Creating datatypes for homonym checks...");
Set<String> allTerms = new HashSet<String>();
Iterator<Concept> conceptIteratorForHomonyms = originalOntology.getConceptIterator();
while (conceptIteratorForHomonyms.hasNext()) {
Concept conceptForHomonyms = conceptIteratorForHomonyms.next();
if (conceptForHomonyms.getID() > 0) {
Iterator<TermStore> termIteratorForHomonyms = conceptForHomonyms.getTerms().iterator();
while (termIteratorForHomonyms.hasNext()) {
TermStore term = termIteratorForHomonyms.next();
allTerms.add(term.text.toLowerCase());
}
}
}
/** Set ontology variables*/
int rewrittenTermsCount = 0;
int suppressedTermsCount = 0;
Concept concept = null;
System.out.println("Rewriting... ");
Iterator<Concept> conceptIterator = originalOntology.getConceptIterator();
int lineCount = 0;
while (conceptIterator.hasNext()) {
lineCount++;
if (lineCount % 10000 == 0)
System.out.println(lineCount);
concept = conceptIterator.next();
if (concept.getID() > 0) {
List<TermStore> terms = concept.getTerms();
List<TermStore> termsToAdd = new ArrayList<TermStore>();
Iterator<TermStore> termIterator = terms.iterator();
while (termIterator.hasNext()) {
TermStore term = termIterator.next();
boolean suppressed = false;
if(suppressRules){
String termToRewrite = term.text;
if(applySuppressRules(termToRewrite)){
logFile.writeln("TERM REMOVED DUE TO SUPPRESS RULE|" + termToRewrite + "|"+concept.getName() +"|"+ concept.getID());
termIterator.remove();
suppressed = true;
suppressedTermsCount++;
}
}
if (rewriteRules){
if (!suppressed){
if(semanticTypesRule){
String old = term.text;
String rewrittenTermText = RewriteRules.findAndRewriteParenthesesWithSemanticType(term.text);
if(!rewrittenTermText.equals("")){
if(suppressRules){
if(!applySuppressRules(rewrittenTermText) && !allTerms.contains(rewrittenTermText.toLowerCase())){
rewrittenTermsCount++;
term.text = rewrittenTermText;
logFile.writeln("REWRITTEN DUE TO SEMANTIC TYPE|"+term.text+"|"+ old + "|"+concept.getID());
}
}else if(!allTerms.contains(rewrittenTermText.toLowerCase())) {
rewrittenTermsCount++;
term.text = rewrittenTermText;
logFile.writeln("REWRITTEN DUE TO SEMANTIC TYPE|"+term.text+"|"+ old + "|"+concept.getID());
}
}
}
if(syntacticInvRule){
String termToRewrite = term.text;
String rewrittenTermText = RewriteRules.findAndRewriteSyntacticUniversion(termToRewrite);
if(!rewrittenTermText.equals("")){
if(suppressRules){
if(!applySuppressRules(rewrittenTermText) && !allTerms.contains(rewrittenTermText.toLowerCase())){
rewrittenTermsCount++;
TermStore rewrittenTerm = new TermStore(rewrittenTermText);
termsToAdd.add(rewrittenTerm);
logFile.writeln("ADDED DUE TO SYNTACTIC INVERSION|"+rewrittenTermText+"|"+ termToRewrite + "|"+concept.getID());
}
}else if(!allTerms.contains(rewrittenTermText.toLowerCase())) {
rewrittenTermsCount++;
TermStore rewrittenTerm = new TermStore(rewrittenTermText);
termsToAdd.add(rewrittenTerm);
logFile.writeln("ADDED DUE TO SYNTACTIC INVERSION|"+rewrittenTermText+"|"+ termToRewrite + "|"+concept.getID());
}
}
}
if(possessivesRule){
String rewrittenTermText = RewriteRules.findAndRewritePossessive(term.text);
if(!rewrittenTermText.equals("")){
if(suppressRules){
if(!applySuppressRules(rewrittenTermText) && !allTerms.contains(rewrittenTermText.toLowerCase())){
rewrittenTermsCount++;
TermStore rewrittenTerm = new TermStore(rewrittenTermText);
termsToAdd.add(rewrittenTerm);
logFile.writeln("ADDED DUE TO POSSESSIVE|"+rewrittenTermText+"|"+ term.text + "|"+concept.getID());
}
}else if(!allTerms.contains(rewrittenTermText.toLowerCase())){
rewrittenTermsCount++;
TermStore rewrittenTerm = new TermStore(rewrittenTermText);
termsToAdd.add(rewrittenTerm);
logFile.writeln("ADDED DUE TO POSSESSIVE|"+rewrittenTermText+"|"+ term.text + "|"+concept.getID());
}
}
}
if(shortformlongformRule){
String termToRewrite = term.text;
List<Pair<String, String>> sflf = RewriteRules.findShortformLongformPattern(termToRewrite);
if (sflf!=null){
List<String> rewrittenTermList = new ArrayList<String>();
rewrittenTermList.add(sflf.get(0).object1.trim());
rewrittenTermList.add(sflf.get(0).object2.trim());
for(String rewrittenTerm: rewrittenTermList){
if(suppressRules){
if(!applySuppressRules(rewrittenTerm) && !allTerms.contains(rewrittenTerm.toLowerCase())){
rewrittenTermsCount++;
TermStore rewrittenTermStore = new TermStore(rewrittenTerm);
termsToAdd.add(rewrittenTermStore);
logFile.writeln("ADDED DUE TO SHORT FORM OR LONG FORM|"+rewrittenTerm+"|"+ termToRewrite + "|"+concept.getID());
}
}else if(!allTerms.contains(rewrittenTerm.toLowerCase())){
rewrittenTermsCount++;
TermStore rewrittenTermStore = new TermStore(rewrittenTerm);
termsToAdd.add(rewrittenTermStore);
logFile.writeln("ADDED DUE TO SHORT FORM OR LONG FORM|"+rewrittenTerm+"|"+ termToRewrite + "|"+concept.getID());
}
}
}
}
}
}
}
if (!termsToAdd.isEmpty()){
terms.addAll(termsToAdd);
}
concept.setTerms(terms);
OntologyUtilities.removeDuplicateTerms(concept.getTerms());
}
if (!concept.getTerms().isEmpty() || concept.getID() < 0) {
includedCUIs.add(concept.getID());
newOntology.setConcept(concept);
}
}
// Copy relationships:
List<Relation> relations = originalOntology.getRelations();
for (Relation relation: relations)
if (includedCUIs.contains(relation.subject) && includedCUIs.contains(relation.object))
newOntology.setRelation(relation);
// Copy databaseIDs:
List<DatabaseID> databaseIDs;
for (int cui: includedCUIs) {
databaseIDs = originalOntology.getDatabaseIDsForConcept(cui);
if (databaseIDs != null)
for (DatabaseID databaseID: databaseIDs)
newOntology.setDatabaseIDForConcept(cui, databaseID);
}
/** Save to ontologyfile and log */
System.out.println("Closing logfile and saving new ontology: "+StringUtilities.now());
logFile.close();
System.out.println(suppressedTermsCount+ " terms were removed by suppress rules");
System.out.println(rewrittenTermsCount+ " terms were rewritten");
return newOntology;
}
public static boolean applySuppressRules(String term){
if(chemicalShortTokenRule){
if(JochemCurator.kristinasChemicalShortTokenFilterRule(term, stopwordsForFiltering)) return true;
}
if(shortTokenRule){
if(Rules.MartijnsFilterRule(term, stopwordsForFiltering)) return true;
}
if(dosagesRule){
if(RewriteRules.findAndSuppressDosages(term)) return true;
}
if(atsignRule){
if(RewriteRules.findAndSuppressAtSign(term)) return true;
}
if(ecNumbersRule){
if(RewriteRules.findAndSuppressECnumbers(term)) return true;
}
if(necRule){
if(RewriteRules.findAndSuppressNEC(term)) return true;
}
if(nosRule){
if(RewriteRules.findAndSuppressNOS(term)) return true;
}
if(miscRule){
if(RewriteRules.findAndSuppressMisc(term)) return true;
}
return false;
}
}