/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities.evaluationScripts;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyClient;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyPSFLoader;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.peregrine.Tokenizer;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
/**
 * Finds terms in an ontology that are also plain English words and writes
 * them to an output file.
 *
 * <p>All work is done in the constructor: a word set is loaded (from a text
 * file or from a WordNet ontology, depending on {@link #wordlist}), and every
 * term of every concept in the given ontology is checked against that set.
 */
public class DetectPlainEnglishWords {
    /** Value for {@link #wordlist}: load the words from a plain text file. */
    public static final int UKWORDS = 0;

    /** Value for {@link #wordlist}: load the words from the WordNet ontology. */
    public static final int WORDNET = 1;

    /**
     * Selects the word source. {@link #UKWORDS} reads the word list file; any
     * other value loads the ontology named by {@link #wordNetOntologyName}.
     */
    public static int wordlist = UKWORDS;

    /**
     * Terms that are not found as a whole are only checked token by token
     * when they have at most this many tokens.
     */
    public static int maxWordCount = 2;

    /** Name of the ontology fetched when the WordNet word source is selected. */
    public static String wordNetOntologyName = "Wordnet3_0";

    /** Word list file used when the constructor receives {@code null}. */
    public static String defaultWordListFile = "/home/data/EnglishWords/ukwords.txt";

    /** Entries removed from the WordNet word set: the strings "i" through "x". */
    private static final String[] EXCLUDED_WORDNET_WORDS = {
        "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"
    };

    private Set<String> words;
    private Tokenizer tokenizer = new SimpleTokenizer();

    /**
     * Detects terms in the ontology that are also plain english words.
     *
     * @param ontology ontology whose concepts and terms are checked
     * @param wordListFile word list file to read when {@link #wordlist} is
     *        {@link #UKWORDS}; null loads default word list
     * @param ouputFile name of the file the matching terms are written to
     */
    public DetectPlainEnglishWords(Ontology ontology, String wordListFile, String ouputFile) {
        if (wordListFile == null) {
            wordListFile = defaultWordListFile;
        }
        if (wordlist == UKWORDS) {
            loadEnglishWords(wordListFile);
        } else {
            loadWordnetWords();
        }
        compare(ontology, ouputFile);
    }

    /**
     * Fills the word set with the entries of the given word list file.
     *
     * <p>Entries are stored exactly as read, while lookups use lower-cased
     * text, so an entry containing upper-case characters can never match.
     * NOTE(review): confirm whether that is intended (e.g. to keep
     * capitalised entries such as proper nouns out of the comparison).
     *
     * @param wordListFile file to read the words from
     */
    private void loadEnglishWords(String wordListFile) {
        words = new HashSet<String>();
        for (String word : new ReadTextFile(wordListFile)) {
            words.add(word);
        }
    }

    /**
     * Writes every term of the ontology that qualifies as plain English to
     * the given file.
     *
     * @param ontology ontology whose terms are checked
     * @param filename name of the output file
     */
    private void compare(Ontology ontology, String filename) {
        WriteTextFile out = new WriteTextFile(filename);
        try {
            for (Concept concept : ontology) {
                for (TermStore term : concept.getTerms()) {
                    if (isPlainEnglish(term.text)) {
                        out.writeln(term.text);
                    }
                }
            }
        } finally {
            // Close the output even when reading the ontology fails half-way.
            out.close();
        }
    }

    /**
     * Decides whether a term counts as plain English.
     *
     * <p>A term qualifies when it is not an abbreviation and either its
     * lower-cased text is in the word set, or it splits into between one and
     * {@link #maxWordCount} tokens that are all non-abbreviations found in
     * the word set.
     *
     * @param text term text to check
     * @return {@code true} when the term should be reported
     */
    private boolean isPlainEnglish(String text) {
        if (StringUtilities.isAbbr(text)) {
            return false;
        }
        // Locale.ROOT keeps the lookup independent of the JVM default locale.
        if (words.contains(text.toLowerCase(Locale.ROOT))) {
            return true;
        }
        tokenizer.tokenize(text);
        int tokenCount = tokenizer.tokens.size();
        // A term without any token has no word that could match, so it must
        // not be reported merely because there is nothing to disprove it.
        if (tokenCount == 0 || tokenCount > maxWordCount) {
            return false;
        }
        for (String token : tokenizer.tokens) {
            if (StringUtilities.isAbbr(token)
                    || !words.contains(token.toLowerCase(Locale.ROOT))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Fills the word set with the lower-cased terms of the ontology named by
     * {@link #wordNetOntologyName}, then removes the entries listed in
     * {@link #EXCLUDED_WORDNET_WORDS}.
     */
    private void loadWordnetWords() {
        OntologyManager manager = new OntologyManager();
        OntologyClient ontology = manager.fetchClient(wordNetOntologyName);
        words = new HashSet<String>();
        for (Concept concept : ontology) {
            for (TermStore term : concept.getTerms()) {
                words.add(term.text.toLowerCase(Locale.ROOT));
            }
        }
        // Presumably dropped because these read as roman numerals inside
        // ontology terms rather than as English words -- TODO confirm.
        for (String excluded : EXCLUDED_WORDNET_WORDS) {
            words.remove(excluded);
        }
    }
}