DetectPlainEnglishWords.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.ontology.ontologyutilities.evaluationScripts;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyClient;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyPSFLoader;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.peregrine.Tokenizer;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;

/**
 * Finds ontology terms that are also plain English words and writes them to a
 * text file.
 *
 * <p>All work is done by the constructor: it loads a set of English words
 * (from a word list file or from a WordNet ontology, depending on
 * {@link #wordlist}) and then compares every term of the given ontology
 * against that set.
 */
public class DetectPlainEnglishWords {

	/** Value of {@link #wordlist} that selects the plain-text word list file. */
	public static final int UKWORDS = 0;
	/** Value of {@link #wordlist} intended to select the WordNet ontology. */
	public static final int WORDNET = 1;
	/**
	 * Selects the word source. {@link #UKWORDS} loads the word list file; any
	 * other value loads the words from the WordNet ontology.
	 */
	public static int wordlist = UKWORDS;
	/** Terms that tokenize into more tokens than this are never reported. */
	public static int maxWordCount = 2;
	/** Name under which the WordNet ontology is fetched from the {@link OntologyManager}. */
	public static String wordNetOntologyName = "Wordnet3_0";
	/** Word list file used when the constructor receives {@code null} as word list file. */
	public static String defaultWordListFile = "/home/data/EnglishWords/ukwords.txt";

	/**
	 * Roman numerals that are removed from the WordNet-derived word set.
	 * NOTE(review): presumably because they occur as enumerators in term names
	 * rather than as English words -- confirm with the original author.
	 */
	private static final String[] ROMAN_NUMERALS = {
		"i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"
	};

	/** The loaded English words; every lookup is done with a lower-cased key. */
	private Set<String> words;
	private Tokenizer tokenizer = new SimpleTokenizer();

	/**
	 * Detects terms in the ontology that are also plain english words and
	 * writes them to the output file.
	 *
	 * @param ontology      the ontology whose terms are checked
	 * @param wordListFile  word list file to load when {@link #wordlist} is
	 *                      {@link #UKWORDS}; {@code null} loads
	 *                      {@link #defaultWordListFile}
	 * @param ouputFile     name of the file the detected terms are written to
	 */
	public DetectPlainEnglishWords(Ontology ontology, String wordListFile, String ouputFile){
		if (wordListFile == null) {
			wordListFile = defaultWordListFile;
		}
		if (wordlist == UKWORDS) {
			loadEnglishWords(wordListFile);
		} else {
			loadWordnetWords();
		}

		compare(ontology, ouputFile);
	}

	/**
	 * Fills {@link #words} with every entry read from the given word list file.
	 *
	 * <p>NOTE(review): entries are stored exactly as read, while all lookups use
	 * a lower-cased key, so entries containing upper-case characters can never
	 * match. Whether that is intended cannot be told from this class -- confirm
	 * before normalising the entries.
	 *
	 * @param wordListFile  the file to read the words from
	 */
	private void loadEnglishWords(String wordListFile) {
		words = new HashSet<String>();
		for (String word : new ReadTextFile(wordListFile)) {
			words.add(word);
		}
	}

	/**
	 * Checks every term of every concept in the ontology and writes those that
	 * {@link #isPlainEnglish(String)} accepts to the given file.
	 *
	 * @param ontology  the ontology whose terms are checked
	 * @param filename  name of the file the detected terms are written to
	 */
	private void compare(Ontology ontology, String filename) {
		WriteTextFile out = new WriteTextFile(filename);
		try {
			for (Concept concept : ontology) {
				for (TermStore term : concept.getTerms()) {
					if (isPlainEnglish(term.text)) {
						out.writeln(term.text);
					}
				}
			}
		} finally {
			// Close the writer even when iterating, tokenizing or writing fails.
			out.close();
		}
	}

	/**
	 * Decides whether a term text counts as plain English.
	 *
	 * <p>A text that {@link StringUtilities#isAbbr} accepts is never plain
	 * English. Otherwise the text is plain English when its lower-cased form is
	 * in {@link #words}, or when it tokenizes into at least one and at most
	 * {@link #maxWordCount} tokens, none of which is accepted by
	 * {@code isAbbr} and all of which are in {@link #words} when lower-cased.
	 *
	 * @param text  the term text to check
	 * @return {@code true} if the text is considered plain English
	 */
	private boolean isPlainEnglish(String text) {
		if (StringUtilities.isAbbr(text)) {
			return false;
		}
		// Locale.ROOT keeps lower-casing independent of the JVM default locale
		// (e.g. Turkish would map "I" to a dotless "i" and miss the lookup).
		if (words.contains(text.toLowerCase(Locale.ROOT))) {
			return true;
		}

		tokenizer.tokenize(text);
		int tokenCount = tokenizer.tokens.size();
		// A text without any tokens contains no word at all, so it must not be
		// reported just because the "all tokens are words" check is vacuously true.
		if (tokenCount == 0 || tokenCount > maxWordCount) {
			return false;
		}
		for (String token : tokenizer.tokens) {
			if (StringUtilities.isAbbr(token) || !words.contains(token.toLowerCase(Locale.ROOT))) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Fills {@link #words} with the lower-cased text of every term of every
	 * concept in the ontology named {@link #wordNetOntologyName}, and then
	 * removes the Roman numerals one to ten.
	 */
	private void loadWordnetWords() {
		OntologyManager manager = new OntologyManager();
		OntologyClient ontology = manager.fetchClient(wordNetOntologyName);
		words = new HashSet<String>();
		for (Concept concept : ontology) {
			for (TermStore term : concept.getTerms()) {
				words.add(term.text.toLowerCase(Locale.ROOT));
			}
		}
		for (String numeral : ROMAN_NUMERALS) {
			words.remove(numeral);
		}
	}
}