package maui.vocab;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
/**
* Builds an index with the content of the controlled vocabulary.
* Accepts vocabularies as gzipped rdf files (SKOS format, vocabulary_name.rdf.gz) and in plain text format:
* vocabulary_name.en (with "ID TERM" per line) - descriptors & non-descriptors
* vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
* vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line)
*
* @author Olena Medelyan
*/
public class VocabularyJena implements Serializable, Vocabulary {

    private static final long serialVersionUID = 1L;

    // The four file locations are per-instance state: they are assigned in the
    // constructor, so keeping them static would let a second vocabulary silently
    // redirect the first one to the wrong files.

    /** Location of the gzipped rdf version of the controlled vocabulary;
     * it needs to be in the SKOS format! Only set for the "skos" format. */
    private File skosFile;

    /** Location of the vocabulary's *.en file
     * containing all terms of the vocabularies and their ids. Only set for the "text" format. */
    private File enFile;

    /** Location of the vocabulary's *.use file
     * containing ids of non-descriptor with the corresponding ids of descriptors. */
    private File useFile;

    /** Location of the vocabulary's *.rel file
     * containing semantically related terms for each descriptor in the vocabulary. */
    private File relFile;

    /** Vocabulary format as passed to the constructor: "skos" or "text". */
    private String vocabularyFormat;

    /** index : descriptor --> id */
    private HashMap<String, String> termIdIndex;

    /** reverse index : id --> descriptor */
    private HashMap<String, String> idTermIndex;

    /** normalized descriptor --> list of all possible meanings */
    private HashMap<String, Vector<String>> listsOfSenses;

    /** non-descriptor id --> descriptors id */
    private HashMap<String, String> nonDescriptorIndex = null;

    /** id --> list of related ids */
    private HashMap<String, Vector<String>> listsOfRelatedTerms = null;

    /** id-relatedId --> relation (only filled by {@link #buildSKOS()}) */
    private HashMap<String, String> relationIndex = null;

    /** Document language */
    private String language = "en";

    /** Document encoding, used for reading every vocabulary file */
    private String encoding = "UTF-8";

    /** Default stemmer to be used (may be null: no stemming) */
    private Stemmer stemmer;

    /** List of stopwords to be used (may be null: no stopword removal) */
    private Stopwords stopwords;

    /** Normalization to lower case - default true */
    private boolean toLowerCase = true;

    /** Normalization via alphabetic reordering - default true */
    private boolean reorder = true;

    /** When true, progress and statistics are printed to System.err */
    private boolean debugMode = false;

    /** Number of prefLabel statements seen by the last {@link #buildSKOS()} run. */
    int numConcepts = 0;

    /**
     * @return the number of prefLabel statements counted while reading the SKOS
     *         file; stays 0 for the text format, which never updates the counter.
     */
    public int getSize() {
        return numConcepts;
    }

    /** Vocabulary constructor.
     *
     * Given the name of the vocabulary and the format, it checks whether
     * the given directory contains the required files:<br>
     * - vocabularyName.rdf.gz if skos format is selected<br>
     * - or a set of 3 flat txt files starting with vocabularyName and with extensions<br>
     * <li>.en (id term)
     * <li>.use (non-descriptor \t descriptor)
     * <li>.rel (id \t related_id1 related_id2 ...)
     * The index itself is only built later, by {@link #initialize()}.
     *
     * @param vocabularyName The name of the vocabulary file (before extension).
     * @param vocabularyFormat The format of the vocabulary (skos or text).
     * @param vocabularyDirectory The directory that contains the vocabulary files.
     * @throws Exception if a required file is missing or the format is unsupported
     * */
    public VocabularyJena(String vocabularyName, String vocabularyFormat, String vocabularyDirectory)
            throws Exception {
        this.vocabularyFormat = vocabularyFormat;
        String basePath = vocabularyDirectory + "/" + vocabularyName;

        if (vocabularyFormat.equals("skos")) {
            skosFile = new File(basePath + ".rdf.gz");
            if (!skosFile.exists()) {
                throw new Exception("File " + skosFile.getAbsolutePath() + " not found!");
            }
        } else if (vocabularyFormat.equals("text")) {
            enFile = new File(basePath + ".en");
            useFile = new File(basePath + ".use");
            relFile = new File(basePath + ".rel");
            requireExists(enFile);
            requireExists(useFile);
            requireExists(relFile);
        } else {
            throw new Exception(vocabularyFormat
                    + " is an unsupported vocabulary format! Use skos or text");
        }
    }

    /** Throws if the given text-format vocabulary file is missing. */
    private static void requireExists(File file) throws Exception {
        if (!file.exists()) {
            throw new Exception("File " + file.getAbsolutePath() + " does not exist.");
        }
    }

    /** Sets the language whose labels are read from a SKOS file. */
    @Override
    public void setLanguage(String language) {
        this.language = language;
    }

    /** Sets the character encoding used to read the vocabulary files. */
    @Override
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /** Switches lower-casing during phrase normalization on or off. */
    @Override
    public void setLowerCase(boolean toLowerCase) {
        this.toLowerCase = toLowerCase;
    }

    /** Switches alphabetic word reordering during normalization on or off. */
    @Override
    public void setReorder(boolean reorder) {
        this.reorder = reorder;
    }

    /** Sets the stemmer applied to each word during normalization. */
    @Override
    public void setStemmer(Stemmer stemmer) {
        this.stemmer = stemmer;
    }

    /** Enables or disables diagnostic output on System.err. */
    @Override
    public void setDebug(boolean debugMode) {
        this.debugMode = debugMode;
    }

    /**
     * Builds all indexes for the format chosen in the constructor:
     * the SKOS index, or the three text indexes (.en, .use, .rel).
     */
    @Override
    public void initialize() throws Exception {
        if (vocabularyFormat.equals("skos")) {
            buildSKOS();
        } else {
            buildTEXT();
            buildUSE();
            buildREL();
        }
    }

    /** Sets the stopword list applied during normalization. */
    @Override
    public void setStopwords(Stopwords stopwords) {
        this.stopwords = stopwords;
    }

    /**
     * Builds the vocabulary indexes from SKOS file.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    public void buildSKOS() throws Exception {
        if (debugMode) {
            System.err.println("--- Building the Vocabulary index from the SKOS file...");
        }

        termIdIndex = new HashMap<String, String>();
        idTermIndex = new HashMap<String, String>();
        listsOfSenses = new HashMap<String, Vector<String>>();
        nonDescriptorIndex = new HashMap<String, String>();
        listsOfRelatedTerms = new HashMap<String, Vector<String>>();
        relationIndex = new HashMap<String, String>();
        // start counting from scratch so that re-building does not accumulate
        numConcepts = 0;

        Model model = ModelFactory.createDefaultModel();
        InputStream in = new FileInputStream(skosFile);
        try {
            // if the gzip header is invalid, 'in' still refers to the raw file
            // stream, which the finally clause then closes
            in = new GZIPInputStream(in);
            model.read(new InputStreamReader(in, encoding), "");
        } finally {
            in.close();
        }

        // to create IDs for non-descriptors!
        int count = 0;

        // Iterating over all statements in the SKOS file
        StmtIterator iter = model.listStatements();
        while (iter.hasNext()) {
            Statement stmt = iter.nextStatement();

            // id of the concept (Resource), e.g. "c_4828"
            Resource concept = stmt.getSubject();
            String id = concept.getURI();

            // relation or Property of the concept, e.g. "narrower"
            Property property = stmt.getPredicate();
            String relation = property.getLocalName();

            // value of the property, e.g. c_4828 has narrower term "c_4829"
            RDFNode value = stmt.getObject();
            String name = value.toString();

            if (relation.equals("prefLabel")) {
                // NOTE(review): counted before the language filter, so labels in
                // other languages are included in getSize() - confirm intended.
                numConcepts++;

                String descriptor = labelInLanguage(name);
                if (descriptor == null) {
                    continue;
                }
                String descriptorNormalized = normalizePhrase(descriptor);
                if (descriptorNormalized.length() >= 1) {
                    addSense(descriptorNormalized, id);
                    termIdIndex.put(descriptor.toLowerCase(), id);
                    idTermIndex.put(id, descriptor);
                }
            } else if (relation.equals("altLabel")
                    || relation.equals("hiddenLabel")) {
                String nonDescriptor = labelInLanguage(name);
                if (nonDescriptor == null) {
                    continue;
                }
                addNonDescriptor(count, id, nonDescriptor);
                count++;
            } else if (relation.equals("broader")
                    || relation.equals("narrower")
                    || relation.equals("composite")
                    || relation.equals("compositeOf")
                    || relation.equals("hasTopConcept")
                    || relation.equals("related")) {
                String relatedId = name;
                Vector<String> relatedIds = listsOfRelatedTerms.get(id);
                if (relatedIds == null) {
                    relatedIds = new Vector<String>();
                    listsOfRelatedTerms.put(id, relatedIds);
                }
                relatedIds.add(relatedId);

                relationIndex.put(id + "-" + relatedId, relation);
                // "related" is symmetric, so record the reverse direction as well
                if (relation.equals("related")) {
                    relationIndex.put(relatedId + "-" + id, relation);
                }
            }
        }

        if (debugMode) {
            System.err.println("--- Statistics about the vocabulary: ");
            System.err.println("\t" + termIdIndex.size() + " terms in total");
            System.err.println("\t" + nonDescriptorIndex.size() + " non-descriptive terms");
            System.err.println("\t" + listsOfRelatedTerms.size()
                    + " terms have related terms");
        }
    }

    /**
     * Strips the "@lang" suffix from the string form of a label.
     *
     * @param label string form of the statement's object, e.g. "vanilla@en"
     * @return the label without the suffix if it has no '@' or if the text after
     *         the first '@' equals the configured language; null otherwise
     */
    private String labelInLanguage(String label) {
        int atPosition = label.indexOf('@');
        if (atPosition == -1) {
            return label;
        }
        String tag = label.substring(atPosition + 1);
        return tag.equals(this.language) ? label.substring(0, atPosition) : null;
    }

    /** Appends id to the list of senses stored under the given normalized phrase. */
    private void addSense(String normalizedPhrase, String id) {
        Vector<String> ids = listsOfSenses.get(normalizedPhrase);
        if (ids == null) {
            ids = new Vector<String>();
            listsOfSenses.put(normalizedPhrase, ids);
        }
        ids.add(id);
    }

    /**
     * Registers a non-descriptor (alternative or hidden label) under the
     * synthetic id "d_" + count and links it to its descriptor.
     *
     * @param count running number used to create the non-descriptor's id
     * @param idDescriptor id of the descriptor this label belongs to
     * @param nonDescriptor the label text
     */
    private void addNonDescriptor(int count, String idDescriptor,
            String nonDescriptor) {
        String idNonDescriptor = "d_" + count;

        String normalizedNonDescriptor = normalizePhrase(nonDescriptor);
        if (normalizedNonDescriptor.length() >= 1) {
            addSense(normalizedNonDescriptor, idNonDescriptor);
        }

        termIdIndex.put(nonDescriptor.toLowerCase(), idNonDescriptor);
        idTermIndex.put(idNonDescriptor, nonDescriptor);
        nonDescriptorIndex.put(idNonDescriptor, idDescriptor);
    }

    /**
     * Opens a vocabulary text file with the configured encoding.
     * The caller is responsible for closing the returned reader.
     */
    private BufferedReader openReader(File file) throws Exception {
        FileInputStream in = new FileInputStream(file);
        try {
            return new BufferedReader(new InputStreamReader(in, encoding));
        } catch (Exception e) {
            // e.g. unsupported encoding name: do not leak the file handle
            in.close();
            throw e;
        }
    }

    /**
     * Builds the vocabulary index from the text files.
     * Each line of the .en file is "ID TERM"; lines without a space are skipped.
     *
     * @throws Exception if the file cannot be read
     */
    public void buildTEXT() throws Exception {
        if (debugMode) {
            System.err.println("-- Building the Vocabulary index");
        }

        termIdIndex = new HashMap<String, String>();
        idTermIndex = new HashMap<String, String>();
        // needed by getSenses(), isAmbiguous() and containsNormalizedEntry()
        listsOfSenses = new HashMap<String, Vector<String>>();

        BufferedReader br = openReader(enFile);
        try {
            String readline;
            while ((readline = br.readLine()) != null) {
                int separator = readline.indexOf(' ');
                if (separator < 0) {
                    // no "ID TERM" structure (e.g. blank line): nothing to index
                    continue;
                }
                String term = readline.substring(separator + 1);
                String avterm = normalizePhrase(term);
                if (avterm.length() >= 1) {
                    String id = readline.substring(0, separator);
                    termIdIndex.put(avterm, id);
                    idTermIndex.put(id, term);
                    addSense(avterm, id);
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Builds the vocabulary index with descriptors/non-descriptors relations.
     * Each line of the .use file is "ID_NON-DESCR \t ID_DESCRIPTOR";
     * lines without a tab are skipped.
     *
     * @throws Exception if the file cannot be read
     */
    public void buildUSE() throws Exception {
        nonDescriptorIndex = new HashMap<String, String>();

        BufferedReader br = openReader(useFile);
        try {
            String readline;
            while ((readline = br.readLine()) != null) {
                String[] entry = readline.split("\t");
                if (entry.length < 2) {
                    continue;
                }
                // if more than one descriptors for
                // one non-descriptors are used, ignore it!
                // probably just related terms (cf. latest edition of Agrovoc)
                if (entry[1].indexOf(" ") == -1) {
                    nonDescriptorIndex.put(entry[0], entry[1]);
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Builds the vocabulary index with semantically related terms.
     * Each line of the .rel file is "ID \t RELATED_ID1 RELATED_ID2 ...";
     * lines without a tab are skipped.
     *
     * @throws Exception if the file cannot be read
     */
    public void buildREL() throws Exception {
        if (debugMode) {
            System.err
                    .println("-- Building the Vocabulary index with related pairs");
        }

        listsOfRelatedTerms = new HashMap<String, Vector<String>>();

        BufferedReader br = openReader(relFile);
        try {
            String readline;
            while ((readline = br.readLine()) != null) {
                String[] entry = readline.split("\t");
                if (entry.length < 2) {
                    continue;
                }
                Vector<String> relatedTerms = new Vector<String>();
                for (String relatedId : entry[1].split(" ")) {
                    relatedTerms.add(relatedId);
                }
                listsOfRelatedTerms.put(entry[0], relatedTerms);
            }
        } finally {
            br.close();
        }
    }

    /**
     * Looks up the id stored for the lower-cased phrase. If that id belongs to
     * a non-descriptor, the id of its descriptor is returned instead.
     *
     * @return the id, or null if the phrase is not in the index
     */
    @Override
    public String getID(String phrase) {
        String id = termIdIndex.get(phrase.toLowerCase());
        if (id != null && nonDescriptorIndex.containsKey(id)) {
            id = nonDescriptorIndex.get(id);
        }
        return id;
    }

    /**
     * @return the term stored for the given id, or null if the id is unknown
     */
    @Override
    public String getTerm(String id) {
        return idTermIndex.get(id);
    }

    /**
     * Checks whether a normalized phrase
     * is a valid vocabulary term.
     * @param phrase
     * @return true if phrase is in the vocabulary
     */
    public boolean containsNormalizedEntry(String phrase) {
        return listsOfSenses.containsKey(normalizePhrase(phrase));
    }

    /**
     * Returns true if a phrase has more than one senses
     * @param phrase
     * @return false if a phrase is unknown or has only one sense
     */
    public boolean isAmbiguous(String phrase) {
        Vector<String> meanings = listsOfSenses.get(normalizePhrase(phrase));
        return meanings != null && meanings.size() != 1;
    }

    /**
     * Returns the ids of all senses of a phrase; senses that are
     * non-descriptors are replaced by the id of their descriptor.
     *
     * @return the list of sense ids, empty if the phrase is unknown
     */
    @Override
    public Vector<String> getSenses(String phrase) {
        Vector<String> senses = new Vector<String>();
        Vector<String> storedIds = listsOfSenses.get(normalizePhrase(phrase));
        if (storedIds != null) {
            for (String senseId : storedIds) {
                // retrieve a descriptor if this sense is a non-descriptor
                if (nonDescriptorIndex.containsKey(senseId)) {
                    senseId = nonDescriptorIndex.get(senseId);
                }
                senses.add(senseId);
            }
        }
        return senses;
    }

    /**
     * @return the stored list of ids related to the given id,
     *         or null if there is no entry for it
     */
    @Override
    public Vector<String> getRelated(String id) {
        return listsOfRelatedTerms.get(id);
    }

    /**
     * Returns the ids that are connected to the given id by the given relation.
     * Relation types are only recorded when reading SKOS, so for the text
     * format the result is always empty.
     *
     * @return the matching related ids, never null
     */
    @Override
    public Vector<String> getRelated(String id, String relation) {
        Vector<String> related = new Vector<String>();
        Vector<String> allRelated = listsOfRelatedTerms.get(id);
        if (allRelated == null || relationIndex == null) {
            return related;
        }
        for (String relatedId : allRelated) {
            String rel = relationIndex.get(id + "-" + relatedId);
            if (rel != null && rel.equals(relation)) {
                related.add(relatedId);
            }
        }
        return related;
    }

    /**
     * Normalizes a phrase for lookup in the vocabulary: optionally lower-cases
     * it, cuts it at the first '/' or '(', turns '-', '&' and '.' into spaces,
     * drops '*' and ':', collapses repeated spaces and finally, if reordering,
     * stopwords or a stemmer are configured, converts it to a pseudo phrase.
     *
     * @return the normalized phrase; if the pseudo phrase is empty, the cleaned
     *         (but not trimmed) phrase is returned instead
     */
    public String normalizePhrase(String phrase) {
        if (toLowerCase) {
            phrase = phrase.toLowerCase();
        }

        StringBuilder result = new StringBuilder();
        char prev = ' ';
        for (int i = 0; i < phrase.length(); i++) {
            char c = phrase.charAt(i);

            // we ignore everything after the "/" symbol and everything in brackets
            // e.g. Monocytes/*immunology/microbiology -> monocytes
            // e.g. Vanilla (Spice) -> vanilla
            if (c == '/' || c == '(') {
                break;
            }

            // NOTE(review): the original tested '.' twice; a second character
            // (perhaps ',') may have been intended. Left as is to keep the
            // normalization output unchanged.
            if (c == '-' || c == '&' || c == '.') {
                c = ' ';
            }

            if (c == '*' || c == ':') {
                prev = c;
                continue;
            }

            // skip a space that directly follows another space
            if (c != ' ' || prev != ' ') {
                result.append(c);
            }
            prev = c;
        }

        phrase = result.toString().trim();

        if (reorder || stopwords != null || stemmer != null) {
            phrase = pseudoPhrase(phrase);
        }
        if (phrase.equals("")) {
            // to prevent cases where the term is a stop word (e.g. Back).
            return result.toString();
        }
        return phrase;
    }

    /**
     * Generates the pseudo phrase from a string.
     * A pseudo phrase is a version of a phrase
     * that only contains non-stopwords,
     * which are stemmed and sorted into alphabetical order.
     * Each word is also cut at its first apostrophe.
     */
    public String pseudoPhrase(String str) {
        String[] words = str.split(" ");
        if (reorder) {
            Arrays.sort(words);
        }

        StringBuilder result = new StringBuilder();
        for (String word : words) {
            if (stopwords != null && stopwords.isStopword(word)) {
                continue;
            }

            int apostr = word.indexOf('\'');
            if (apostr != -1) {
                word = word.substring(0, apostr);
            }

            if (stemmer != null) {
                word = stemmer.stem(word);
            }
            result.append(word).append(' ');
        }
        return result.toString().trim();
    }

    /**
     * Looks the label up in the term index exactly as given (no lower-casing
     * and no non-descriptor resolution). Note that {@link #buildSKOS()} stores
     * its keys lower-cased and {@link #buildTEXT()} stores normalized phrases.
     *
     * @return the stored id, or null if there is no entry for this exact key
     */
    @Override
    public String getIDFromPrefLabel(String prefLabel) {
        return termIdIndex.get(prefLabel);
    }
}