package kea.vocab;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Vector;
import kea.stemmers.SpanishStemmerSB;
import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
/**
* Builds an index with the content of the controlled vocabulary.
* Accepts vocabularies as rdf files (SKOS format) and in plain text format:
* vocabulary_name.en (with "ID TERM" per line) - descriptors & non-descriptors
* vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
* vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line)
* See KEA's homepage for more details.
* @author Olena Medelyan
*/
public class VocabularyJena extends Vocabulary {

    private static final long serialVersionUID = 1L;

    // NOTE(review): these mutable public static File fields are global state;
    // kept public/static for backward compatibility with existing callers.

    /** Location of the rdf version of the controlled vocabulary
     * (must be in SKOS format). */
    public static File SKOS;

    /** Location of the vocabulary's *.en file
     * containing all terms of the vocabulary and their ids ("ID TERM" per line). */
    public static File EN;

    /** Location of the vocabulary's *.use file
     * mapping non-descriptor ids to descriptor ids ("ID_NON-DESCR \t ID_DESCRIPTOR" per line). */
    public static File USE;

    /** Location of the vocabulary's *.rel file
     * listing semantically related ids per descriptor ("ID \t RELATED_ID1 RELATED_ID2 ..." per line). */
    public static File REL;

    // If the type of the semantic relation is required later, this could be a
    // file containing that information:
    // public static File RT;

    /** Which vocabulary format has been chosen: true if SKOS, false if text. */
    private boolean useSkos;

    /** <i>Vocabulary</i> index: normalized term (pseudo phrase) => id. */
    private HashMap<String, String> VocabularyEN = null;

    /** <i>Vocabulary</i> reverse index: id => original term. */
    private HashMap<String, String> VocabularyENrev = null;

    /** <i>Vocabulary</i> non-descriptor id => descriptor id. */
    private HashMap<String, String> VocabularyUSE = null;

    /** <i>Vocabulary</i> related terms: id => ids of related terms. */
    private HashMap<String, Vector<String>> VocabularyREL = null;

    /** Relation names keyed by "id-relatedId" (e.g. "broader", "related"). */
    private HashMap<String, String> VocabularyRT = null;

    /** The document language, used to filter SKOS labels by their "@lang" tag. */
    private String m_language;

    /** The default stemmer to be used. */
    private Stemmer m_Stemmer;

    /** The list of stop words to be used. */
    private Stopwords m_Stopwords;

    /**
     * Vocabulary constructor.
     *
     * Given the name of the vocabulary and the format, it first checks whether
     * the VOCABULARIES directory contains the specified files:
     * - vocabularyName.rdf if skos format is selected
     * - or a set of 3 flat files starting with vocabularyName and with extensions
     *   .en (id term)
     *   .use (non-descriptor \t descriptor)
     *   .rel (id \t related_id1 related_id2 ...)
     * If a required file is missing, an error is printed and the JVM exits.
     *
     * @param vocabularyName The name of the vocabulary file (before extension).
     * @param vocabularyFormat The format of the vocabulary ("skos" or "text").
     * @param documentLanguage The language of the documents (e.g. "en").
     */
    public VocabularyJena(String vocabularyName, String vocabularyFormat, String documentLanguage) {
        super(documentLanguage);
        // BUG FIX: m_language was never assigned, so the "@lang" filter in
        // buildSKOS() could never match and all language-tagged labels were
        // silently skipped. Remember the document language here.
        m_language = documentLanguage;
        if (vocabularyFormat.equals("skos")) {
            SKOS = new File("VOCABULARIES/" + vocabularyName + ".rdf");
            if (!SKOS.exists()) {
                System.err.println("File VOCABULARIES/" + vocabularyName + ".rdf does not exist.");
                System.exit(1);
            }
            useSkos = true;
        } else if (vocabularyFormat.equals("text")) {
            EN = new File("VOCABULARIES/" + vocabularyName + ".en");
            USE = new File("VOCABULARIES/" + vocabularyName + ".use");
            REL = new File("VOCABULARIES/" + vocabularyName + ".rel");
            // RT = new File("VOCABULARIES/" + vocabularyName + ".pairs.p1");
            if (!EN.exists()) {
                System.err.println("File VOCABULARIES/" + vocabularyName + ".en does not exist.");
                System.exit(1);
            }
            if (!USE.exists()) {
                // BUG FIX: the message previously named a ".list.use" file,
                // but the file actually checked is ".use".
                System.err.println("File VOCABULARIES/" + vocabularyName + ".use does not exist.");
                System.exit(1);
            }
            if (!REL.exists()) {
                // BUG FIX: the message previously named a ".rel.p1" file,
                // but the file actually checked is ".rel".
                System.err.println("File VOCABULARIES/" + vocabularyName + ".rel does not exist.");
                System.exit(1);
            }
            // if (!RT.exists()) { ... }  // see the commented-out RT field above
        }
    }

    /**
     * Starts initialization of the vocabulary: builds the index from the SKOS
     * file or from the flat text files, depending on the chosen format.
     * Exits the JVM on failure.
     */
    public void initialize() {
        System.err.println("-- Loading the Index...");
        try {
            if (useSkos) {
                buildSKOS();
            } else {
                build();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Set the Stemmer value.
     * @param newStemmer The new Stemmer value.
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * Set the Stopwords value.
     * @param newM_Stopwords The new Stopwords value.
     */
    public void setStopwords(Stopwords newM_Stopwords) {
        this.m_Stopwords = newM_Stopwords;
    }

    /**
     * Builds all vocabulary indexes (terms, non-descriptors, related terms and
     * relation types) from the SKOS rdf file in a single pass over its statements.
     *
     * @throws Exception declared for interface compatibility; I/O and parse
     *         errors are caught and reported via printStackTrace.
     */
    public void buildSKOS() throws Exception {
        System.err.println("-- Building the Vocabulary index from SKOS file");
        VocabularyEN = new HashMap<String, String>();
        VocabularyENrev = new HashMap<String, String>();
        VocabularyUSE = new HashMap<String, String>();
        VocabularyREL = new HashMap<String, Vector<String>>();
        VocabularyRT = new HashMap<String, String>();
        // create an empty model
        Model model = ModelFactory.createDefaultModel();
        InputStreamReader reader = null;
        try {
            reader = new InputStreamReader(new FileInputStream(SKOS), "UTF-8");
            model.read(reader, "");
            // running counter for the synthetic non-descriptor ids ("d_<n>")
            int count = 1;
            // Iterating over all statements in the SKOS file
            StmtIterator iter = model.listStatements();
            while (iter.hasNext()) {
                Statement stmt = iter.nextStatement();
                // id of the concept (Resource), e.g. "c_4828"
                Resource concept = stmt.getSubject();
                String id = concept.getURI();
                // relation or Property of the concept, e.g. "narrower"
                Property relation = stmt.getPredicate();
                String rel = relation.getLocalName();
                // value of the property, e.g. c_4828 has narrower term "c_4829"
                RDFNode value = stmt.getObject();
                String val = value.toString();
                if (rel.equals("prefLabel")) {
                    // preferred label => this is a descriptor
                    String descriptor = extractLabel(val);
                    if (descriptor == null) {
                        continue; // label tagged with a different language
                    }
                    String avterm = pseudoPhrase(descriptor);
                    if (avterm.equals("")) {
                        avterm = descriptor;
                    }
                    // NOTE(review): threshold here is > 1, while non-descriptors
                    // and the text format use > 2 — confirm whether intentional.
                    if (avterm.length() > 1) {
                        VocabularyEN.put(avterm, id);
                        VocabularyENrev.put(id, descriptor);
                    }
                } else if (rel.equals("altLabel") || rel.equals("hiddenLabel")) {
                    // alternative/hidden label => a non-descriptor of this concept
                    String non_descriptor = extractLabel(val);
                    if (non_descriptor == null) {
                        continue; // label tagged with a different language
                    }
                    addNonDescriptor(count, id, non_descriptor);
                    count++;
                } else if (rel.equals("broader")
                        || rel.equals("narrower")
                        || rel.equals("composite")
                        || rel.equals("compositeOf")
                        || rel.equals("hasTopConcept")
                        || rel.equals("related")) {
                    // semantic relation: id => id_related
                    String id_related = val;
                    Vector<String> rt = VocabularyREL.get(id);
                    if (rt == null) {
                        rt = new Vector<String>();
                        VocabularyREL.put(id, rt);
                    }
                    rt.add(id_related);
                    // remember the relation type for the ordered pair
                    VocabularyRT.put(id + "-" + id_related, rel);
                    if (rel.equals("related")) {
                        // "related" is symmetric, so store both directions
                        VocabularyRT.put(id_related + "-" + id, rel);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // best-effort close; nothing useful to do on failure
                }
            }
        }
    }

    /**
     * Extracts the label text from an RDF literal value, honoring an optional
     * "@lang" suffix.
     *
     * @param val the literal's string form, e.g. "maize@en" or "maize"
     * @return the label if untagged or tagged with the document language,
     *         null if tagged with a different language
     */
    private String extractLabel(String val) {
        if (val.contains("@")) {
            String[] val_components = val.split("@");
            if (val_components[1].equals(m_language)) {
                return val_components[0];
            }
            return null;
        }
        return val;
    }

    /**
     * Registers a non-descriptor (alternative/hidden label) of a descriptor.
     * A synthetic id "d_&lt;count&gt;" is created for it; the label is added to
     * the term indexes and mapped to its descriptor's id.
     *
     * @param count running counter used to build the synthetic id
     * @param id_descriptor id of the descriptor the label belongs to
     * @param non_descriptor the non-descriptor label itself
     */
    private void addNonDescriptor(int count, String id_descriptor, String non_descriptor) {
        String id_non_descriptor = "d_" + count;
        // (the original incremented the local 'count' here; ints are passed by
        // value in Java, so that increment was dead code and has been removed)
        String avterm = pseudoPhrase(non_descriptor);
        if (avterm.length() > 2) {
            VocabularyEN.put(avterm, id_non_descriptor);
            VocabularyENrev.put(id_non_descriptor, non_descriptor);
        }
        VocabularyUSE.put(id_non_descriptor, id_descriptor);
    }

    /**
     * Joins the given words with single spaces, skipping the word at index i
     * and all stop words.
     *
     * Note: a separator is appended after every kept word except the one at the
     * last array position, so the result may carry a trailing space when later
     * words are skipped — this quirk is preserved for compatibility.
     *
     * @param words the words of a phrase
     * @param i index of the word to remove
     * @return the remaining words joined with spaces
     */
    public String remove(String[] words, int i) {
        StringBuilder result = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            if ((j != i) && (!m_Stopwords.isStopword(words[j]))) {
                result.append(words[j]);
                if ((j + 1) != words.length) {
                    result.append(' ');
                }
            }
        }
        return result.toString();
    }

    /**
     * Builds the vocabulary index from the *.en text file ("ID TERM" per line).
     *
     * @throws Exception declared for interface compatibility; I/O errors are
     *         caught and reported via printStackTrace.
     */
    public void build() throws Exception {
        System.err.println("-- Building the Vocabulary index");
        VocabularyEN = new HashMap<String, String>();
        VocabularyENrev = new HashMap<String, String>();
        BufferedReader br = null;
        try {
            // NOTE(review): reads with the platform default charset (unlike the
            // UTF-8 SKOS reader) — confirm the expected encoding of *.en files.
            br = new BufferedReader(new InputStreamReader(new FileInputStream(EN)));
            String readline;
            while ((readline = br.readLine()) != null) {
                int i = readline.indexOf(' ');
                String term = readline.substring(i + 1);
                String avterm = pseudoPhrase(term);
                // only index terms whose normalized form is long enough
                if (avterm.length() > 2) {
                    String id = readline.substring(0, i);
                    VocabularyEN.put(avterm, id);
                    VocabularyENrev.put(id, term);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
    }

    /**
     * Builds the vocabulary index with descriptor/non-descriptor relations from
     * the *.use file. No-op in SKOS mode (buildSKOS fills this index already).
     *
     * @throws Exception declared for interface compatibility; I/O errors are
     *         caught and reported via printStackTrace.
     */
    public void buildUSE() throws Exception {
        if (!useSkos) {
            VocabularyUSE = new HashMap<String, String>();
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(new FileInputStream(USE)));
                String readline;
                while ((readline = br.readLine()) != null) {
                    String[] entry = readline.split("\t");
                    // if more than one descriptor is listed for a
                    // non-descriptor, ignore the line — those are probably just
                    // related terms (cf. latest edition of Agrovoc)
                    if (entry[1].indexOf(" ") == -1) {
                        VocabularyUSE.put(entry[0], entry[1]);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                closeQuietly(br);
            }
        }
    }

    /**
     * Builds the vocabulary index with semantically related terms from the
     * *.rel file. No-op in SKOS mode (buildSKOS fills this index already).
     *
     * @throws Exception declared for interface compatibility; I/O errors are
     *         caught and reported via printStackTrace.
     */
    public void buildREL() throws Exception {
        if (!useSkos) {
            System.err.println("-- Building the Vocabulary index with related pairs");
            VocabularyREL = new HashMap<String, Vector<String>>();
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(new FileInputStream(REL)));
                String readline;
                while ((readline = br.readLine()) != null) {
                    String[] entry = readline.split("\t");
                    String[] temp = entry[1].split(" ");
                    Vector<String> rt = new Vector<String>();
                    for (int i = 0; i < temp.length; i++) {
                        rt.add(temp[i]);
                    }
                    VocabularyREL.put(entry[0], rt);
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                closeQuietly(br);
            }
        }
    }

    /** Closes a reader, swallowing any failure (best-effort cleanup). */
    private static void closeQuietly(BufferedReader br) {
        if (br != null) {
            try {
                br.close();
            } catch (Exception ignored) {
                // nothing useful to do on close failure
            }
        }
    }

    /**
     * Placeholder: would build the index of relation types from a *.pairs file.
     * Kept as a no-op for interface compatibility; might become useful when the
     * kind of relation between two terms matters (in SKOS mode the relation
     * types are already collected in VocabularyRT by buildSKOS()).
     */
    public void buildRT() throws Exception {
    }

    /**
     * Checks whether a normalized version of a phrase (pseudo phrase)
     * is a valid vocabulary term.
     *
     * @param phrase a normalized phrase (pseudo phrase)
     * @return true if the phrase is in the vocabulary
     */
    public boolean containsEntry(String phrase) {
        return VocabularyEN.containsKey(phrase);
    }

    /**
     * Given a phrase, returns its id in the vocabulary; non-descriptor ids are
     * resolved to their descriptor's id when that mapping is available.
     *
     * @param phrase a phrase from a document
     * @return id of the phrase in the vocabulary index, or null if absent
     */
    public String getID(String phrase) {
        String pseudo = pseudoPhrase(phrase);
        String id = null;
        if (pseudo != null) {
            id = VocabularyEN.get(pseudo);
            // guard: in text mode VocabularyUSE is null until buildUSE() runs
            if (VocabularyUSE != null && VocabularyUSE.containsKey(id)) {
                id = VocabularyUSE.get(id);
            }
        }
        return id;
    }

    /**
     * Given an id, gets the original version of the vocabulary term.
     * @param id a term id
     * @return original version of the vocabulary term, or null if unknown
     */
    public String getOrig(String id) {
        return VocabularyENrev.get(id);
    }

    /**
     * Given the id of a non-descriptor, returns the id of the corresponding descriptor.
     * @param id id of the non-descriptor
     * @return id of the descriptor, or null if unknown
     */
    public String getDescriptor(String id) {
        return VocabularyUSE.get(id);
    }

    /**
     * Given the id of a term, returns the ids of the terms related to it.
     * @param id a term id
     * @return a vector with ids related to the input id, or null if none recorded
     */
    public Vector<String> getRelated(String id) {
        return VocabularyREL.get(id);
    }

    /**
     * Given the id of a term, gets the ids of all terms that are semantically
     * related to it by a specific relation.
     *
     * @param id a term id
     * @param relation the relation name (e.g. "broader", "related")
     * @return a vector with ids related to the input id by the given relation
     */
    public Vector<String> getRelated(String id, String relation) {
        Vector<String> related = new Vector<String>();
        Vector<String> all_related = VocabularyREL.get(id);
        if (all_related != null) {
            for (int d = 0; d < all_related.size(); d++) {
                String rel_id = all_related.elementAt(d);
                String rel = VocabularyRT.get(id + "-" + rel_id);
                if (rel == null) {
                    // relation pair present in REL but its type was never recorded
                    System.err.println("Problem with " + getOrig(id) + " and " + getOrig(rel_id));
                } else if (rel.equals(relation)) {
                    related.add(rel_id);
                }
            }
        }
        return related;
    }
}