package kea.vocab;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.Vector;

import org.openrdf.concepts.skos.core.Concept;
import org.openrdf.elmo.sesame.SesameManager;

import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;

/**
 * Builds an index with the content of the controlled vocabulary. Accepts
 * vocabularies as rdf files (SKOS format) and in plain text format:
 * vocabulary_name.en (with "ID TERM" per line) - descriptors &amp; non-descriptors
 * vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
 * vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line) See
 * KEA's homepage for more details.
 *
 * @author Olena Medelyan
 * Modifications by Jose R. Perez-Aguera
 */
public class VocabularySesame extends Vocabulary {

	/**
	 * Location of the rdf version of the controlled vocabulary it needs to be
	 * in the SKOS format!
	 */
	public static File SKOS;

	/**
	 * Location of the vocabulary's *.en file containing all terms of the
	 * vocabularies and their ids.
	 */
	public static File EN;

	/**
	 * Location of the vocabulary's *.use file containing ids of non-descriptor
	 * with the corresponding ids of descriptors.
	 */
	public static File USE;

	/**
	 * Location of the vocabulary's *.rel file containing semantically related
	 * terms for each descriptor in the vocabulary.
	 */
	public static File REL;

	// If the type of the semantic relation is required later, a further file
	// (e.g. RT) could carry that information; see the removed buildRT() stub.

	/**
	 * Boolean describing which vocabulary format has been chosen: true if
	 * SKOS, false if text.
	 */
	private boolean useSkos;

	/** <i>Vocabulary</i> index: pseudo phrase -&gt; term id. */
	private HashMap<String, String> VocabularyEN = null;

	/** <i>Vocabulary</i> reverse index: term id -&gt; original term. */
	private HashMap<String, String> VocabularyENrev = null;

	/** <i>Vocabulary</i> non-descriptor id -&gt; descriptor id. */
	private HashMap<String, String> VocabularyUSE = null;

	/** <i>Vocabulary</i> related terms: id -&gt; vector of related ids. */
	private HashMap<String, Vector<String>> VocabularyREL = null;

	/** Relation types: "id1-id2" -&gt; "broader" | "narrower" | "related". */
	private HashMap<String, String> VocabularyRT = null;

	/** Sesame store access used when building from SKOS. */
	private SesameManager manager;

	/**
	 * The document language.
	 * NOTE(review): currently unused — the language filtering of labels is
	 * disabled (it was commented out in the SKOS build); kept for a future
	 * re-activation of that filter.
	 */
	private String m_language;

	/** The default stemmer to be used. */
	private Stemmer m_Stemmer;

	/** The list of stop words to be used. */
	private Stopwords m_Stopwords;

	/**
	 * Vocabulary constructor.
	 *
	 * Given the name of the vocabulary and the format it first checks whether
	 * the VOCABULARIES directory contains the specified files: -
	 * vocabularyName.rdf if skos format is selected - or a set of 3 flat files
	 * starting with vocabularyName and with extensions .en (id term) .use
	 * (non-descriptor \t descriptor) .rel (id \t related_id1 related_id2 ...)
	 * If the required files exist, the vocabulary index is built.
	 *
	 * NOTE(review): the rdf-file existence check is currently disabled — this
	 * subclass reads SKOS data from the Sesame store instead of a file.
	 *
	 * @param vocabularyName
	 *            The name of the vocabulary file (before extension).
	 * @param vocabularyFormat
	 *            The format of the vocabulary (skos or text).
	 */
	public VocabularySesame(String vocabularyName, String vocabularyFormat,
			String documentLanguage, SesameManager manager) {
		super(documentLanguage);
		this.manager = manager;
		if (vocabularyFormat.equals("skos")) {
			useSkos = true;
		}
	}

	/**
	 * Starts initialization of the vocabulary: builds the index either from
	 * the SKOS Sesame store or from the flat text files. Exits the JVM on
	 * failure (kept from the original command-line-tool behaviour).
	 */
	public void initialize() {
		System.out.println("-- Loading the Index...");
		try {
			if (useSkos) {
				buildSKOS();
			} else {
				build();
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Set the Stemmer value.
	 *
	 * @param newStemmer
	 *            The new Stemmer value.
	 */
	public void setStemmer(Stemmer newStemmer) {
		this.m_Stemmer = newStemmer;
	}

	/**
	 * Set the M_Stopwords value.
	 *
	 * @param newM_Stopwords
	 *            The new M_Stopwords value.
	 */
	public void setStopwords(Stopwords newM_Stopwords) {
		this.m_Stopwords = newM_Stopwords;
	}

	/**
	 * Builds the vocabulary indexes from the SKOS data held in the Sesame
	 * store: prefLabels become descriptors, altLabels become non-descriptors,
	 * and broader/narrower/related links fill the relation indexes.
	 */
	public void buildSKOS() throws Exception {
		System.out.println("-- Building the Vocabulary index from SKOS Sesame Store");
		VocabularyEN = new HashMap<String, String>();
		VocabularyENrev = new HashMap<String, String>();
		VocabularyUSE = new HashMap<String, String>();
		VocabularyREL = new HashMap<String, Vector<String>>();
		VocabularyRT = new HashMap<String, String>();
		try {
			// counter for synthetic ids ("d_1", "d_2", ...) of non-descriptors
			int count = 1;
			for (Concept concept : manager.findAll(Concept.class)) {
				// id of the concept (Resource), e.g. "c_4828"
				String id = concept.getQName().getNamespaceURI()
						+ concept.getQName().getLocalPart();

				// The prefLabel is the descriptor (preferred term).
				// NOTE(review): language filtering on "label@lang" values was
				// disabled in the original code and remains so.
				String descriptor = concept.getSkosPrefLabel();
				String avterm = pseudoPhrase(descriptor);
				if (avterm == null) {
					avterm = descriptor;
				}
				if (avterm.length() > 1) {
					VocabularyEN.put(avterm, id);
					VocabularyENrev.put(id, descriptor);
				}

				// altLabels are non-descriptors pointing at this descriptor
				Set<String> altLabels = concept.getSkosAltLabels();
				for (String non_descriptor : altLabels) {
					addNonDescriptor(count, id, non_descriptor);
					count++;
				}

				// broader terms: recorded in REL and typed in RT
				Set<Concept> broaders = concept.getSkosBroaders();
				for (Concept b : broaders) {
					String id_broader = b.getQName().getNamespaceURI()
							+ b.getQName().getLocalPart();
					addTypedRelation(id, id_broader, "broader");
				}

				// narrower terms: recorded in REL and typed in RT
				Set<Concept> narrowers = concept.getSkosNarrowers();
				for (Concept n : narrowers) {
					String id_narrower = n.getQName().getNamespaceURI()
							+ n.getQName().getLocalPart();
					addTypedRelation(id, id_narrower, "narrower");
				}

				// related terms: only typed in RT (both directions), not
				// added to REL — kept exactly as in the original
				Set<Concept> related = concept.getSkosRelated();
				for (Concept r : related) {
					String id_related = r.getQName().getNamespaceURI()
							+ r.getQName().getLocalPart();
					VocabularyRT.put(id + "-" + id_related, "related");
					VocabularyRT.put(id_related + "-" + id, "related");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Appends relatedId to the REL vector of id and records the relation type
	 * in RT under the key "id-relatedId".
	 */
	private void addTypedRelation(String id, String relatedId, String relation) {
		Vector<String> rt = VocabularyREL.get(id);
		if (rt == null) {
			rt = new Vector<String>();
			VocabularyREL.put(id, rt);
		}
		rt.add(relatedId);
		VocabularyRT.put(id + "-" + relatedId, relation);
	}

	/**
	 * Adds a non-descriptor under a synthetic id "d_&lt;count&gt;" and maps
	 * that id to the id of its descriptor.
	 *
	 * NOTE(review): the original also incremented the count parameter locally;
	 * Java passes int by value so that had no effect — the caller manages the
	 * counter. The dead increment has been removed.
	 */
	private void addNonDescriptor(int count, String id_descriptor,
			String non_descriptor) {
		String id_non_descriptor = "d_" + count;
		String avterm = pseudoPhrase(non_descriptor);
		if (avterm.length() > 2) {
			VocabularyEN.put(avterm, id_non_descriptor);
			VocabularyENrev.put(id_non_descriptor, non_descriptor);
		}
		VocabularyUSE.put(id_non_descriptor, id_descriptor);
	}

	/**
	 * Returns the phrase rebuilt without the i-th word and without stopwords.
	 * NOTE(review): a separator space is appended after every kept word except
	 * the one at the last array index — quirky (can leave no space if the last
	 * word is kept after skipped ones) but preserved as-is.
	 */
	public String remove(String[] words, int i) {
		String result = "";
		for (int j = 0; j < words.length; j++) {
			if ((j != i) && (!m_Stopwords.isStopword(words[j]))) {
				result = result + words[j];
				if ((j + 1) != words.length) {
					result = result + " ";
				}
			}
		}
		return result;
	}

	/**
	 * Builds the vocabulary index from the text files ("ID TERM" per line in
	 * the .en file).
	 */
	public void build() throws Exception {
		System.out.println("-- Building the Vocabulary index");
		VocabularyEN = new HashMap<String, String>();
		VocabularyENrev = new HashMap<String, String>();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(EN)));
			String readline;
			while ((readline = br.readLine()) != null) {
				int i = readline.indexOf(' ');
				String term = readline.substring(i + 1);
				String avterm = pseudoPhrase(term);
				if (avterm.length() > 2) {
					String id = readline.substring(0, i);
					VocabularyEN.put(avterm, id);
					VocabularyENrev.put(id, term);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(br); // fix: the reader was never closed
		}
	}

	/**
	 * Builds the vocabulary index with descriptors/non-descriptors relations
	 * from the .use file (only relevant for the text format).
	 */
	public void buildUSE() throws Exception {
		if (!useSkos) {
			VocabularyUSE = new HashMap<String, String>();
			BufferedReader br = null;
			try {
				br = new BufferedReader(new InputStreamReader(new FileInputStream(USE)));
				String readline;
				while ((readline = br.readLine()) != null) {
					String[] entry = split(readline, "\t");
					// if more than one descriptors for
					// one non-descriptors are used, ignore it!
					// probably just related terms (cf. latest edition of
					// Agrovoc)
					if ((entry[1].indexOf(" ")) == -1) {
						VocabularyUSE.put(entry[0], entry[1]);
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				closeQuietly(br); // fix: the reader was never closed
			}
		}
	}

	/**
	 * Builds the vocabulary index with semantically related terms from the
	 * .rel file (only relevant for the text format).
	 */
	public void buildREL() throws Exception {
		if (!useSkos) {
			System.err
					.println("-- Building the Vocabulary index with related pairs");
			VocabularyREL = new HashMap<String, Vector<String>>();
			BufferedReader br = null;
			try {
				br = new BufferedReader(new InputStreamReader(new FileInputStream(REL)));
				String readline;
				while ((readline = br.readLine()) != null) {
					String[] entry = split(readline, "\t");
					String[] temp = split(entry[1], " ");
					Vector<String> rt = new Vector<String>();
					for (int i = 0; i < temp.length; i++) {
						rt.add(temp[i]);
					}
					VocabularyREL.put(entry[0], rt);
				}
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				closeQuietly(br); // fix: the reader was never closed
			}
		}
	}

	/** Closes a reader, swallowing close-time errors (best effort). */
	private static void closeQuietly(BufferedReader br) {
		if (br != null) {
			try {
				br.close();
			} catch (Exception ignored) {
				// nothing sensible to do if close itself fails
			}
		}
	}

	// A buildRT() method reading a ".pairs" file into VocabularyRT existed
	// here as commented-out code; it might be useful later when the kind of
	// relation (or whether two terms are related at all) becomes important.

	/**
	 * Checks whether a normalized version of a phrase (pseudo phrase) is a
	 * valid vocabulary term.
	 *
	 * @param phrase
	 * @return true if phrase is in the vocabulary
	 */
	public boolean containsEntry(String phrase) {
		return VocabularyEN.containsKey(phrase);
	}

	/**
	 * Given a phrase returns its id in the vocabulary; a non-descriptor id is
	 * resolved to the id of its descriptor.
	 *
	 * @param phrase
	 * @return id of the phrase in the vocabulary index, or null
	 */
	public String getID(String phrase) {
		String pseudo = pseudoPhrase(phrase);
		String id = null;
		if (pseudo != null) {
			id = VocabularyEN.get(pseudo);
			// guard: USE index may not have been built for the text format
			if (VocabularyUSE != null && VocabularyUSE.containsKey(id)) {
				id = VocabularyUSE.get(id);
			}
		}
		return id;
	}

	/**
	 * Given id, gets the original version of vocabulary term.
	 *
	 * @param id
	 * @return original version of the vocabulary term
	 */
	public String getOrig(String id) {
		return VocabularyENrev.get(id);
	}

	/**
	 * Given id of the non-descriptor returns the id of the corresponding
	 * descriptor.
	 *
	 * @param id
	 *            of the non-descriptor
	 * @return id of the descriptor
	 */
	public String getDescriptor(String id) {
		return VocabularyUSE.get(id);
	}

	/**
	 * Given id of a term returns the list with ids of terms related to this
	 * term.
	 *
	 * @param id
	 * @return a vector with ids related to the input id, or null
	 */
	public Vector getRelated(String id) {
		return VocabularyREL.get(id);
	}

	/**
	 * Given an ID of a term gets the list of all IDs of terms that are
	 * semantically related to the given term with a specific relation.
	 *
	 * @param id
	 *            , relation
	 * @return a vector with ids related to the input id by a specified relation
	 */
	public Vector getRelated(String id, String relation) {
		Vector<String> related = new Vector<String>();
		Vector<String> all_related = VocabularyREL.get(id);
		if (all_related != null) {
			for (int d = 0; d < all_related.size(); d++) {
				String rel_id = all_related.elementAt(d);
				String rel = VocabularyRT.get(id + "-" + rel_id);
				if (rel != null) {
					if (rel.equals(relation)) {
						related.add(rel_id);
					}
				} else {
					System.err.println("Problem with " + getOrig(id) + " and "
							+ getOrig(rel_id));
				}
			}
		}
		return related;
	}

	/**
	 * Splits a string str at given character sequence (separator) into an
	 * array.
	 *
	 * NOTE(review): the comparison is per single character, so only
	 * single-character separators work — which is how it is used ("\t", " ").
	 *
	 * @param str
	 *            , separator
	 * @return String array with string parts separated by the separator string
	 */
	public String[] split(String str, String separator) {
		ArrayList<String> lst = new ArrayList<String>();
		String word = "";
		for (int i = 0; i < str.length(); i++) {
			String letter = str.substring(i, i + 1);
			if (!letter.equalsIgnoreCase(separator)) {
				word = word + str.charAt(i);
			} else {
				lst.add(word);
				word = "";
			}
		}
		// fix: original used reference comparison (word != "")
		if (word.length() > 0) {
			lst.add(word);
		}
		return lst.toArray(new String[lst.size()]);
	}

	/**
	 * Generates the pseudo phrase from a string. A pseudo phrase is a version
	 * of a phrase that only contains non-stopwords, which are stemmed and
	 * sorted into alphabetical order.
	 */
	public String pseudoPhrase(String str) {
		str = str.toLowerCase();

		// This is often the case with Mesh Terms,
		// where a term is accompanied by another specifying term
		// e.g. Monocytes/*immunology/microbiology
		// we ignore everything after the "/" symbol.
		if (str.matches(".+?/.+?")) {
			String[] elements = str.split("/");
			str = elements[0];
		}

		// removes scope notes in brackets
		// should be replaced with a cleaner solution !!
		if (str.matches(".+?\\(.+?")) {
			String[] elements = str.split("\\(");
			str = elements[0];
		}

		// Remove some non-alphanumeric characters
		str = str.replace('-', ' ');
		str = str.replace('&', ' ');
		str = str.replaceAll("\\*", "");
		str = str.replaceAll("\\, ", " ");
		str = str.replaceAll("\\. ", " ");
		str = str.replaceAll("\\:", "");
		str = str.trim();

		// Remove stopwords, keeping only the part after an apostrophe
		String[] words = str.split(" ");
		String str_nostop = "";
		for (int i = 0; i < words.length; i++) {
			String word = words[i];
			if (!m_Stopwords.isStopword(word)) {
				if (word.matches(".+?\\'.+?")) {
					String[] elements = word.split("\\'");
					// fix: the original tested elements.length < 1, which is
					// never true (and would throw on elements[1] if it were);
					// the intent is to use the part after the apostrophe
					if (elements.length > 1)
						word = elements[1];
				}
				if (str_nostop.equals("")) {
					str_nostop = word;
				} else {
					str_nostop = str_nostop + " " + word;
				}
			}
		}

		// Stem and alphabetize
		String stemmed = m_Stemmer.stemString(str_nostop);
		String[] pseudophrase = sort(stemmed.split(" "));
		return join(pseudophrase);
	}

	/**
	 * overloaded swap method: exchange 2 locations in an array of Strings.
	 */
	public static void swap(int loc1, int loc2, String[] a) {
		String temp = a[loc1];
		a[loc1] = a[loc2];
		a[loc2] = temp;
	} // end swap

	/**
	 * Sorts an array of Strings into alphabetic order (in place, selection
	 * sort). A String that starts with upper case precedes the otherwise
	 * identical String that has only lower case letters.
	 */
	public static String[] sort(String[] a) {
		int i, j, firstAt;
		for (i = 0; i < a.length - 1; i++) {
			firstAt = i;
			for (j = i + 1; j < a.length; j++) {
				// case-insensitive primary ordering
				if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) < 0) {
					firstAt = j;
				}
				// if identical when converted to all same case
				if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) == 0) {
					// but a[j] precedes when not converted
					if (a[j].compareTo(a[firstAt]) < 0) {
						firstAt = j;
					}
				}
			}
			if (firstAt != i) {
				swap(i, firstAt, a);
			}
		}
		return a;
	} // end method selectionSort
}