package kea.vocab;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;
/**
 * Base class for controlled vocabularies.
 *
 * <p>Concrete subclasses provide the index (building it and looking up ids,
 * original terms and related terms). This class holds the document language,
 * the stemmer and the stop-word list, and offers the shared phrase
 * normalisation helpers ({@link #pseudoPhrase(String)}, {@link #remove(String[], int)})
 * plus a few static string-array utilities.
 */
public abstract class Vocabulary implements Serializable {

    private static final long serialVersionUID = -7157202718619833534L;

    /** Matches a string that has at least one character on each side of a '/'. */
    private static final Pattern SLASH_QUALIFIED = Pattern.compile(".+?/.+?");

    /** Matches a string that has at least one character on each side of a '('. */
    private static final Pattern BRACKETED = Pattern.compile(".+?\\(.+?");

    /** Matches a word that has at least one character on each side of an apostrophe. */
    private static final Pattern APOSTROPHIZED = Pattern.compile(".+?\\'.+?");

    /**
     * Ordering used by {@link #sort(String[])}: strings are compared by their
     * upper-cased form first; strings that are identical when upper-cased are
     * then ordered by their plain case-sensitive comparison.
     */
    private static final Comparator<String> CASE_AWARE_ORDER = new Comparator<String>() {
        public int compare(String left, String right) {
            int byUpperCase = left.toUpperCase().compareTo(right.toUpperCase());
            return (byUpperCase != 0) ? byUpperCase : left.compareTo(right);
        }
    };

    /** The document language */
    private String m_language;

    /** The default stemmer to be used */
    private Stemmer m_Stemmer;

    /** The list of stop words to be used */
    private Stopwords m_Stopwords;

    /**
     * Creates a vocabulary for the given document language.
     *
     * @param documentLanguage the document language, returned as-is by {@link #getLanguage()}
     */
    public Vocabulary(String documentLanguage) {
        m_language = documentLanguage;
    }

    /**
     * Initializes the vocabulary.
     */
    public abstract void initialize();

    /**
     * Builds the vocabulary index from SKOS RDF/XML files.
     *
     * @throws Exception if the implementation fails to build the index
     */
    public abstract void buildSKOS() throws Exception;

    // Building the vocabulary index from text files is currently disabled:
    // public abstract void build() throws Exception;

    /**
     * Builds the vocabulary index with descriptors/non-descriptors relations.
     *
     * @throws Exception if the implementation fails to build the index
     */
    public abstract void buildUSE() throws Exception;

    /**
     * Builds the vocabulary index with semantically related terms.
     *
     * @throws Exception if the implementation fails to build the index
     */
    public abstract void buildREL() throws Exception;

    /**
     * Given a phrase returns its id in the vocabulary.
     *
     * @param phrase the phrase to look up
     * @return id of the phrase in the vocabulary index
     */
    public abstract String getID(String phrase);

    /**
     * Given id, gets the original version of vocabulary term.
     *
     * @param id the id of a vocabulary term
     * @return original version of the vocabulary term
     */
    public abstract String getOrig(String id);

    /**
     * Given id of a term returns the list with ids of terms related to this term.
     *
     * @param id the id of a vocabulary term
     * @return a list with ids related to the input id
     */
    public abstract List<String> getRelated(String id);

    /**
     * Given an ID of a term gets the list of all IDs of terms
     * that are semantically related to the given term
     * with a specific relation.
     *
     * @param id the id of a vocabulary term
     * @param relation the relation the returned terms must have to {@code id}
     * @return a list with ids related to the input id by the specified relation
     */
    public abstract List<String> getRelated(String id, String relation);

    /**
     * Sets the stemmer used by {@link #pseudoPhrase(String)}.
     *
     * @param newStemmer the stemmer to use
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * @return the stemmer currently set, or {@code null} if none has been set
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Sets the stop-word list used by {@link #pseudoPhrase(String)} and
     * {@link #remove(String[], int)}.
     *
     * @param newM_Stopwords the stop-word list to use; {@code null} disables stop-word filtering
     */
    public void setStopwords(Stopwords newM_Stopwords) {
        this.m_Stopwords = newM_Stopwords;
    }

    /**
     * @return the document language passed to the constructor
     */
    public String getLanguage() {
        return m_language;
    }

    /**
     * Concatenates the given words, leaving out the word at position {@code i}
     * and every stop word.
     *
     * <p>A single space is appended after each kept word unless that word sits in
     * the last array position. Consequently the result ends with a space when
     * the kept words are followed by dropped ones; callers that need a clean
     * phrase should trim the result.
     *
     * @param words the words of a phrase
     * @param i index of the word to leave out; an index outside the array leaves out nothing
     * @return the remaining words separated by single spaces
     */
    public String remove(String[] words, int i) {
        StringBuilder kept = new StringBuilder();
        int lastIndex = words.length - 1;
        for (int j = 0; j < words.length; j++) {
            if (j == i || isStopword(words[j])) {
                continue;
            }
            kept.append(words[j]);
            if (j != lastIndex) {
                kept.append(' ');
            }
        }
        return kept.toString();
    }

    /**
     * Generates the pseudo phrase from a string.
     * A pseudo phrase is a version of a phrase
     * that only contains non-stopwords,
     * which are stemmed and sorted into alphabetical order.
     *
     * <p>If no stop-word list has been set, no word is treated as a stop word.
     * A stemmer must have been set via {@link #setStemmer(Stemmer)} before
     * calling this method with a non-null argument.
     *
     * @param str the phrase to normalise; may be {@code null}
     * @return the pseudo phrase, or {@code null} if {@code str} is {@code null}
     */
    public String pseudoPhrase(String str) {
        if (str == null) {
            return null;
        }
        String cleaned = normalizePunctuation(stripQualifiers(str.toLowerCase()));
        String withoutStopwords = dropStopwords(cleaned.split(" "));
        String stemmed = m_Stemmer.stemString(withoutStopwords);

        String[] pseudophrase = stemmed.split(" ");
        Arrays.sort(pseudophrase);
        return join(pseudophrase);
    }

    /**
     * Cuts a term at its first '/' and then at its first '('.
     *
     * <p>The slash case is common with MeSH terms, where a term is accompanied
     * by another specifying term, e.g. {@code Monocytes/*immunology/microbiology};
     * everything from the first "/" on is ignored. The bracket case removes scope
     * notes in brackets (should be replaced with a cleaner solution).
     *
     * <p>The cut position is found with {@code indexOf} rather than
     * {@code split(...)[0]}: {@code split} returns an empty array for input made
     * only of delimiters (e.g. {@code "///"}), which made the array access fail.
     */
    private static String stripQualifiers(String term) {
        if (SLASH_QUALIFIED.matcher(term).matches()) {
            term = term.substring(0, term.indexOf('/'));
        }
        if (BRACKETED.matcher(term).matches()) {
            term = term.substring(0, term.indexOf('('));
        }
        return term;
    }

    /** Removes or blanks out some non-alphanumeric characters and trims the result. */
    private static String normalizePunctuation(String term) {
        return term.replace('-', ' ')
                .replace('&', ' ')
                .replace("*", "")
                .replace(", ", " ")
                .replace(". ", " ")
                .replace(":", "")
                .trim();
    }

    /**
     * Joins the non-stop words with single spaces. A word with text on both
     * sides of an apostrophe is replaced by the part between its first and
     * second apostrophe (or the rest of the word if there is only one).
     */
    private String dropStopwords(String[] words) {
        StringBuilder kept = new StringBuilder();
        for (String word : words) {
            if (isStopword(word)) {
                continue;
            }
            if (APOSTROPHIZED.matcher(word).matches()) {
                String[] elements = word.split("\\'");
                if (elements.length > 1) {
                    word = elements[1];
                }
            }
            if (kept.length() > 0) {
                kept.append(' ');
            }
            kept.append(word);
        }
        return kept.toString();
    }

    /** Tells whether the word is a stop word; nothing is a stop word when no list is set. */
    private boolean isStopword(String word) {
        return m_Stopwords != null && m_Stopwords.isStopword(word);
    }

    /**
     * Joins an array of strings to a single string, separated by single spaces.
     *
     * <p>No separator is written while the result is still empty, so leading
     * empty strings do not produce leading spaces. Emptiness is checked by
     * length, not by reference identity, so the outcome does not depend on which
     * {@code String} instances the array happens to contain.
     *
     * @param str the strings to join; elements are expected to be non-null
     * @return the joined string, empty if the array is empty
     */
    protected static String join(String[] str) {
        StringBuilder joined = new StringBuilder();
        for (String part : str) {
            if (joined.length() > 0) {
                joined.append(' ');
            }
            joined.append(part);
        }
        return joined.toString();
    }

    /**
     * Exchanges two locations in an array of Strings.
     *
     * @param loc1 index of the first element
     * @param loc2 index of the second element
     * @param a the array to modify in place
     */
    public static void swap(int loc1, int loc2, String[] a) {
        String temp = a[loc1];
        a[loc1] = a[loc2];
        a[loc2] = temp;
    }

    /**
     * Sorts an array of Strings into alphabetic order, in place.
     *
     * <p>Strings are ordered by their upper-cased form; strings that are
     * identical when upper-cased are ordered by case-sensitive comparison, so a
     * String that starts with an upper-case letter precedes the otherwise
     * identical all-lower-case String.
     *
     * @param a the array to sort; it is modified in place
     * @return the same array instance, now sorted
     */
    public static String[] sort(String[] a) {
        Arrays.sort(a, CASE_AWARE_ORDER);
        return a;
    }
}