package kea.main;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.openrdf.elmo.ElmoModule;
import org.openrdf.elmo.sesame.SesameManager;
import org.openrdf.elmo.sesame.SesameManagerFactory;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.nativerdf.NativeStore;
import edu.unc.ils.mrc.hive.HiveException;
import edu.unc.ils.mrc.hive.api.SKOSScheme;
import edu.unc.ils.mrc.hive.api.impl.elmo.SKOSSchemeImpl;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import kea.filters.KEAFilter;
import kea.stemmers.*;
import kea.stopwords.*;
import kea.vocab.Vocabulary;
import kea.vocab.VocabularyH2;
/**
 * Builds a keyphrase extraction model from the documents in a given
 * directory. Assumes that the file names for the documents end with
 * ".txt". Assumes that files containing corresponding
 * author-assigned keyphrases end with ".key". Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text).
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory (required).<p>
 *
 * -m "model name"<br>
 * Specifies name of model (required).<p>
 *
 * -e "encoding"<br>
 * Specifies encoding (default: platform default encoding).<p>
 *
 * -v "vocabulary name" <br>
 * Specifies vocabulary name (e.g. agrovoc or none) (required).<p>
 *
 * -f "vocabulary format" <br>
 * Specifies vocabulary format (text or skos); required unless the
 * vocabulary name is "none".<p>
 *
 * -i "document language" <br>
 * Specifies document language (en, es, de, fr) (default: en).<p>
 *
 * -d<br>
 * Turns debugging mode on.<p>
 *
 * -k<br>
 * Use keyphrase frequency statistic.<p>
 *
 * -p<br>
 * Disallow internal periods.<p>
 *
 * -x "length"<br>
 * Sets maximum phrase length (default: 5).<p>
 *
 * -y "length"<br>
 * Sets minimum phrase length (default: 1).<p>
 *
 * -o "number"<br>
 * The minimum number of times a phrase needs to occur (default: 2). <p>
 *
 * -s "name of class implementing list of stop words"<br>
 * Sets list of stop words to use. If none is given, a StopwordsEnglish
 * list is created from the stopwords path passed to buildModel.<p>
 *
 * -t "name of class implementing stemmer"<br>
 * Sets stemmer to use (default: SremovalStemmer). <p>
 *
 * -n<br>
 * Do not check for proper nouns. <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAModelBuilder implements OptionHandler {

  private static final Log logger = LogFactory.getLog(KEAModelBuilder.class);

  /** Package prepended to unqualified class names given with -s. */
  private static final String STOPWORDS_PACKAGE = "kea.stopwords.";

  /** Package prepended to unqualified class names given with -t. */
  private static final String STEMMERS_PACKAGE = "kea.stemmers.";

  /** Default maximum phrase length, used when -x is absent. */
  private static final int DEFAULT_MAX_PHRASE_LENGTH = 5;

  /** Default minimum phrase length, used when -y is absent. */
  private static final int DEFAULT_MIN_PHRASE_LENGTH = 1;

  /** Default minimum number of occurrences, used when -o is absent. */
  private static final int DEFAULT_MIN_NUM_OCCUR = 2;

  /** Stopwords path (set by buildModel) */
  String m_stopwordsPath;

  /** Name of directory */
  String m_dirName = null;

  /** Name of model */
  String m_modelName = null;

  /** Vocabulary name */
  String m_vocabulary = null;

  /** Format of the vocabulary */
  String m_vocabularyFormat = "skos";

  /** Document language */
  String m_documentLanguage = "en";

  /** Encoding; the value "default" selects the platform default encoding */
  String m_encoding = "default";

  /** Debugging mode? */
  boolean m_debug = false;

  /** Use keyphrase frequency attribute? */
  boolean m_useKFrequency = false;

  /** Disallow internal periods? */
  boolean m_disallowIPeriods = false;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = DEFAULT_MAX_PHRASE_LENGTH;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = DEFAULT_MIN_PHRASE_LENGTH;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = DEFAULT_MIN_NUM_OCCUR;

  /** The KEA filter object (created by buildModel) */
  KEAFilter m_KEAFilter = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new SremovalStemmer();

  /** The list of stop words to be used; null until configured */
  private Stopwords m_Stopwords;

  /** Determines whether check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;

  /** The vocabulary loaded by buildModel */
  private Vocabulary vocabulary;

  /**
   * Creates a builder configured for a SKOS vocabulary.
   *
   * @param scheme the scheme the model is built for; currently not read
   *               by the constructor (the scheme is passed again to
   *               buildModel)
   */
  public KEAModelBuilder(SKOSScheme scheme) {
    m_vocabularyFormat = "skos";
  }

  /**
   * Tells whether the check for proper nouns is performed.
   *
   * @return true if proper nouns are checked for
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Switches the check for proper nouns on or off.
   *
   * @param newM_CheckProperNouns true to check for proper nouns
   */
  public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
    this.m_CheckForProperNouns = newM_CheckProperNouns;
  }

  /**
   * Gets the list of stop words.
   *
   * @return the stop word list, or null if none has been configured yet
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Sets the list of stop words.
   *
   * @param newM_Stopwords the stop word list to use
   */
  public void setStopwords(Stopwords newM_Stopwords) {
    this.m_Stopwords = newM_Stopwords;
  }

  /**
   * Sets the list of stop words to a StopwordsEnglish instance created
   * from the given path.
   *
   * @param stopwordsPath path handed to the StopwordsEnglish constructor
   */
  public void setStopwords(String stopwordsPath) {
    this.m_Stopwords = new StopwordsEnglish(stopwordsPath);
  }

  /**
   * Gets the stemmer.
   *
   * @return the stemmer in use
   */
  public Stemmer getStemmer() {
    return m_Stemmer;
  }

  /**
   * Sets the stemmer.
   *
   * @param newStemmer the stemmer to use
   */
  public void setStemmer(Stemmer newStemmer) {
    this.m_Stemmer = newStemmer;
  }

  /**
   * Get the value of MinNumOccur.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {
    return m_MinNumOccur;
  }

  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {
    m_MinNumOccur = newMinNumOccur;
  }

  /**
   * Get the value of MaxPhraseLength.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {
    return m_MaxPhraseLength;
  }

  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {
    m_MaxPhraseLength = newMaxPhraseLength;
  }

  /**
   * Get the value of MinPhraseLength.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {
    return m_MinPhraseLength;
  }

  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {
    m_MinPhraseLength = newMinPhraseLength;
  }

  /**
   * Get the value of disallowIPeriods.
   *
   * @return Value of disallowIPeriods.
   */
  public boolean getDisallowIPeriods() {
    return m_disallowIPeriods;
  }

  /**
   * Set the value of disallowIPeriods.
   *
   * @param newdisallowIPeriods Value to assign to disallowIPeriods.
   */
  public void setDisallowIPeriods(boolean newdisallowIPeriods) {
    m_disallowIPeriods = newdisallowIPeriods;
  }

  /**
   * Get the value of useKFrequency.
   *
   * @return Value of useKFrequency.
   */
  public boolean getUseKFrequency() {
    return m_useKFrequency;
  }

  /**
   * Set the value of useKFrequency.
   *
   * @param newuseKFrequency Value to assign to useKFrequency.
   */
  public void setUseKFrequency(boolean newuseKFrequency) {
    m_useKFrequency = newuseKFrequency;
  }

  /**
   * Get the value of debug.
   *
   * @return Value of debug.
   */
  public boolean getDebug() {
    return m_debug;
  }

  /**
   * Set the value of debug.
   *
   * @param newdebug Value to assign to debug.
   */
  public void setDebug(boolean newdebug) {
    m_debug = newdebug;
  }

  /**
   * Get the value of encoding.
   *
   * @return Value of encoding.
   */
  public String getEncoding() {
    return m_encoding;
  }

  /**
   * Set the value of encoding.
   *
   * @param newencoding Value to assign to encoding; "default" selects
   *                    the platform default encoding.
   */
  public void setEncoding(String newencoding) {
    m_encoding = newencoding;
  }

  /**
   * Get the value of vocabulary name.
   *
   * @return Value of vocabulary name.
   */
  public String getVocabulary() {
    return m_vocabulary;
  }

  /**
   * Set the value of vocabulary name.
   *
   * @param newvocabulary Value to assign to vocabulary name.
   */
  public void setVocabulary(String newvocabulary) {
    m_vocabulary = newvocabulary;
  }

  /**
   * Get the value of document language.
   *
   * @return Value of document language.
   */
  public String getDocumentLanguage() {
    return m_documentLanguage;
  }

  /**
   * Set the value of document language.
   *
   * @param newdocumentLanguage Value to assign to document language.
   */
  public void setDocumentLanguage(String newdocumentLanguage) {
    m_documentLanguage = newdocumentLanguage;
  }

  /**
   * Get the value of vocabulary format.
   *
   * @return Value of vocabulary format.
   */
  public String getVocabularyFormat() {
    return m_vocabularyFormat;
  }

  /**
   * Set the value of vocabulary format.
   *
   * @param newvocabularyFormat Value to assign to vocabulary format.
   */
  public void setVocabularyFormat(String newvocabularyFormat) {
    m_vocabularyFormat = newvocabularyFormat;
  }

  /**
   * Get the value of modelName.
   *
   * @return Value of modelName.
   */
  public String getModelName() {
    return m_modelName;
  }

  /**
   * Set the value of modelName.
   *
   * @param newmodelName Value to assign to modelName.
   */
  public void setModelName(String newmodelName) {
    m_modelName = newmodelName;
  }

  /**
   * Get the value of dirName.
   *
   * @return Value of dirName.
   */
  public String getDirName() {
    return m_dirName;
  }

  /**
   * Set the value of dirName.
   *
   * @param newdirName Value to assign to dirName.
   */
  public void setDirName(String newdirName) {
    m_dirName = newdirName;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -l "directory name" <br>
   * Specifies name of directory (required).<p>
   *
   * -m "model name" <br>
   * Specifies name of model (required).<p>
   *
   * -v "vocabulary name" <br>
   * Specifies vocabulary name (required).<p>
   *
   * -f "vocabulary format" <br>
   * Specifies vocabulary format (skos or text); required unless the
   * vocabulary name is "none".<p>
   *
   * -i "document language" <br>
   * Specifies document language (default: en).<p>
   *
   * -e "encoding" <br>
   * Specifies encoding (default: platform default encoding).<p>
   *
   * -d<br>
   * Turns debugging mode on.<p>
   *
   * -k<br>
   * Use keyphrase frequency statistic.<p>
   *
   * -p<br>
   * Disallow internal periods. <p>
   *
   * -x "length"<br>
   * Sets maximum phrase length (default: 5).<p>
   *
   * -y "length"<br>
   * Sets minimum phrase length (default: 1).<p>
   *
   * -o "number"<br>
   * The minimum number of times a phrase needs to occur (default: 2). <p>
   *
   * -s "name of class implementing list of stop words"<br>
   * Sets list of stop words to use. A name without a package is looked
   * up in kea.stopwords; a name containing a dot is used as given. When
   * the option is absent the current stop word list is left unchanged.<p>
   *
   * -t "name of class implementing stemmer"<br>
   * Sets stemmer to use. A name without a package is looked up in
   * kea.stemmers; a name containing a dot is used as given. When the
   * option is absent the current stemmer is left unchanged.<p>
   *
   * -n<br>
   * Do not check for proper nouns. <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if a required option is missing, a value is
   *            invalid, or an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String dirName = Utils.getOption('l', options);
    if (dirName.length() == 0) {
      setDirName(null);
      throw new Exception("Name of directory required argument.");
    }
    setDirName(dirName);

    String modelName = Utils.getOption('m', options);
    if (modelName.length() == 0) {
      setModelName(null);
      throw new Exception("Name of model required argument.");
    }
    setModelName(modelName);

    String vocabularyName = Utils.getOption('v', options);
    if (vocabularyName.length() == 0) {
      setVocabulary(null);
      throw new Exception("Name of vocabulary required argument.");
    }
    setVocabulary(vocabularyName);

    // The format is always consumed from the option array, but it is only
    // validated when a controlled vocabulary is actually used.
    String vocabularyFormat = Utils.getOption('f', options);
    if (vocabularyName.equals("none")) {
      setVocabularyFormat(null);
    } else if (vocabularyFormat.length() == 0) {
      setVocabularyFormat(null);
      throw new Exception("If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
    } else if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
      setVocabularyFormat(vocabularyFormat);
    } else {
      throw new Exception("Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
    }

    String encoding = Utils.getOption('e', options);
    setEncoding(encoding.length() > 0 ? encoding : "default");

    String documentLanguage = Utils.getOption('i', options);
    setDocumentLanguage(documentLanguage.length() > 0 ? documentLanguage : "en");

    setMaxPhraseLength(intOption('x', options, DEFAULT_MAX_PHRASE_LENGTH));
    setMinPhraseLength(intOption('y', options, DEFAULT_MIN_PHRASE_LENGTH));
    setMinNumOccur(intOption('o', options, DEFAULT_MIN_NUM_OCCUR));

    String stopwordsString = Utils.getOption('s', options);
    if (stopwordsString.length() > 0) {
      setStopwords((Stopwords) instantiate(stopwordsString, STOPWORDS_PACKAGE));
    }

    String stemmerString = Utils.getOption('t', options);
    if (stemmerString.length() > 0) {
      setStemmer((Stemmer) instantiate(stemmerString, STEMMERS_PACKAGE));
    }

    setDebug(Utils.getFlag('d', options));
    setUseKFrequency(Utils.getFlag('k', options));
    setDisallowIPeriods(Utils.getFlag('p', options));
    setCheckForProperNouns(!Utils.getFlag('n', options));

    Utils.checkForRemainingOptions(options);
  }

  /**
   * Reads an integer-valued option, falling back to a default when the
   * option is absent.
   */
  private static int intOption(char flag, String[] options, int defaultValue) throws Exception {
    String value = Utils.getOption(flag, options);
    return (value.length() > 0) ? Integer.parseInt(value) : defaultValue;
  }

  /**
   * Instantiates a class by name. Unqualified names are resolved inside
   * defaultPackage; names containing a dot are taken as fully qualified,
   * so that the class names produced by getOptions can be fed back in.
   */
  private static Object instantiate(String className, String defaultPackage) throws Exception {
    String qualifiedName = (className.indexOf('.') >= 0)
        ? className
        : defaultPackage.concat(className);
    return Class.forName(qualifiedName).newInstance();
  }

  /**
   * Gets the current option settings.
   *
   * The -s and -t options are only emitted when a stop word list or a
   * stemmer is set, and -n is only emitted when the proper-noun check is
   * switched off, matching how setOptions interprets that flag. Unused
   * trailing slots of the returned array hold empty strings.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {
    String [] options = new String [26];
    int current = 0;

    options[current++] = "-l";
    options[current++] = "" + (getDirName());
    options[current++] = "-m";
    options[current++] = "" + (getModelName());
    options[current++] = "-v";
    options[current++] = "" + (getVocabulary());
    options[current++] = "-f";
    options[current++] = "" + (getVocabularyFormat());
    options[current++] = "-e";
    options[current++] = "" + (getEncoding());
    options[current++] = "-i";
    options[current++] = "" + (getDocumentLanguage());

    if (getUseKFrequency()) {
      options[current++] = "-k";
    }
    if (getDebug()) {
      options[current++] = "-d";
    }
    if (getDisallowIPeriods()) {
      options[current++] = "-p";
    }

    options[current++] = "-x";
    options[current++] = "" + (getMaxPhraseLength());
    options[current++] = "-y";
    options[current++] = "" + (getMinPhraseLength());
    options[current++] = "-o";
    options[current++] = "" + (getMinNumOccur());

    // The stop word list is null until -s or a setter has been used.
    Stopwords stopwords = getStopwords();
    if (stopwords != null) {
      options[current++] = "-s";
      options[current++] = stopwords.getClass().getName();
    }
    Stemmer stemmer = getStemmer();
    if (stemmer != null) {
      options[current++] = "-t";
      options[current++] = stemmer.getClass().getName();
    }

    // -n means "do NOT check for proper nouns".
    if (!getCheckForProperNouns()) {
      options[current++] = "-n";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(15);

    newVector.addElement(new Option(
        "\tSpecifies name of directory.",
        "l", 1, "-l <directory name>"));
    newVector.addElement(new Option(
        "\tSpecifies name of model.",
        "m", 1, "-m <model name>"));
    newVector.addElement(new Option(
        "\tSpecifies vocabulary name.",
        "v", 1, "-v <vocabulary name>"));
    newVector.addElement(new Option(
        "\tSpecifies vocabulary format (skos or text).",
        "f", 1, "-f <vocabulary format>"));
    newVector.addElement(new Option(
        "\tSpecifies document language (en (default), es, de, fr).",
        "i", 1, "-i <document language>"));
    newVector.addElement(new Option(
        "\tSpecifies encoding.",
        "e", 1, "-e <encoding>"));
    newVector.addElement(new Option(
        "\tTurns debugging mode on.",
        "d", 0, "-d"));
    newVector.addElement(new Option(
        "\tUse keyphrase frequency statistic.",
        "k", 0, "-k"));
    newVector.addElement(new Option(
        "\tDisallow internal periods.",
        "p", 0, "-p"));
    newVector.addElement(new Option(
        "\tSets the maximum phrase length (default: 5).",
        "x", 1, "-x <length>"));
    newVector.addElement(new Option(
        "\tSets the minimum phrase length (default: 1).",
        "y", 1, "-y <length>"));
    newVector.addElement(new Option(
        "\tSet the minimum number of occurences (default: 2).",
        "o", 1, "-o <number>"));
    newVector.addElement(new Option(
        "\tSets the list of stopwords to use (default: StopwordsEnglish).",
        "s", 1, "-s <name of stopwords class>"));
    newVector.addElement(new Option(
        "\tSet the stemmer to use (default: SremovalStemmer).",
        "t", 1, "-t <name of stemmer class>"));
    newVector.addElement(new Option(
        "\tDo not check for proper nouns.",
        "n", 0, "-n"));

    return newVector.elements();
  }

  /**
   * Collects the stems of the file names, i.e. the names of all ".txt"
   * and ".key" files in the configured directory with the extension
   * removed.
   *
   * @return a table whose keys are the stems; every value is a Double 0
   * @exception Exception if the directory name is not set or the
   *            directory cannot be listed
   */
  public Hashtable collectStems() throws Exception {
    // File.list() returns null when the path is not a readable directory.
    String[] files = (m_dirName == null) ? null : new File(m_dirName).list();
    if (files == null) {
      throw new Exception("Problem opening directory " + m_dirName);
    }

    Hashtable stems = new Hashtable();
    for (int i = 0; i < files.length; i++) {
      String name = files[i];
      if (name.endsWith(".key") || name.endsWith(".txt")) {
        // Both extensions are four characters long.
        String stem = name.substring(0, name.length() - 4);
        if (!stems.containsKey(stem)) {
          stems.put(stem, new Double(0));
        }
      }
    }
    return stems;
  }

  /**
   * Builds the model from the files.
   *
   * For every stem the ".txt" document and the ".key" keyphrase file are
   * read from the configured directory and fed to the KEA filter as one
   * training instance. A file that cannot be read is recorded as a
   * missing value rather than aborting the build.
   *
   * @param stems         the document stems, as returned by collectStems
   * @param schema        the scheme providing the vocabulary name, RDF
   *                      path and manager
   * @param stopwordsPath path of the stopwords file; also used to create
   *                      the stop word list if none has been set
   * @param manager       not used by this method
   * @exception Exception if there are no stems or the filter fails
   */
  public void buildModel(Hashtable stems, SKOSScheme schema, String stopwordsPath, SesameManager manager) throws Exception {
    // Check whether there is actually any data before loading the vocabulary
    if (stems.size() == 0) {
      throw new Exception("Couldn't find any data!");
    }

    String h2path = new File(schema.getRdfPath()).getParentFile().getAbsolutePath();
    this.vocabulary = new VocabularyH2(schema.getName(), h2path, m_documentLanguage, schema.getManager());

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Without an explicitly configured stop word list, fall back to the
    // one at stopwordsPath; otherwise null would be handed to the filter
    // and the thesaurus loader below.
    m_stopwordsPath = stopwordsPath;
    if (getStopwords() == null) {
      setStopwords(stopwordsPath);
    }

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setStopwords(m_stopwordsPath);
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.loadThesaurus(getStemmer(), getStopwords(), this.vocabulary);
    m_KEAFilter.setNumFeature();

    logger.info("-- Reading the documents");
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
      String stem = (String) elem.nextElement();
      double[] newInst = new double[2];

      try {
        String text = readFile(new File(m_dirName, stem + ".txt"));
        newInst[0] = (double) data.attribute(0).addStringValue(text);
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("Can't find document for stem " + stem + ".");
        }
        newInst[0] = Instance.missingValue();
      }

      try {
        String keyphrases = readFile(new File(m_dirName, stem + ".key"));
        newInst[1] = (double) data.attribute(1).addStringValue(keyphrases);
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("Can't find keyphrases for stem " + stem + ".");
        }
        newInst[1] = Instance.missingValue();
      }

      // Feed the single instance to the filter, then drop the string
      // values again so that the data set does not grow with every file.
      data.add(new Instance(1.0, newInst));
      m_KEAFilter.input(data.instance(0));
      data = data.stringFreeStructure();
    }

    m_KEAFilter.batchFinished(this.vocabulary);
    // Drain the filter's output queue; the instances themselves are not needed.
    while ((m_KEAFilter.output()) != null) {};
  }

  /**
   * Reads a whole file into a string using the configured encoding (the
   * platform default when the encoding is "default"). The underlying
   * stream is closed whether or not reading succeeds.
   */
  private String readFile(File file) throws IOException {
    FileInputStream in = new FileInputStream(file);
    try {
      InputStreamReader reader = m_encoding.equals("default")
          ? new InputStreamReader(in)
          : new InputStreamReader(in, m_encoding);
      StringBuffer text = new StringBuffer();
      char[] buffer = new char[4096];
      int count;
      while ((count = reader.read(buffer)) != -1) {
        text.append(buffer, 0, count);
      }
      return text.toString();
    } finally {
      in.close();
    }
  }

  /**
   * Saves the extraction model (the serialized KEA filter) to the file
   * named by the model name.
   *
   * @exception Exception if the file cannot be written
   */
  public void saveModel() throws Exception {
    FileOutputStream fileOut = new FileOutputStream(m_modelName);
    ObjectOutputStream out = null;
    try {
      out = new ObjectOutputStream(new BufferedOutputStream(fileOut));
      out.writeObject(m_KEAFilter);
      out.flush();
    } finally {
      // Closing the outermost stream that was created also closes the file.
      if (out != null) {
        out.close();
      } else {
        fileOut.close();
      }
    }
  }

  /**
   * The main method. Builds and saves a model using the given options
   * together with the hard-coded paths below.
   *
   * @param ops the command line options, see setOptions
   * @throws RepositoryException
   */
  public static void main(String[] ops) throws RepositoryException {
    // NOTE(review): these locations are hard-coded for one particular
    // installation; they look like development defaults.
    String trainDir = "/home/hive/hive-data/nbii/nbiiKEA/train";
    String stopwordsPath = "/home/hive/hive-data/nbii/nbiiKEA/data/stopwords/stopwords_en.txt";
    String confPath = "/home/hive/workspace/hive-core/conf/";
    String vocabularyName = "nbii";

    try {
      SKOSScheme schema = new SKOSSchemeImpl(confPath, vocabularyName, false);
      KEAModelBuilder kmb = new KEAModelBuilder(schema);
      try {
        kmb.setOptions(ops);
        // NOTE(review): this replaces the directory given with the
        // required -l option by the hard-coded one - confirm intent.
        kmb.setDirName(trainDir);

        System.err.print("Building model with options: ");
        String[] optionSettings = kmb.getOptions();
        for (int i = 0; i < optionSettings.length; i++) {
          System.err.print(optionSettings[i] + " ");
        }
        System.err.println();

        kmb.buildModel(kmb.collectStems(), schema, stopwordsPath, null);
        kmb.saveModel();
      } catch (Exception e) {
        // Report the failure and remind the user of the available options.
        e.printStackTrace();
        System.err.println(e.getMessage());
        System.err.println("\nOptions:\n");
        Enumeration en = kmb.listOptions();
        while (en.hasMoreElements()) {
          Option option = (Option) en.nextElement();
          System.err.println(option.synopsis());
          System.err.println(option.description());
        }
      }
    } catch (HiveException e) {
      e.printStackTrace();
    }
  }
}