KEAModelBuilder.java example

Explorer
hive-mrc-master
- doc
  - sampleCode
- hive-core
  - src
  - test
    - edu
      - unc
        ils
        mrc
        hive
        api
        SKOSSchemeTest.java
        SearcherTest.java
        TaggerTest.java
        ir
        lucene
        search
        AutocompleteTest
        AutocompleteTest.java
        tagging
        KEATaggerTest.java
        util
        SimpleCrawlerTest.java
        hive2
        api
        impl
        test
        HiveH2IndexImplTest.java
        HiveLuceneIndexImplTest.java
        HiveVocabularyImplTest.java
- hive-rs
  - src
    - org
      - unc
        hive
        services
        rs
        ConceptsResource.java
        ConfigurationListener.java
        SchemesResource.java
  - test
    - org
      - unc
        hive
        services
        rs
        ConceptsResourceTest.java
        FileIO.java
        SchemesResourceTest.java
- hive-web
  - src
    - org
      - unc
        hive
        client
        ClosablePanel.java
        ConceptBrowser.java
        ConceptBrowserService.java
        ConceptBrowserServiceAsync.java
        ConceptLink.java
        ConceptProxy.java
        HIVEMessages.java
        HomePage.java
        Indexer.java
        IndexerService.java
        IndexerServiceAsync.java
        RecordFormatter.java
        TestVis.java
        server
        ConceptBrowserServiceImpl.java
        FileUpload.java
        IndexerServiceImpl.java
        VocabularyService.java
        services
        ConceptListResource.java
        Main.java
        SKOSResourceApplication.java
        servlet
        AutocompleteServlet.java
        TermSuggestionServlet.java
        sync
        SyncJob.java
package kea.main;


import java.io.BufferedOutputStream;


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.openrdf.elmo.ElmoModule;
import org.openrdf.elmo.sesame.SesameManager;
import org.openrdf.elmo.sesame.SesameManagerFactory;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.nativerdf.NativeStore;

import edu.unc.ils.mrc.hive.HiveException;
import edu.unc.ils.mrc.hive.api.SKOSScheme;
import edu.unc.ils.mrc.hive.api.impl.elmo.SKOSSchemeImpl;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import kea.filters.KEAFilter;
import kea.stemmers.*;
import kea.stopwords.*;
import kea.vocab.Vocabulary;
import kea.vocab.VocabularyH2;

/**
 * Builds a keyphrase extraction model from the documents in a given
 * directory.  Assumes that the file names for the documents end with
 * ".txt".  Assumes that files containing corresponding
 * author-assigned keyphrases end with ".key". Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text).
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory (required).<p>
 *
 * -m "model name"<br>
 * Specifies name of model (required).<p>
 *
 * -e "encoding"<br>
 * Specifies encoding (default: platform encoding).<p>
 * 
 * -v "vocabulary name" <br>
 * Specifies vocabulary name, e.g. agrovoc or none (required).<p>
 * 
 * -f "vocabulary format" <br>
 * Specifies vocabulary format (text or skos); required unless the
 * vocabulary name is "none".<p>
 *
 * -i "document language" <br>
 * Specifies document language (en, es, de, fr; default: en).<p>
 *
 * -d<br>
 * Turns debugging mode on.<p>
 *
 * -k<br>
 * Use keyphrase frequency statistic.<p>
 *
 * -p<br>
 * Disallow internal periods.<p>
 *
 * -x "length"<br>
 * Sets maximum phrase length (default: 5).<p>
 *
 * -y "length"<br>
 * Sets minimum phrase length (default: 1).<p>
 *
 * -o "number"<br>
 * The minimum number of times a phrase needs to occur (default: 2). <p>
 *
 * -s "name of class implementing list of stop words"<br>
 * Sets list of stop words to use; the name is resolved inside the
 * <code>kea.stopwords</code> package.<p>
 *
 * -t "name of class implementing stemmer"<br>
 * Sets stemmer to use; the name is resolved inside the
 * <code>kea.stemmers</code> package (default: SremovalStemmer). <p>
 *
 * -n<br>
 * Do not check for proper nouns. <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAModelBuilder implements OptionHandler {
	
    private static final Log logger = LogFactory.getLog(KEAModelBuilder.class);
	
	/** Package prefix that {@link #setOptions(String[])} prepends to -s values. */
	private static final String STOPWORDS_PACKAGE = "kea.stopwords.";
	
	/** Package prefix that {@link #setOptions(String[])} prepends to -t values. */
	private static final String STEMMERS_PACKAGE = "kea.stemmers.";
	
	/** Stopwords path */
	String m_stopwordsPath;
	
	/** Name of directory */
	String m_dirName = null;
	
	/** Name of model */
	String m_modelName = null;
	
	/** Vocabulary name */
	String m_vocabulary = null;
	
	/** Format of the vocabulary */
	String m_vocabularyFormat = "skos";
	
	/** Document language */
	String m_documentLanguage = "en";
	
	/** Encoding; the literal "default" selects the platform encoding */
	String m_encoding = "default";
	
	/** Debugging mode? */
	boolean m_debug = false;
	
	/** Use keyphrase frequency attribute? */
	boolean m_useKFrequency = false;
	
	/** Disallow internal periods? */
	boolean m_disallowIPeriods = false;
	
	/** The maximum length of phrases */
	private int m_MaxPhraseLength = 5;
	
	/** The minimum length of phrases */
	private int m_MinPhraseLength = 1;
	
	/** The minimum number of occurences of a phrase */
	private int m_MinNumOccur = 2;
	
	/** The KEA filter object; null until {@link #buildModel} has run */
	KEAFilter m_KEAFilter = null;
	
	/** The stemmer to be used */
	private Stemmer m_Stemmer = new SremovalStemmer();
	
	/** The list of stop words to be used; null until one of the setStopwords methods is called */
	private Stopwords m_Stopwords;
	
	/** Determines whether check for proper nouns is performed */
	private boolean m_CheckForProperNouns = true;
	
	/** Vocabulary opened by {@link #buildModel} */
	private Vocabulary vocabulary;
	
	/**
	 * Creates a builder preconfigured for SKOS vocabularies.
	 *
	 * @param scheme currently unused by the constructor; the scheme that is
	 *        actually read is the one passed to {@link #buildModel}.
	 */
	public KEAModelBuilder(SKOSScheme scheme) {
		m_vocabularyFormat = "skos";
	}
	
	/**
	 * Tells whether the check for proper nouns is performed.
	 * @return true if proper nouns are checked for.
	 */
	public boolean getCheckForProperNouns() {
		return m_CheckForProperNouns;
	}
	
	/**
	 * Enables or disables the check for proper nouns.
	 * @param newM_CheckProperNouns true to check for proper nouns.
	 */
	public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
		this.m_CheckForProperNouns = newM_CheckProperNouns;
	}
	
	/**
	 * Get the list of stop words.
	 * @return the stop word list, or null if none has been set.
	 */
	public Stopwords getStopwords() {
		return m_Stopwords;
	}
	
	/**
	 * Set the list of stop words.
	 * @param newM_Stopwords the stop word list to use.
	 */
	public void setStopwords(Stopwords newM_Stopwords) {
		this.m_Stopwords = newM_Stopwords;
	}
	
	/**
	 * Set the list of stop words to a {@link StopwordsEnglish} created
	 * from the given path.
	 * @param stopwordsPath path handed to the StopwordsEnglish constructor.
	 */
	public void setStopwords(String stopwordsPath) {
		this.m_Stopwords = new StopwordsEnglish(stopwordsPath);
	}
	
	/**
	 * Get the stemmer.
	 * @return the stemmer in use.
	 */
	public Stemmer getStemmer() {
		return m_Stemmer;
	}
	
	/**
	 * Set the stemmer.
	 * @param newStemmer the stemmer to use.
	 */
	public void setStemmer(Stemmer newStemmer) {
		this.m_Stemmer = newStemmer;
	}
	
	/**
	 * Get the minimum number of times a phrase needs to occur.
	 *
	 * @return Value of MinNumOccur.
	 */
	public int getMinNumOccur() {
		return m_MinNumOccur;
	}
	
	/**
	 * Set the minimum number of times a phrase needs to occur.
	 *
	 * @param newMinNumOccur Value to assign to MinNumOccur.
	 */
	public void setMinNumOccur(int newMinNumOccur) {
		m_MinNumOccur = newMinNumOccur;
	}
	
	/**
	 * Get the maximum phrase length.
	 *
	 * @return Value of MaxPhraseLength.
	 */
	public int getMaxPhraseLength() {
		return m_MaxPhraseLength;
	}
	
	/**
	 * Set the maximum phrase length.
	 *
	 * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
	 */
	public void setMaxPhraseLength(int newMaxPhraseLength) {
		m_MaxPhraseLength = newMaxPhraseLength;
	}
	
	/**
	 * Get the minimum phrase length.
	 *
	 * @return Value of MinPhraseLength.
	 */
	public int getMinPhraseLength() {
		return m_MinPhraseLength;
	}
	
	/**
	 * Set the minimum phrase length.
	 *
	 * @param newMinPhraseLength Value to assign to MinPhraseLength.
	 */
	public void setMinPhraseLength(int newMinPhraseLength) {
		m_MinPhraseLength = newMinPhraseLength;
	}
	
	/**
	 * Tells whether internal periods are disallowed.
	 *
	 * @return Value of disallowIPeriods.
	 */
	public boolean getDisallowIPeriods() {
		return m_disallowIPeriods;
	}
	
	/**
	 * Set whether internal periods are disallowed.
	 *
	 * @param newdisallowIPeriods Value to assign to disallowIPeriods.
	 */
	public void setDisallowIPeriods(boolean newdisallowIPeriods) {
		m_disallowIPeriods = newdisallowIPeriods;
	}
	
	/**
	 * Tells whether the keyphrase frequency attribute is used.
	 *
	 * @return Value of useKFrequency.
	 */
	public boolean getUseKFrequency() {
		return m_useKFrequency;
	}
	
	/**
	 * Set whether the keyphrase frequency attribute is used.
	 *
	 * @param newuseKFrequency Value to assign to useKFrequency.
	 */
	public void setUseKFrequency(boolean newuseKFrequency) {
		m_useKFrequency = newuseKFrequency;
	}
	
	/**
	 * Tells whether debugging mode is on.
	 *
	 * @return Value of debug.
	 */
	public boolean getDebug() {
		return m_debug;
	}
	
	/**
	 * Turn debugging mode on or off.
	 *
	 * @param newdebug Value to assign to debug.
	 */
	public void setDebug(boolean newdebug) {
		m_debug = newdebug;
	}
	
	/**
	 * Get the encoding used to read documents and keyphrase files.
	 *
	 * @return Value of encoding ("default" means platform encoding).
	 */
	public String getEncoding() {
		return m_encoding;
	}
	
	/**
	 * Set the encoding used to read documents and keyphrase files.
	 *
	 * @param newencoding Value to assign to encoding.
	 */
	public void setEncoding(String newencoding) {
		m_encoding = newencoding;
	}
	
	/**
	 * Get the value of vocabulary name.
	 *
	 * @return Value of vocabulary name.
	 */
	public String getVocabulary() {
		return m_vocabulary;
	}
	
	/**
	 * Set the value of vocabulary name.
	 *
	 * @param newvocabulary Value to assign to vocabulary name.
	 */
	public void setVocabulary(String newvocabulary) {
		m_vocabulary = newvocabulary;
	}
	
	/**
	 * Get the value of document language.
	 *
	 * @return Value of document language.
	 */
	public String getDocumentLanguage() {
		return m_documentLanguage;
	}
	
	/**
	 * Set the value of document language.
	 *
	 * @param newdocumentLanguage Value to assign to document language.
	 */
	public void setDocumentLanguage(String newdocumentLanguage) {
		m_documentLanguage = newdocumentLanguage;
	}
	
	/**
	 * Get the value of vocabulary format.
	 *
	 * @return Value of vocabulary format.
	 */
	public String getVocabularyFormat() {
		return m_vocabularyFormat;
	}
	
	/**
	 * Set the value of vocabulary format.
	 *
	 * @param newvocabularyFormat Value to assign to vocabulary format.
	 */
	public void setVocabularyFormat(String newvocabularyFormat) {
		m_vocabularyFormat = newvocabularyFormat;
	}
	
	/**
	 * Get the name of the file the model is saved to.
	 *
	 * @return Value of modelName.
	 */
	public String getModelName() {
		return m_modelName;
	}
	
	/**
	 * Set the name of the file the model is saved to.
	 *
	 * @param newmodelName Value to assign to modelName.
	 */
	public void setModelName(String newmodelName) {
		m_modelName = newmodelName;
	}
	
	/**
	 * Get the name of the directory holding the training files.
	 *
	 * @return Value of dirName.
	 */
	public String getDirName() {
		return m_dirName;
	}
	
	/**
	 * Set the name of the directory holding the training files.
	 *
	 * @param newdirName Value to assign to dirName.
	 */
	public void setDirName(String newdirName) {
		m_dirName = newdirName;
	}
	
	/**
	 * Parses a given list of options controlling the behaviour of this object.
	 * Valid options are:<p>
	 *
	 * -l "directory name" <br>
	 * Specifies name of directory (required).<p>
	 *
	 * -m "model name" <br>
	 * Specifies name of model (required).<p>
	 *
	 * -v "vocabulary name" <br>
	 * Specifies vocabulary name (required).<p>
	 * 
	 * -f "vocabulary format" <br>
	 * Specifies vocabulary format ("skos" or "text"); required unless
	 * the vocabulary name is "none".<p>
	 *    
	 * -i "document language" <br>
	 * Specifies document language (default: en).<p>
	 * 
	 * -e "encoding" <br>
	 * Specifies encoding (default: platform encoding).<p>
	 * 
	 * -d<br>
	 * Turns debugging mode on.<p>
	 *
	 * -k<br>
	 * Use keyphrase frequency statistic.<p>
	 *
	 * -p<br>
	 * Disallow internal periods. <p>
	 *
	 * -x "length"<br>
	 * Sets maximum phrase length (default: 5).<p>
	 *
	 * -y "length"<br>
	 * Sets minimum phrase length (default: 1).<p>
	 *
	 * -o "number"<br>
	 * The minimum number of times a phrase needs to occur (default: 2). <p>
	 *
	 * -s "name of class implementing list of stop words"<br>
	 * Sets list of stop words to use (class in kea.stopwords; left
	 * unchanged when the option is absent).<p>
	 *
	 * -t "name of class implementing stemmer"<br>
	 * Sets stemmer to use (class in kea.stemmers; left unchanged when
	 * the option is absent). <p>
	 *
	 * -n<br>
	 * Do not check for proper nouns. <p>
	 *
	 * @param options the list of options as an array of strings
	 * @exception Exception if a required option is missing, a value is
	 *            invalid, or an option is not supported
	 */
	public void setOptions(String[] options) throws Exception {
		
		String dirName = Utils.getOption('l', options);
		if (dirName.length() > 0) {
			setDirName(dirName);
		} else {
			setDirName(null);
			throw new Exception("Name of directory required argument.");
		}
		
		String modelName = Utils.getOption('m', options);
		if (modelName.length() > 0) {
			setModelName(modelName);
		} else {
			setModelName(null);
			throw new Exception("Name of model required argument.");
		}
		
		String vocabularyName = Utils.getOption('v', options);
		if (vocabularyName.length() > 0) {
			setVocabulary(vocabularyName);
		} else {
			setVocabulary(null);
			throw new Exception("Name of vocabulary required argument.");
		}
		
		// -f is always consumed so that it never trips checkForRemainingOptions,
		// but it is only validated when a controlled vocabulary is in use.
		String vocabularyFormat = Utils.getOption('f', options);
		if (getVocabulary().equals("none")) {
			setVocabularyFormat(null);
		} else if (vocabularyFormat.length() == 0) {
			setVocabularyFormat(null);
			throw new Exception("If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
		} else if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
			setVocabularyFormat(vocabularyFormat);
		} else {
			throw new Exception("Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
		}
		
		String encoding = Utils.getOption('e', options);
		setEncoding(encoding.length() > 0 ? encoding : "default");
		
		String documentLanguage = Utils.getOption('i', options);
		setDocumentLanguage(documentLanguage.length() > 0 ? documentLanguage : "en");
		
		String maxPhraseLengthString = Utils.getOption('x', options);
		if (maxPhraseLengthString.length() > 0) {
			setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
		} else {
			setMaxPhraseLength(5);
		}
		
		String minPhraseLengthString = Utils.getOption('y', options);
		if (minPhraseLengthString.length() > 0) {
			setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
		} else {
			setMinPhraseLength(1);
		}
		
		String minNumOccurString = Utils.getOption('o', options);
		if (minNumOccurString.length() > 0) {
			setMinNumOccur(Integer.parseInt(minNumOccurString));
		} else {
			setMinNumOccur(2);
		}
		
		String stopwordsString = Utils.getOption('s', options);
		if (stopwordsString.length() > 0) {
			stopwordsString = STOPWORDS_PACKAGE.concat(stopwordsString);
			setStopwords((Stopwords)Class.forName(stopwordsString).newInstance());
		}
		
		String stemmerString = Utils.getOption('t', options);
		if (stemmerString.length() > 0) {
			stemmerString = STEMMERS_PACKAGE.concat(stemmerString);
			setStemmer((Stemmer)Class.forName(stemmerString).newInstance());
		}
		
		setDebug(Utils.getFlag('d', options));
		setUseKFrequency(Utils.getFlag('k', options));
		setDisallowIPeriods(Utils.getFlag('p', options));
		setCheckForProperNouns(!Utils.getFlag('n', options));
		Utils.checkForRemainingOptions(options);
	}
	
	/**
	 * Gets the current option settings.
	 *
	 * String-valued options whose value is null are omitted rather than
	 * written as the literal text "null". Stop word and stemmer classes
	 * are written relative to the package that {@link #setOptions(String[])}
	 * prepends, so the result can be fed back into setOptions. The -n flag
	 * is written when the proper noun check is <em>disabled</em>, matching
	 * how setOptions reads it.
	 *
	 * @return an array of strings suitable for passing to setOptions,
	 *         padded with empty strings to a fixed length of 26
	 */
	public String [] getOptions() {
		
		// 11 flag/value pairs (22) + 4 boolean flags = 26 slots at most.
		String [] options = new String [26];
		int current = 0;
		
		current = addOption(options, current, "-l", getDirName());
		current = addOption(options, current, "-m", getModelName());
		current = addOption(options, current, "-v", getVocabulary());
		current = addOption(options, current, "-f", getVocabularyFormat());
		current = addOption(options, current, "-e", getEncoding());
		current = addOption(options, current, "-i", getDocumentLanguage());
		
		if (getUseKFrequency()) {
			options[current++] = "-k";
		}
		if (getDebug()) {
			options[current++] = "-d";
		}
		if (getDisallowIPeriods()) {
			options[current++] = "-p";
		}
		
		current = addOption(options, current, "-x", String.valueOf(getMaxPhraseLength()));
		current = addOption(options, current, "-y", String.valueOf(getMinPhraseLength()));
		current = addOption(options, current, "-o", String.valueOf(getMinNumOccur()));
		
		Stopwords stopwords = getStopwords();
		if (stopwords != null) {
			current = addOption(options, current, "-s",
					relativeClassName(stopwords, STOPWORDS_PACKAGE));
		}
		Stemmer stemmer = getStemmer();
		if (stemmer != null) {
			current = addOption(options, current, "-t",
					relativeClassName(stemmer, STEMMERS_PACKAGE));
		}
		
		// "-n" means "do NOT check for proper nouns".
		if (!getCheckForProperNouns()) {
			options[current++] = "-n";
		}
		
		while (current < options.length) {
			options[current++] = "";
		}
		return options;
	}
	
	/**
	 * Appends a flag and its value to the option array unless the value is null.
	 *
	 * @return the next free index in the array
	 */
	private static int addOption(String[] options, int current, String flag, String value) {
		if (value == null) {
			return current;
		}
		options[current++] = flag;
		options[current++] = value;
		return current;
	}
	
	/**
	 * Returns the class name of the object with the given package prefix
	 * removed; classes outside that package keep their full name.
	 */
	private static String relativeClassName(Object object, String packagePrefix) {
		String className = object.getClass().getName();
		if (className.startsWith(packagePrefix)) {
			return className.substring(packagePrefix.length());
		}
		return className;
	}
	
	/**
	 * Returns an enumeration describing the available options.
	 *
	 * @return an enumeration of all the available options
	 */
	public Enumeration listOptions() {
		
		Vector newVector = new Vector(15);
		
		newVector.addElement(new Option(
				"\tSpecifies name of directory.",
				"l", 1, "-l <directory name>"));
		newVector.addElement(new Option(
				"\tSpecifies name of model.",
				"m", 1, "-m <model name>"));
		newVector.addElement(new Option(
				"\tSpecifies vocabulary name.",
				"v", 1, "-v <vocabulary name>"));
		newVector.addElement(new Option(
				"\tSpecifies vocabulary format (text or skos or none).",
				"f", 1, "-f <vocabulary format>"));
		newVector.addElement(new Option(
				"\tSpecifies document language (en (default), es, de, fr).",
				"i", 1, "-i <document language>"));
		newVector.addElement(new Option(
				"\tSpecifies encoding.",
				"e", 1, "-e <encoding>"));
		newVector.addElement(new Option(
				"\tTurns debugging mode on.",
				"d", 0, "-d"));
		newVector.addElement(new Option(
				"\tUse keyphrase frequency statistic.",
				"k", 0, "-k"));
		newVector.addElement(new Option(
				"\tDisallow internal periods.",
				"p", 0, "-p"));
		newVector.addElement(new Option(
				"\tSets the maximum phrase length (default: 5).",
				"x", 1, "-x <length>"));
		newVector.addElement(new Option(
				"\tSets the minimum phrase length (default: 1).",
				"y", 1, "-y <length>"));
		newVector.addElement(new Option(
				"\tSet the minimum number of occurences (default: 2).",
				"o", 1, "-o"));
		newVector.addElement(new Option(
				"\tSets the list of stopwords to use (default: StopwordsEnglish).",
				"s", 1, "-s <name of stopwords class>"));
		newVector.addElement(new Option(
				"\tSet the stemmer to use (default: SremovalStemmer).",
				"t", 1, "-t <name of stemmer class>"));
		newVector.addElement(new Option(
				"\tDo not check for proper nouns.",
				"n", 0, "-n"));
		
		return newVector.elements();
	}
	
	/**
	 * Collects the stems of the file names, i.e. the names of all ".txt"
	 * and ".key" files in the training directory with the four-character
	 * extension removed.
	 *
	 * @return a table whose keys are the stems; every value is a Double 0
	 * @throws Exception if the directory name is unset or the directory
	 *         cannot be listed
	 */
	public Hashtable collectStems() throws Exception {
		
		if (m_dirName == null) {
			throw new Exception("Problem opening directory " + m_dirName);
		}
		
		String[] files;
		try {
			files = new File(m_dirName).list();
		} catch (SecurityException e) {
			throw new Exception("Problem opening directory " + m_dirName, e);
		}
		// File.list() signals "not a directory" or an I/O error with null.
		if (files == null) {
			throw new Exception("Problem opening directory " + m_dirName);
		}
		
		Hashtable stems = new Hashtable();
		for (int i = 0; i < files.length; i++) {
			String fileName = files[i];
			if (fileName.endsWith(".key") || fileName.endsWith(".txt")) {
				String stem = fileName.substring(0, fileName.length() - 4);
				if (!stems.containsKey(stem)) {
					stems.put(stem, Double.valueOf(0.0));
				}
			}
		}
		return stems;
	}
	
	/**
	 * Builds the model from the files.
	 *
	 * For every stem the document ("stem.txt") and the keyphrases
	 * ("stem.key") are read from the training directory and fed to the
	 * KEA filter; a file that cannot be read becomes a missing value.
	 *
	 * @param stems table whose keys are the file name stems, as returned
	 *        by {@link #collectStems()}
	 * @param schema scheme providing the vocabulary name, the RDF path
	 *        (whose parent directory is handed to the H2 vocabulary) and
	 *        the manager
	 * @param stopwordsPath path passed to the filter's stop word loader
	 * @param manager not used by this method; the manager of
	 *        <code>schema</code> is used instead
	 * @throws Exception if there are no stems or the filter fails
	 */
	public void buildModel(Hashtable stems, SKOSScheme schema, String stopwordsPath, SesameManager manager) throws Exception {
		
		// Check whether there is actually any data before opening the vocabulary
		if (stems.size() == 0) {
			throw new Exception("Couldn't find any data!");
		}
		
		String h2path = new File(schema.getRdfPath()).getParentFile().getAbsolutePath();
		this.vocabulary = new VocabularyH2(schema.getName(), h2path, m_documentLanguage, schema.getManager());
		
		FastVector atts = new FastVector(2);
		atts.addElement(new Attribute("doc", (FastVector) null));
		atts.addElement(new Attribute("keyphrases", (FastVector) null));
		Instances data = new Instances("keyphrase_training_data", atts, 0);
		
		// Build model
		m_KEAFilter = new KEAFilter();
		m_stopwordsPath = stopwordsPath;
		m_KEAFilter.setStopwords(m_stopwordsPath);
		
		m_KEAFilter.setDebug(m_debug);
		m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
		m_KEAFilter.setKFused(getUseKFrequency());
		
		m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
		m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
		m_KEAFilter.setMinNumOccur(getMinNumOccur());
		m_KEAFilter.setStemmer(getStemmer());
		m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
		m_KEAFilter.setVocabulary(getVocabulary());
		m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
		// NOTE(review): this replaces the stop words loaded from stopwordsPath
		// above with getStopwords(), which is null unless a setStopwords
		// method or -s was used -- confirm that KEAFilter tolerates null.
		m_KEAFilter.setStopwords(getStopwords());
		m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
		m_KEAFilter.setInputFormat(data);
		
		// The thesaurus is loaded unconditionally, also for vocabulary "none".
		m_KEAFilter.loadThesaurus(getStemmer(),getStopwords(),this.vocabulary);
		m_KEAFilter.setNumFeature();
		
		logger.info("-- Reading the documents");
		
		Enumeration elem = stems.keys();
		while (elem.hasMoreElements()) {
			String str = (String)elem.nextElement();
			
			double[] newInst = new double[2];
			newInst[0] = readStringAttribute(data, 0,
					new File(m_dirName + "/" + str + ".txt"),
					"Can't find document for stem " + str + ".");
			newInst[1] = readStringAttribute(data, 1,
					new File(m_dirName + "/" + str + ".key"),
					"Can't find keyphrases for stem " + str + ".");
			
			// Push one instance at a time through the filter, then drop the
			// string values again so the data set does not keep growing.
			data.add(new Instance(1.0, newInst));
			m_KEAFilter.input(data.instance(0));
			data = data.stringFreeStructure();
		}
		m_KEAFilter.batchFinished(this.vocabulary);
		
		// Drain the filter's output queue; the instances themselves are not needed.
		while ((m_KEAFilter.output()) != null) {};
	}
	
	/**
	 * Reads a file and registers its content as a value of the given
	 * string attribute. Failure to read is deliberately non-fatal: it is
	 * reported on stderr in debug mode and yields a missing value.
	 *
	 * @return the index of the added string value, or the missing value marker
	 */
	private double readStringAttribute(Instances data, int attIndex, File file, String failureMessage) {
		try {
			return (double)data.attribute(attIndex).addStringValue(readFile(file));
		} catch (Exception e) {
			if (m_debug) {
				System.err.println(failureMessage);
			}
			logger.debug(failureMessage, e);
			return Instance.missingValue();
		}
	}
	
	/**
	 * Reads a whole file into a string using the configured encoding
	 * ("default" selects the platform encoding). The underlying stream is
	 * always closed, also when reading fails.
	 */
	private String readFile(File file) throws Exception {
		FileInputStream in = new FileInputStream(file);
		try {
			InputStreamReader reader;
			if (!m_encoding.equals("default")) {
				reader = new InputStreamReader(in, m_encoding);
			} else {
				reader = new InputStreamReader(in);
			}
			StringBuffer text = new StringBuffer();
			char[] buffer = new char[8192];
			int count;
			while ((count = reader.read(buffer)) != -1) {
				text.append(buffer, 0, count);
			}
			return text.toString();
		} finally {
			in.close();
		}
	}
	
	/** 
	 * Saves the extraction model to the file named by the model name.
	 *
	 * @throws IllegalStateException if no model name is set or no model
	 *         has been built yet
	 * @throws Exception if the file cannot be written
	 */
	public void saveModel() throws Exception {
		
		if (m_modelName == null) {
			throw new IllegalStateException("No model name set; cannot save the model.");
		}
		if (m_KEAFilter == null) {
			throw new IllegalStateException("No model has been built; call buildModel first.");
		}
		
		FileOutputStream fileOut = new FileOutputStream(m_modelName);
		ObjectOutputStream out = null;
		try {
			out = new ObjectOutputStream(new BufferedOutputStream(fileOut));
			out.writeObject(m_KEAFilter);
			out.flush();
		} finally {
			// Closing the outermost stream closes the whole chain; fall back to
			// the file stream if the object stream could not be created.
			if (out != null) {
				out.close();
			} else {
				fileOut.close();
			}
		}
	}
	
	/**
	 * The main method. Builds and saves a model for the hard-coded "nbii"
	 * vocabulary; the options are parsed with {@link #setOptions(String[])},
	 * after which the training directory is replaced by the hard-coded one.
	 *
	 * @param ops command line options, see {@link #setOptions(String[])}
	 * @throws RepositoryException 
	 */
	public static void main(String[] ops) throws RepositoryException {
		
		String trainDir = "/home/hive/hive-data/nbii/nbiiKEA/train";
		String stopwordsPath = "/home/hive/hive-data/nbii/nbiiKEA/data/stopwords/stopwords_en.txt";
		
		String confPath = "/home/hive/workspace/hive-core/conf/";
		String vocabularyName = "nbii";
		
		try
		{
			SKOSScheme schema = new SKOSSchemeImpl(confPath, vocabularyName, false);

			KEAModelBuilder kmb = new KEAModelBuilder(schema);
			try {
				kmb.setOptions(ops);
				// The hard-coded training directory wins over the -l option.
				kmb.setDirName(trainDir);
				System.err.print("Building model with options: ");
				String[] optionSettings = kmb.getOptions();
				for (int i = 0; i < optionSettings.length; i++) {
					System.err.print(optionSettings[i] + " ");
				}
				System.err.println();
				kmb.buildModel(kmb.collectStems(),schema,stopwordsPath,null);
				kmb.saveModel();
			} catch (Exception e) {
				e.printStackTrace();
				System.err.println(e.getMessage());
				System.err.println("\nOptions:\n");
				Enumeration en = kmb.listOptions();
				while (en.hasMoreElements()) {
					Option option = (Option) en.nextElement();
					System.err.println(option.synopsis());
					System.err.println(option.description());
				}
			}
		} catch (HiveException e) {
			e.printStackTrace();
		}
	}
}