/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.kea.filter;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import weka.classifiers.Classifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.meta.RegressionByDiscretization;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
import com.openkm.kea.stemmers.SremovalStemmer;
import com.openkm.kea.stemmers.Stemmer;
import com.openkm.kea.stopwords.Stopwords;
import com.openkm.kea.stopwords.StopwordsEnglish;
import com.openkm.kea.util.Counter;
import com.openkm.kea.vocab.Vocabulary;
/**
* This filter converts the incoming data into data appropriate for
* keyphrase classification. It assumes that the dataset contains two
* string attributes. The first attribute should contain the text of a
* document. The second attribute should contain the keyphrases
* associated with that document (if present).
*
* The filter converts every instance (i.e. document) into a set of
* instances, one for each word-based n-gram in the document. The
* string attribute representing the document is replaced by some
* numeric features, the estimated probability of each n-gram being a
* keyphrase, and the rank of this phrase in the document according to
* the probability. Each new instances also has a class value
* associated with it. The class is "true" if the n-gram is a true
* keyphrase, and "false" otherwise. Of course, if the input document
* doesn't come with author-assigned keyphrases, the class values for
* that document will be missing.
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz), Olena Medelyan (olena@cs.waikato.ac.nz)
* @version 2.0
*/
public class KEAFilter extends Filter implements OptionHandler {
/** Logger used for the debug/progress output of this filter. */
private static Logger log = LoggerFactory.getLogger(KEAFilter.class);
/** Serialization version identifier. */
private static final long serialVersionUID = 1L;
/** Index of the attribute containing the documents. */
private int m_DocumentAtt = 0;
/** Index of the attribute containing the keyphrases. */
private int m_KeyphrasesAtt = 1;
/** The maximum length of phrases. Note that setOptions() resets this to 3 when -M is absent. */
private int m_MaxPhraseLength = 5;
/** The minimum length of phrases. */
private int m_MinPhraseLength = 1;
/** The number of phrases to extract. */
private int m_numPhrases = 10;
/**
* Experimental!
* Number of human indexers (times a keyphrase appears in the keyphrase set).
* Adjust manually for more than one indexer. The value is used as the divisor
* of the class value in featVals() and, plus one, as the number of bins of the
* classifier in buildClassifier().
*/
private int m_Indexers = 1;
/** Should non-descriptors be replaced by corresponding descriptors? */
private boolean m_DESCRreplace = true;
/** Is the node degree (number of related terms in candidate set) being used? */
public boolean m_NODEfeature = true;
/** Is the length of a phrase in words being used? */
private boolean m_LENGTHfeature = true;
/**
* Experimental feature!
* Should the standard deviation of the positions of phrase occurrences be considered?
* If set to true, the indices of the features need to be adjusted in the code manually!
*/
private boolean m_STDEVfeature = false;
/**
* Experimental feature!
* Is the keyphrase frequency attribute being used?
* If set to true, adjust the indices in the code!
* Don't combine the other optional features with m_KFused unless the indices below are adjusted.
*/
private boolean m_KFused = false;
/** Flag for debugging mode. */
private boolean m_Debug = false;
/** Determines whether internal periods are disallowed. */
private boolean m_DisallowInternalPeriods = false;
/** The minimum number of occurrences of a phrase. */
private int m_MinNumOccur = 2;
/**
* The number of features describing a phrase. Starts at 2 (TFxIDF and first
* occurrence) and is raised by setKFused(true) and setNumFeature(). It is also
* used as the index of the class value in the classifier data.
*/
private int m_NumFeatures = 2;
/** Indices of the attributes in m_ClassifierData. */
private int m_TfidfIndex = 0;
private int m_FirstOccurIndex = 1;
/**
* Indices of the attributes for the optional features.
* NOTE(review): m_NodeIndex and m_KeyFreqIndex share the value 3, and
* m_STDEVIndex (4) equals the class position when m_NumFeatures is 4 - these
* have to be adjusted by hand when the experimental features are switched on.
*/
private int m_LengthIndex = 2;// adjust!!
private int m_NodeIndex = 3; // decrease if the length feature above is removed
private int m_STDEVIndex = 4; // adjust!!
private int m_KeyFreqIndex = 3;
/** The punctuation filter used by this filter. */
private KEAPhraseFilter m_PunctFilter = null;
/** The numbers filter used by this filter (only created when the vocabulary is "none"). */
private NumbersFilter m_NumbersFilter = null;
/** The actual classifier used to compute probabilities. */
private Classifier m_Classifier = null;
/** The dictionary containing the document frequencies; null until the global dictionaries are built. */
public HashMap<String, Counter> m_Dictionary = null;
/** The dictionary containing the keyphrases; only built when m_KFused is set. */
private HashMap<String, Counter> m_KeyphraseDictionary = null;
/** The number of documents in the global frequencies corpus. */
private int m_NumDocs = 0;
/** Template for the classifier data. */
private Instances m_ClassifierData = null;
/** The default stemmer to be used. */
private Stemmer m_Stemmer = new SremovalStemmer();
/** The list of stop words to be used; supplied through the constructor or setStopwords(). */
private Stopwords m_Stopwords;
/** The default language to be used. */
private String m_documentLanguage = "en";
/**
* Creates a filter that uses the given list of stop words.
*
* @param m_Stopwords the stop word list to store in this filter
*/
public KEAFilter(Stopwords m_Stopwords) {
// The parameter shadows the field of the same name, hence the explicit "this".
this.m_Stopwords = m_Stopwords;
}
/**
* The Vocabulary object.
* Originally static; changed to non-static so that multiple filters can be
* running. Assigned by loadThesaurus() and reset by clearVocabulary().
*/
public Vocabulary m_Vocabulary;
/**
* New method to set the Vocabulary to null
*/
public void clearVocabulary() {
    // Drop the reference to the loaded vocabulary object.
    this.m_Vocabulary = null;
}
/** The name of the vocabulary; the value "none" switches this filter to its vocabulary-free code paths. */
private String m_vocabulary = "agrovoc";
/** The format of the vocabulary; passed to the Vocabulary constructor in loadThesaurus(). */
private String m_vocabularyFormat = "skos";
/**
* Get the M_Vocabulary value.
* @return the M_Vocabulary value.
*/
public String getVocabulary() {
    // This is the vocabulary *name* (a String), not the Vocabulary object.
    return this.m_vocabulary;
}
/**
* Set the M_Vocabulary value.
* @param newM_Vocabulary The new M_Vocabulary value.
*/
public void setVocabulary(String newM_Vocabulary) {
    // Only the name is stored here; loadThesaurus() builds the Vocabulary object.
    m_vocabulary = newM_Vocabulary;
}
/**
* Get the M_VocabularyFormat value.
* @return the M_VocabularyFormat value.
*/
public String getVocabularyFormat() {
    // Format identifier of the vocabulary (the field defaults to "skos").
    return this.m_vocabularyFormat;
}
/**
* Set the M_VocabularyFormat value.
* @param newM_VocabularyFormat The new M_VocabularyFormat value.
*/
public void setVocabularyFormat(String newM_VocabularyFormat) {
    // Stored as given; no validation is performed here.
    m_vocabularyFormat = newM_VocabularyFormat;
}
/**
* Get the M_documentLanguage value.
* @return the M_documentLanguage value.
*/
public String getDocumentLanguage() {
    // Language code of the documents (the field defaults to "en").
    return this.m_documentLanguage;
}
/**
* Set the M_documentLanguage value.
* @param newM_documentLanguage The new M_documentLanguage value.
*/
public void setDocumentLanguage(String newM_documentLanguage) {
    // Stored as given; it is handed to the Vocabulary in loadThesaurus().
    m_documentLanguage = newM_documentLanguage;
}
/** Determines whether the check for proper nouns is performed. */
private boolean m_CheckForProperNouns = true;
/**
* Get the M_CheckProperNouns value.
* @return the M_CheckProperNouns value.
*/
public boolean getCheckForProperNouns() {
    // True when the proper-noun check is switched on.
    return this.m_CheckForProperNouns;
}
/**
* Set the M_CheckProperNouns value.
* @param newM_CheckProperNouns The new M_CheckProperNouns value.
*/
public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
    // Switches the proper-noun check on or off.
    m_CheckForProperNouns = newM_CheckProperNouns;
}
/**
* Get the M_Stopwords value.
* @return the M_Stopwords value.
*/
public Stopwords getStopwords() {
    // The stop word list currently held by this filter.
    return this.m_Stopwords;
}
/**
* Set the M_Stopwords value.
* @param newM_Stopwords The new M_Stopwords value.
*/
public void setStopwords(Stopwords newM_Stopwords) {
    // Replaces the stop word list that was passed to the constructor.
    m_Stopwords = newM_Stopwords;
}
/**
* Get the Stemmer value.
* @return the Stemmer value.
*/
public Stemmer getStemmer() {
    // The stemmer currently held by this filter (an SremovalStemmer unless replaced).
    return this.m_Stemmer;
}
/**
* Set the Stemmer value.
* @param newStemmer The new Stemmer value.
*/
public void setStemmer(Stemmer newStemmer) {
    // Replaces the default stemmer.
    m_Stemmer = newStemmer;
}
/**
* Get the value of MinNumOccur.
*
* @return Value of MinNumOccur.
*/
public int getMinNumOccur() {
    // Minimum number of occurrences of a phrase.
    return this.m_MinNumOccur;
}
/**
* Set the value of MinNumOccur.
*
* @param newMinNumOccur Value to assign to MinNumOccur.
*/
public void setMinNumOccur(int newMinNumOccur) {
    // Stored as given; no range check is performed.
    this.m_MinNumOccur = newMinNumOccur;
}
/**
* Get the value of MaxPhraseLength.
*
* @return Value of MaxPhraseLength.
*/
public int getMaxPhraseLength() {
    // Upper bound on the length of candidate phrases.
    return this.m_MaxPhraseLength;
}
/**
* Set the value of MaxPhraseLength.
*
* @param newMaxPhraseLength Value to assign to MaxPhraseLength.
*/
public void setMaxPhraseLength(int newMaxPhraseLength) {
    // Stored as given; not checked against the minimum phrase length.
    this.m_MaxPhraseLength = newMaxPhraseLength;
}
/**
* Get the value of MinPhraseLength.
*
* @return Value of MinPhraseLength.
*/
public int getMinPhraseLength() {
    // Lower bound on the length of candidate phrases.
    return this.m_MinPhraseLength;
}
/**
* Set the value of MinPhraseLength.
*
* @param newMinPhraseLength Value to assign to MinPhraseLength.
*/
public void setMinPhraseLength(int newMinPhraseLength) {
    // Stored as given; not checked against the maximum phrase length.
    this.m_MinPhraseLength = newMinPhraseLength;
}
/**
* Get the value of numPhrases.
*
* @return Value of numPhrases.
*/
public int getNumPhrases() {
    // Number of phrases to extract.
    return this.m_numPhrases;
}
/**
* Set the value of numPhrases.
*
* @param newnumPhrases Value to assign to numPhrases.
*/
public void setNumPhrases(int newnumPhrases) {
    // Stored as given; no range check is performed.
    this.m_numPhrases = newnumPhrases;
}
/**
* Returns the index of the stemmed phrases in the output ARFF file.
*/
public int getStemmedPhraseIndex() {
    // The first output column created for the document attribute ("N-gram")
    // sits at the document attribute's own position.
    final int stemmedIndex = m_DocumentAtt;
    return stemmedIndex;
}
/**
* Returns the index of the unstemmed phrases in the output ARFF file.
*/
public int getUnstemmedPhraseIndex() {
    // "N-gram-original" directly follows the "N-gram" column.
    final int stemmedIndex = m_DocumentAtt;
    return stemmedIndex + 1;
}
/**
* Returns the index of the phrases' probabilities in the output ARFF file.
*/
public int getProbabilityIndex() {
    // Four columns always precede the probability: N-gram, N-gram-original,
    // TFxIDF and First_occurrence. Each optional column that
    // convertPendingInstances() emits pushes the probability one further right.
    int columnsBefore = 4;
    // The keyphrase frequency column is only written in debug mode.
    if (m_Debug && m_KFused) {
        columnsBefore++;
    }
    if (m_STDEVfeature) {
        columnsBefore++;
    }
    if (m_NODEfeature) {
        columnsBefore++;
    }
    if (m_LENGTHfeature) {
        columnsBefore++;
    }
    return m_DocumentAtt + columnsBefore;
}
/**
* Returns the index of the phrases' ranks in the output ARFF file.
*/
public int getRankIndex() {
    // The rank column directly follows the probability column.
    final int probabilityIndex = getProbabilityIndex();
    return probabilityIndex + 1;
}
/**
* Get the value of DocumentAtt.
*
* @return Value of DocumentAtt.
*/
public int getDocumentAtt() {
    // Zero-based index of the document attribute.
    return this.m_DocumentAtt;
}
/**
* Set the value of DocumentAtt.
*
* @param newDocumentAtt Value to assign to DocumentAtt.
*/
public void setDocumentAtt(int newDocumentAtt) {
    // Expects a zero-based index (setOptions() converts from the 1-based option value).
    this.m_DocumentAtt = newDocumentAtt;
}
/**
* Get the value of KeyphraseAtt.
*
* @return Value of KeyphraseAtt.
*/
public int getKeyphrasesAtt() {
    // Zero-based index of the keyphrase attribute.
    return this.m_KeyphrasesAtt;
}
/**
* Set the value of KeyphrasesAtt.
*
* @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
*/
public void setKeyphrasesAtt(int newKeyphrasesAtt) {
    // Expects a zero-based index (setOptions() converts from the 1-based option value).
    this.m_KeyphrasesAtt = newKeyphrasesAtt;
}
/**
* Get the value of Debug.
*
* @return Value of Debug.
*/
public boolean getDebug() {
    // True when debugging mode is on.
    return this.m_Debug;
}
/**
* Set the value of Debug.
*
* @param newDebug Value to assign to Debug.
*/
public void setDebug(boolean newDebug) {
    // Besides the extra log output, debug mode also influences the output
    // columns (see getProbabilityIndex()).
    this.m_Debug = newDebug;
}
/**
* Sets whether keyphrase frequency attribute is used.
*/
public void setKFused(boolean flag) {
    // The keyphrase frequency feature occupies one slot in m_NumFeatures.
    // Adjust the count only when the flag really changes: the previous code
    // added one on every call with "true" (e.g. when setOptions() ran twice)
    // and never took it back when the feature was switched off again, which
    // left m_NumFeatures - and with it the class index - out of step.
    if (flag != m_KFused) {
        m_NumFeatures += flag ? 1 : -1;
    }
    m_KFused = flag;
}
/**
* Updates the number of features to account for the optional features that
* are switched on (standard deviation, node degree and phrase length).
*/
public void setNumFeature() {
    // Recompute the feature count from scratch instead of adding to the
    // running total, so that calling this method more than once cannot
    // inflate m_NumFeatures. For a single call the result is the same as
    // before: the two mandatory features (TFxIDF and first occurrence) plus
    // one per optional feature that is switched on.
    int count = 2;
    if (m_KFused) {
        // Already counted by setKFused(true); keep it in the total.
        count++;
    }
    if (m_STDEVfeature) {
        count++;
    }
    if (m_NODEfeature) {
        count++;
    }
    if (m_LENGTHfeature) {
        count++;
    }
    m_NumFeatures = count;
}
/**
* Gets whether keyphrase frequency attribute is used.
*/
public boolean getKFused() {
    // True when the keyphrase frequency feature is switched on.
    return this.m_KFused;
}
/**
* Gets whether internal periods are disallowed.
*
* @return true if internal periods are disallowed
*/
public boolean getDisallowInternalPeriods() {
    // Forwarded to the punctuation filter in setInputFormat().
    return this.m_DisallowInternalPeriods;
}
/**
* Sets whether internal periods are disallowed.
*
* @param disallow true if internal periods are to be disallowed
*/
public void setDisallowInternalPeriods(boolean disallow) {
    // Takes effect on the next setInputFormat() call, which hands the
    // setting to the punctuation filter.
    this.m_DisallowInternalPeriods = disallow;
}
public void loadThesaurus(Stemmer st, Stopwords sw) {
    // Creates the Vocabulary for the configured name, format and document
    // language, equips it with the given stemmer and stop word list, and
    // initializes it.
    //   st - stemmer handed to the vocabulary
    //   sw - stop word list handed to the vocabulary
    m_Vocabulary = new Vocabulary(m_vocabulary, m_vocabularyFormat, m_documentLanguage);
    m_Vocabulary.setStemmer(st);
    m_Vocabulary.setStopwords(sw);
    m_Vocabulary.initialize();
    try {
        // buildUSE() is only needed when non-descriptors are replaced by
        // descriptors, buildREL() only when the node degree feature is on.
        if (m_DESCRreplace) {
            m_Vocabulary.buildUSE();
        }
        if (m_NODEfeature) {
            m_Vocabulary.buildREL();
        }
    } catch (Exception e) {
        // Still best-effort (the filter carries on as before), but report the
        // failure through the class logger instead of printStackTrace().
        log.error("Failed to build the indexes of vocabulary '" + m_vocabulary + "'", e);
    }
}
/**
* Parses a given list of options controlling the behaviour of this object.
* Valid options are:<p>
*
* -K<br>
* Specifies whether keyphrase frequency statistic is used.<p>
*
* -R<br>
* Specifies whether Vocabulary relation statistic is used.<p>
*
* -M length<br>
* Sets the maximum phrase length (default when the option is omitted: 3).<p>
*
* -L length<br>
* Sets the minimum phrase length (default: 1).<p>
*
* -D<br>
* Turns debugging mode on.<p>
*
* -I index<br>
* Sets the index of the attribute containing the documents (default: 0).<p>
*
* -J index<br>
* Sets the index of the attribute containing the keyphrases (default: 1).<p>
*
* -P<br>
* Disallow internal periods <p>
*
* -O number<br>
* The minimum number of times a phrase needs to occur (default: 2). <p>
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
    // Flags without an argument.
    setKFused(Utils.getFlag('K', options));
    setDebug(Utils.getFlag('D', options));

    // Attribute indices are 1-based in the options and 0-based internally.
    String docAttIndexString = Utils.getOption('I', options);
    if (docAttIndexString.length() > 0) {
        setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
    } else {
        setDocumentAtt(0);
    }
    String keyphraseAttIndexString = Utils.getOption('J', options);
    if (keyphraseAttIndexString.length() > 0) {
        setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
    } else {
        setKeyphrasesAtt(1);
    }

    // -M: maximum phrase length (3 when the option is absent).
    String maxPhraseLengthString = Utils.getOption('M', options);
    if (maxPhraseLengthString.length() > 0) {
        setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
    } else {
        setMaxPhraseLength(3);
    }

    // -L: minimum phrase length (1 when the option is absent). This used to
    // query 'M' a second time, so a supplied -L was never read and the
    // minimum length was always reset to 1; getOptions() and listOptions()
    // both advertise -L for this setting.
    String minPhraseLengthString = Utils.getOption('L', options);
    if (minPhraseLengthString.length() > 0) {
        setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
    } else {
        setMinPhraseLength(1);
    }

    // -O: minimum number of occurrences of a phrase (2 when absent).
    String minNumOccurString = Utils.getOption('O', options);
    if (minNumOccurString.length() > 0) {
        setMinNumOccur(Integer.parseInt(minNumOccurString));
    } else {
        setMinNumOccur(2);
    }

    setDisallowInternalPeriods(Utils.getFlag('P', options));
}
/**
* Gets the current settings of the filter.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
    // The result always has 13 entries: three optional flags plus five
    // option/value pairs; unused trailing slots are filled with "".
    final int slots = 13;
    ArrayList<String> settings = new ArrayList<String>(slots);
    if (getKFused()) {
        settings.add("-K");
    }
    if (getDebug()) {
        settings.add("-D");
    }
    // Attribute indices are reported 1-based, mirroring setOptions().
    settings.add("-I");
    settings.add(String.valueOf(getDocumentAtt() + 1));
    settings.add("-J");
    settings.add(String.valueOf(getKeyphrasesAtt() + 1));
    settings.add("-M");
    settings.add(String.valueOf(getMaxPhraseLength()));
    settings.add("-L");
    settings.add(String.valueOf(getMinPhraseLength()));
    settings.add("-O");
    settings.add(String.valueOf(getMinNumOccur()));
    if (getDisallowInternalPeriods()) {
        settings.add("-P");
    }
    while (settings.size() < slots) {
        settings.add("");
    }
    return settings.toArray(new String[settings.size()]);
}
/**
* Returns an enumeration describing the available options
*
* @return an enumeration of all the available options
*/
public Enumeration<Option> listOptions() {
    // One Option per supported switch, in the order they are listed to the user.
    Vector<Option> available = new Vector<Option>(7);
    available.add(new Option("\tSpecifies whether keyphrase frequency statistic is used.",
            "K", 0, "-K"));
    available.add(new Option("\tSets the maximum phrase length (default: 3).",
            "M", 1, "-M <length>"));
    available.add(new Option("\tSets the minimum phrase length (default: 1).",
            "L", 1, "-L <length>"));
    available.add(new Option("\tTurns debugging mode on.",
            "D", 0, "-D"));
    available.add(new Option("\tSets the index of the document attribute (default: 0).",
            "I", 1, "-I"));
    available.add(new Option("\tSets the index of the keyphrase attribute (default: 1).",
            "J", 1, "-J"));
    available.add(new Option("\tDisallow internal periods.",
            "P", 0, "-P"));
    available.add(new Option("\tSet the minimum number of occurences (default: 2).",
            "O", 1, "-O"));
    return available.elements();
}
/**
* Returns a string describing this filter
*
* @return a description of the filter suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
    // Short description of the filter.
    final String description =
            "Converts incoming data into data appropriate for keyphrase classification.";
    return description;
}
/**
* Sets the format of the input instances.
*
* @param instanceInfo an Instances object containing the input
* instance structure (any instances contained in the object are
* ignored - only the structure is required).
* @return true if the outputFormat may be collected immediately
*/
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    // A class attribute must not be set on the incoming data.
    if (instanceInfo.classIndex() >= 0) {
        throw new Exception("Don't know what do to if class index set!");
    }
    // Both the keyphrase and the document attribute have to be strings.
    if (!(instanceInfo.attribute(m_KeyphrasesAtt).isString()
            && instanceInfo.attribute(m_DocumentAtt).isString())) {
        throw new Exception("Keyphrase attribute and document attribute " +
                "need to be string attributes.");
    }

    // The punctuation filter works on the document attribute only.
    m_PunctFilter = new KEAPhraseFilter();
    m_PunctFilter.setAttributeIndicesArray(new int[] { m_DocumentAtt });
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());

    // Without a vocabulary ("none") the numbers filter is chained behind the
    // punctuation filter, and its output format becomes this filter's input.
    final boolean withoutVocabulary = m_vocabulary.equals("none");
    if (!withoutVocabulary) {
        super.setInputFormat(m_PunctFilter.getOutputFormat());
    } else {
        m_NumbersFilter = new NumbersFilter();
        m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
        super.setInputFormat(m_NumbersFilter.getOutputFormat());
    }
    // The output format is never available immediately.
    return false;
}
/**
* Returns the Capabilities of this filter.
*
* @return the capabilities of this object
* @see Capabilities
*/
public Capabilities getCapabilities() {
    // Start from the capabilities of the superclass and widen them.
    Capabilities caps = super.getCapabilities();

    // Attributes: everything, including missing values.
    caps.enableAllAttributes();
    caps.enable(Capability.MISSING_VALUES);

    // Class: nominal, absent, and finally every class type.
    caps.enable(Capability.NOMINAL_CLASS);
    caps.enable(Capability.NO_CLASS);
    caps.enableAllClasses();

    return caps;
}
/**
* Input an instance for filtering. Ordinarily the instance is processed
* and made available for output immediately. Some filters require all
* instances be read before producing output.
*
* @param instance the input instance
* @return true if the filtered instance may now be
* collected with output().
* @exception Exception if the input instance was not of the correct
* format or if there was a problem with the filtering.
*/
public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
        throw new Exception("No input instance format defined");
    }
    // The first instance of a new batch clears the output queue.
    if (m_NewBatch) {
        resetQueue();
        m_NewBatch = false;
    }
    if (m_Debug) {
        log.info("-- Reading instance");
    }

    // Run the instance through the punctuation filter ...
    m_PunctFilter.input(instance);
    m_PunctFilter.batchFinished();
    Instance cleaned = m_PunctFilter.output();

    // ... and, when no vocabulary is used, through the numbers filter as well.
    if (m_vocabulary.equals("none")) {
        m_NumbersFilter.input(cleaned);
        m_NumbersFilter.batchFinished();
        cleaned = m_NumbersFilter.output();
    }

    // Until the global dictionary exists, instances are only buffered.
    if (m_Dictionary == null) {
        bufferInput(cleaned);
        return false;
    }

    // Otherwise convert right away and queue every resulting instance.
    FastVector converted = convertInstance(cleaned, false);
    for (int k = 0; k < converted.size(); k++) {
        push((Instance) converted.elementAt(k));
    }
    return true;
}
/**
* Signify that this batch of input to the filter is finished.
* If the filter requires all instances prior to filtering,
* output() may now be called to retrieve the filtered instances.
*
* @return true if there are instances pending output
* @exception Exception if no input structure has been defined
*/
public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
        throw new Exception("No input instance format defined");
    }
    // A missing dictionary means the buffered instances are the training
    // data: build dictionaries and classifier, then convert what was buffered.
    final boolean dictionaryMissing = (m_Dictionary == null);
    if (dictionaryMissing) {
        buildGlobalDictionaries();
        buildClassifier();
        convertPendingInstances();
    }
    flushInput();
    m_NewBatch = true;
    final int pending = numPendingOutput();
    return pending != 0;
}
/**
* Builds the global dictionaries.
*/
public void buildGlobalDictionaries() throws Exception {
    if (m_Debug) {
        log.info("--- Building global dictionaries");
    }

    // Document frequencies: one Counter per phrase, created on the first
    // document that contains the phrase and incremented for each further one.
    m_Dictionary = new HashMap<String, Counter>();
    for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
        String text = getInputFormat().instance(doc).stringValue(m_DocumentAtt);
        HashMap<String, Counter> documentPhrases = getPhrasesForDictionary(text);
        for (String phrase : documentPhrases.keySet()) {
            Counter seen = m_Dictionary.get(phrase);
            if (seen != null) {
                seen.increment();
            } else {
                m_Dictionary.put(phrase, new Counter());
            }
        }
    }

    // Keyphrase frequencies are only collected when that feature is in use.
    if (!m_KFused) {
        m_KeyphraseDictionary = null;
    } else {
        if (m_Debug) {
            log.info("KF_used feature");
        }
        m_KeyphraseDictionary = new HashMap<String, Counter>();
        for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
            String given = getInputFormat().instance(doc).stringValue(m_KeyphrasesAtt);
            HashMap<String, Counter> givenPhrases = getGivenKeyphrases(given, false);
            if (givenPhrases == null) {
                continue;
            }
            for (String phrase : givenPhrases.keySet()) {
                Counter seen = m_KeyphraseDictionary.get(phrase);
                if (seen != null) {
                    seen.increment();
                } else {
                    m_KeyphraseDictionary.put(phrase, new Counter());
                }
            }
        }
    }

    // Remember the size of the corpus the frequencies were taken from.
    m_NumDocs = getInputFormat().numInstances();
}
/**
* Builds the classifier.
*/
// aly: The main function, where everything important happens
private void buildClassifier() throws Exception {
// Builds the training data for the phrase classifier from the buffered
// input documents (one training instance per candidate phrase) and then
// trains m_Classifier on it. Expects m_Dictionary to have been built
// already, because featVals() reads it.
// Generate input format for classifier
FastVector atts = new FastVector();
for (int i = 0; i < getInputFormat().numAttributes(); i++) {
if (i == m_DocumentAtt) {
// The document attribute is replaced by the numeric phrase features.
atts.addElement(new Attribute("TFxIDF"));
atts.addElement(new Attribute("First_occurrence"));
if (m_KFused) {
atts.addElement(new Attribute("Keyphrase_frequency"));
}
if (m_STDEVfeature) {
atts.addElement(new Attribute("Standard_deviation"));
}
// NOTE(review): "Relations_number" is added before "Phrase_length", but
// featVals() writes the length at m_LengthIndex (2) and the node degree at
// m_NodeIndex (3), so with the default settings the column names look
// swapped relative to the values - confirm before relying on the names.
if (m_NODEfeature) {
atts.addElement(new Attribute("Relations_number"));
}
if (m_LENGTHfeature) {
atts.addElement(new Attribute("Phrase_length"));
}
} else if (i == m_KeyphrasesAtt) {
// The keyphrase attribute becomes the (numeric) class attribute.
// "vals" is only needed by the commented-out nominal variant below.
FastVector vals = new FastVector(2);
vals.addElement("False");
vals.addElement("True");
//atts.addElement(new Attribute("Keyphrase?", vals));
atts.addElement(new Attribute("Keyphrase?"));
}
// Any other input attribute is left out of the classifier data.
}
m_ClassifierData = new Instances("ClassifierData", atts, 0);
// The class value is stored at position m_NumFeatures (see featVals()).
m_ClassifierData.setClassIndex(m_NumFeatures);
if (m_Debug) {
log.info("--- Converting instances for classifier");
}
// Convert pending input instances into data for classifier
for(int i = 0; i < getInputFormat().numInstances(); i++) {
Instance current = getInputFormat().instance(i);
// Get the key phrases for the document
String keyphrases = current.stringValue(m_KeyphrasesAtt);
HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases, false);
HashMap<String, Counter> hashKeysEval = getGivenKeyphrases(keyphrases, true);
// Get the phrases for the document; getPhrases() fills "hash" and its
// return value is used by featVals() as the document length.
HashMap<String,FastVector> hash = new HashMap<String,FastVector>();
int length = getPhrases(hash, current.stringValue(m_DocumentAtt));
// hash = getComposits(hash);
// Compute the feature values for each phrase and
// add the instance to the data for the classifier
Iterator<String> it = hash.keySet().iterator();
while (it.hasNext()) {
String phrase = it.next();
FastVector phraseInfo = (FastVector)hash.get(phrase);
double[] vals = featVals(phrase, phraseInfo, true,
hashKeysEval, hashKeyphrases, length, hash);
//log.info(vals);
// The new instance inherits the weight of its source document.
Instance inst = new Instance(current.weight(), vals);
// .err.println(phrase + "\t" + inst.toString());
m_ClassifierData.add(inst);
}
}
if (m_Debug) {
log.info("--- Building classifier");
}
// Build classifier
// Uncomment if you want to use a different classifier
// Caution: Other places in the code will have to be adjusted!!
/*I. Naive Bayes:
FilteredClassifier fclass = new FilteredClassifier();
fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
fclass.setFilter(new Discretize());
m_Classifier = fclass;
*/
//NaiveBayes nb = new NaiveBayes();
//nb.setUseSupervisedDiscretization(true);
//m_Classifier = nb;
/* II. Linear Regression:
LinearRegression lr = new LinearRegression();
lr.setAttributeSelectionMethod(new
weka.core.SelectedTag(1, LinearRegression.TAGS_SELECTION));
lr.setEliminateColinearAttributes(false);
lr.setDebug(false);
m_Classifier = lr;*/
/* III. Bagging with REPTrees
Bagging bagging = new Bagging();
String[] ops_bagging = {
new String("-P"),
new String("100"),
new String("-S"),
new String("1"),
new String("-I"),
new String("50")};
*/
/*
* REPTree rept = new REPTree();
//results are worse!
rept.setNoPruning(true);
String[] ops_rept = {
new String("-M"),
new String("2"),
new String("-V"),
new String("0.0010"),
new String("-N"),
new String("3"),
new String("-S"),
new String("1"),
new String("-L"),
new String("1"),};
rept.setOptions(ops_rept);
bagging.setClassifier(rept);
*/
// bagging.setOptions(ops_bagging);
//FilteredClassifier fclass = new FilteredClassifier();
//fclass.setClassifier(new REPTree());
//fclass.setFilter(new Discretize());
//bagging.setClassifier(fclass);
// m_Classifier = bagging;
// Active configuration: regression by discretization around a Naive Bayes
// classifier whose input is discretized; one bin more than there are indexers.
RegressionByDiscretization rvd = new RegressionByDiscretization();
FilteredClassifier fclass = new FilteredClassifier();
fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
fclass.setFilter(new Discretize());
rvd.setClassifier(fclass);
rvd.setNumBins(m_Indexers+1);
m_Classifier = rvd;
// log.info(m_ClassifierData);
//System.exit(1);
m_Classifier.buildClassifier(m_ClassifierData);
if (m_Debug) {
log.info(""+m_Classifier);
}
// Save space: keep only the header (structure) of the training data.
m_ClassifierData = new Instances(m_ClassifierData, 0);
}
/**
* Computes the feature values for a given phrase.
*/
private double[] featVals(String id, FastVector phraseInfo,
boolean training, HashMap<String, Counter> hashKeysEval,
HashMap<String, Counter> hashKeyphrases, int length, HashMap<String,FastVector> hash) {
// Returns an array of m_NumFeatures + 1 values for the phrase "id": the
// feature values at their m_*Index positions and the class value in the
// last slot (index m_NumFeatures).
// id - key of the phrase in the dictionaries and in "hash"
// phraseInfo - per-document data of the phrase: element 0 is the
// first-occurrence counter, element 1 the local count and
// element 3 a vector of counters used for the deviation feature
// training - true while building the classifier data
// hashKeysEval - keyphrases used for the class value, or null if none
// hashKeyphrases - keyphrases used for the keyphrase-frequency correction
// length - value the local count and positions are divided by
// (presumably the document length as returned by getPhrases)
// hash - all candidate phrases of the document, keyed like "id"
// Compute feature values
Counter counterLocal = (Counter)phraseInfo.elementAt(1);
double[] newInst = new double[m_NumFeatures + 1];
// Compute TFxIDF
Counter counterGlobal = (Counter)m_Dictionary.get(id);
double localVal = counterLocal.value(), globalVal = 0;
if (counterGlobal != null) {
globalVal = counterGlobal.value();
if (training) {
// During training the global count is reduced by one - presumably so
// that the current document does not count towards its own statistic.
globalVal = globalVal - 1;
}
}
// Just divide by length to get approximation of probability
// that phrase in document is our phrase
// newInst[m_TfidfIndex] = (localVal / ((double)length));
newInst[m_TfidfIndex] = (localVal / ((double)length)) *
(-Math.log((globalVal + 1)/ ((double)m_NumDocs + 1)));
// Compute first occurrence (relative to length)
Counter counterFirst = (Counter)phraseInfo.elementAt(0);
newInst[m_FirstOccurIndex] = (double)counterFirst.value() /
(double)length;
// Is keyphrase frequency attribute being used?
if (m_KFused) {
Counter keyphraseC = (Counter)m_KeyphraseDictionary.get(id);
if ((training) && (hashKeyphrases != null) &&
(hashKeyphrases.containsKey(id))) {
// The phrase is a keyphrase of this very document: leave this
// document's contribution out of the frequency.
// NOTE(review): assumes keyphraseC is non-null here, i.e. that the
// keyphrase dictionary was built from the same documents.
newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
} else {
if (keyphraseC != null) {
newInst[m_KeyFreqIndex] = keyphraseC.value();
} else {
newInst[m_KeyFreqIndex] = 0;
}
}
}
// Is term appearance attribute being used?
if (m_STDEVfeature) {
FastVector app = (FastVector)phraseInfo.elementAt(3);
double[] vals = new double[app.size()];
for (int i = 0; i < vals.length; i++) {
vals[i] = ((Counter)app.elementAt(i)).value() /
(double)length; ;
}
// Population standard deviation of the normalized values.
double mean = Utils.mean(vals);
double summ = 0.0;
for (int i = 0; i < vals.length; i++) {
double a = vals[i];
//log.info("Appearence " + i + " is at " + a);
summ += (a - mean)*(a - mean);
}
double stdev = Math.sqrt(summ/(double)app.size());
newInst[m_STDEVIndex] = stdev;
/* Using instead of STDEV feature a thesaurus based feature (experiment)
if (m_Vocabulary.getRelated(id,"compositeOf") != null) {
//log.info(m_Vocabulary.getOrig(id) + " is a composite!");
newInst[m_STDEVIndex] = 1.0;
} else {
newInst[m_STDEVIndex] = 0.0;
}
*/
}
// Is node degree attribute being used?
if (m_NODEfeature) {
// Count how many of the terms related to this phrase are themselves
// candidates of the current document.
// NOTE(review): dereferences m_Vocabulary, so a vocabulary must be
// loaded whenever m_NODEfeature is true.
Vector<String> idsRT = m_Vocabulary.getRelated(id);
int intern = 0;
if (idsRT != null) {
for (int d = 0; d < idsRT.size(); d++) {
if (hash.get(idsRT.elementAt(d)) != null) {
intern++;
}
}
}
// log.info("Node feature for " + m_Vocabulary.getOrig(id) + " = " + intern);
newInst[m_NodeIndex] = (double)intern;
}
// Is term length attribute being used?
if (m_LENGTHfeature) {
// Without a vocabulary the id itself is the phrase; otherwise the
// original form is looked up in the vocabulary.
String original;
if (m_vocabulary.equals("none")) {
original = id;
} else {
original = m_Vocabulary.getOrig(id);
}
if (original == null) {
// Unknown id: report it and fall back to a length of one.
log.info("problem with id " + id);
newInst[m_LengthIndex] = 1.0;
} else {
// Length = number of space-separated parts of the original form.
String [] words = split(original," ");
newInst[m_LengthIndex] = (double)words.length;
}
}
// Compute class value
if (hashKeysEval == null) { // no author-assigned keyphrases
newInst[m_NumFeatures] = Instance.missingValue();
} else if (!hashKeysEval.containsKey(id)) {
newInst[m_NumFeatures] = 0; // Not a keyphrase
// Experiment with giving phrases related to manually chosen one
// higher values than to unrelated ones
/*Vector related = (Vector)m_Vocabulary.getRelated(id);
// if this id is related to one of the keyphrases, set its class value to 0.5
if (related != null) {
Enumeration en = related.elements();
while (en.hasMoreElements()) {
String relID = (String)en.nextElement();
if (hashKeysEval.containsKey(relID)) {
newInst[m_NumFeatures] = 1; // Keyphrase
}
}
}
*/
} else {
//hashKeysEval.remove(id);
//newInst[m_NumFeatures] = 1; // Keyphrase
// Learning from multiple-indexer's data: the class value is the count
// stored for the keyphrase divided by the number of indexers.
// log.info(m_Indexers);
// log.info("Calculating class value with m_Indexers = " + m_Indexers);
double c = (double)((Counter)hashKeysEval.get(id)).value()/m_Indexers;
newInst[m_NumFeatures] = c; // Keyphrase
// Or simple learning from 1 indexer:
// newInst[m_NumFeatures] = 1.0; // Keyphrase
}
return newInst;
}
/**
* Sets output format and converts pending input instances.
*/
private void convertPendingInstances() throws Exception {
    if (m_Debug) {
        log.info("--- Converting pending instances");
    }

    // Output format: the document attribute expands into the phrase columns,
    // the keyphrase attribute becomes a numeric "Keyphrase?" column, and every
    // other attribute is passed through unchanged.
    FastVector outputAtts = new FastVector();
    for (int att = 0; att < getInputFormat().numAttributes(); att++) {
        if (att == m_DocumentAtt) {
            // String attributes
            outputAtts.addElement(new Attribute("N-gram", (FastVector) null));
            outputAtts.addElement(new Attribute("N-gram-original", (FastVector) null));
            // Numeric attributes that are always present
            outputAtts.addElement(new Attribute("TFxIDF"));
            outputAtts.addElement(new Attribute("First_occurrence"));
            // Optional attributes; the keyphrase frequency is only written
            // out in debug mode.
            if (m_Debug && m_KFused) {
                outputAtts.addElement(new Attribute("Keyphrase_frequency"));
            }
            if (m_STDEVfeature) {
                outputAtts.addElement(new Attribute("Standard_deviation"));
            }
            if (m_NODEfeature) {
                outputAtts.addElement(new Attribute("Relations_number"));
            }
            if (m_LENGTHfeature) {
                outputAtts.addElement(new Attribute("Phrase_length"));
            }
            outputAtts.addElement(new Attribute("Probability"));
            outputAtts.addElement(new Attribute("Rank"));
        } else if (att == m_KeyphrasesAtt) {
            // Numeric rather than nominal (False/True) class column.
            outputAtts.addElement(new Attribute("Keyphrase?"));
        } else {
            outputAtts.addElement(getInputFormat().attribute(att));
        }
    }
    setOutputFormat(new Instances("KEAdata", outputAtts, 0));

    // Convert every buffered document and queue the resulting phrase instances.
    for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
        Instance document = getInputFormat().instance(doc);
        FastVector converted = convertInstance(document, true);
        for (int k = 0; k < converted.size(); k++) {
            push((Instance) converted.elementAt(k));
        }
    }
}
/**
 * Converts one input instance into the output instances of this filter.
 *
 * <p>For every candidate phrase returned by {@code getPhrases} the feature
 * values are computed with {@code featVals}, the classifier is asked for
 * the distribution of the phrase, and an output row is assembled in which
 * the document attribute is replaced by the phrase, its original form, the
 * feature values, the probability and a rank. After that one dummy row
 * (features missing, probability {@code -Double.MAX_VALUE}) is added for
 * every entry of the given keyphrases. Finally the rows are ordered by
 * descending probability, ties being broken by descending TFxIDF and then
 * by ascending first occurrence, and the rank column is filled in.
 *
 * @param instance the input instance (document text and, optionally, the
 *                 given keyphrases)
 * @param training passed through to {@code featVals}
 * @return the converted instances in rank order
 * @throws Exception propagated from the calls made during the conversion
 *                   (for example the classifier)
 */
private FastVector convertInstance(Instance instance, boolean training)
        throws Exception {
    FastVector vector = new FastVector();
    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document. Both maps stay null when the
    // keyphrase attribute is missing or contains no usable phrase.
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes. The single document attribute is
    // replaced by six fixed columns (phrase, original phrase, TFxIDF, first
    // occurrence, probability, rank), i.e. five more than the input has.
    int numFeatures = 5;
    if (m_Debug && m_KFused) {
        numFeatures = numFeatures + 1;
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes in the output rows
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    for (String id : hash.keySet()) {
        FastVector phraseInfo = hash.get(id);
        double[] featureValues = featVals(id, phraseInfo, training,
                hashKeysEval, hashKeyphrases, length, hash);
        Instance inst = new Instance(instance.weight(), featureValues);
        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase.
        // If simple Naive Bayes is used, change this to probs[1].
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst =
                new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // Add phrase
                int index = outputFormatPeek().attribute(pos).
                        addStringValue(id);
                newInst[pos++] = index;

                // Add original version (fall back to the id if unknown)
                String orig = (String) phraseInfo.elementAt(2);
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug && m_KFused) {
                    newInst[pos++] = inst.value(m_KeyFreqIndex);
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for the given keyphrases.
    // NOTE(review): entries do not appear to be removed from hashKeysEval
    // when a phrase is found in the document, so rows may also be added
    // for keyphrases that do occur -- confirm that this is intended.
    if (hashKeysEval != null) {
        for (String phrase : hashKeysEval.keySet()) {
            double[] newInst =
                    new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).
                            addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).
                            addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug && m_KFused) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability; the rank slot that follows is left
                    // at its default and is filled in by the ranking below
                    newInst[pos++] = -Double.MAX_VALUE;
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }

            // Create the dummy instance once, after the whole row has been
            // filled. Doing this inside the attribute loop added one row
            // per attribute for every keyphrase.
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases: very unlikely phrases get the rank
    // Integer.MAX_VALUE, all others are numbered 1, 2, ... in list order.
    int rank = 1;
    for (int i = 0; i < vector.size(); i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // vals was filled before the last reordering of the vector, so
        // vals[i] need not belong to the instance that is now at position
        // i. Reading the value from the instance itself is correct in
        // either case.
        double inverseProb = 1 - currentInstance.value(probsAttIndex);

        // Short cut: if phrase very unlikely make rank very low
        if (Utils.grOrEq(inverseProb, 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}
/*
private HashMap getComposits(HashMap dict) {
HashMap dictClone = (HashMap)dict.clone();
Iterator it1 = dictClone.keySet().iterator();
while (it1.hasNext()) {
String id1 = (String)it1.next();
String term1 = m_Vocabulary.getOrig(id1);
Iterator it2 = dictClone.keySet().iterator();
while (it2.hasNext()) {
String id2 = (String)it2.next();
String term2 = m_Vocabulary.getOrig(id2);
String composite = term1 + " " + term2;
String idNew = m_Vocabulary.getID(composite);
if (term1 != term2 && idNew != null) {
FastVector vec = (FastVector)dict.get(idNew);
if (vec == null) {
log.info("Found " + m_Vocabulary.getOrig(idNew) + " (" + term1 + ", " + term2 + ")");
// Specifying the size of the vector
// According to additional selected features:
vec = new FastVector(2);
// Update hashtable with all the info
vec.addElement(new Counter(0)); //0
vec.addElement(new Counter()); //1
vec.addElement(m_Vocabulary.getOrig(idNew)); //2
dict.put(idNew, vec);
} else {
// Update number of occurrences
((Counter)((FastVector)vec).elementAt(1)).increment();
}
}
}
}
return dict;
}
*/
/**
 * Collects the candidate phrases of a text and counts them.
 *
 * <p>The text is read line by line and each line word by word (split at
 * single spaces). Every sequence of between {@code m_MinPhraseLength} and
 * {@code m_MaxPhraseLength} consecutive words of a line that neither begins
 * nor ends with a stop word is a candidate. A candidate is identified by
 * its pseudo phrase when {@code m_vocabulary} equals "none", otherwise by
 * the ID the vocabulary returns for it; candidates for which the vocabulary
 * returns no ID are ignored.
 *
 * @param str the text, one sentence per line
 * @return a map from phrase identifier to the counter of that phrase; a
 *         counter is created on the first occurrence and incremented on
 *         every further one
 */
public HashMap<String, Counter> getPhrasesForDictionary(String str) {
    String[] buffer = new String[m_MaxPhraseLength];
    HashMap<String, Counter> hash = new HashMap<String, Counter>();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
        String phrase = tok.nextToken();
        // Words buffered for the current line (at most m_MaxPhraseLength)
        int numSeen = 0;
        StringTokenizer wordTok = new StringTokenizer(phrase, " ");
        while (wordTok.hasMoreTokens()) {
            String word = wordTok.nextToken();

            // Store word in buffer: shift left by one, append at the end
            System.arraycopy(buffer, 1, buffer, 0, m_MaxPhraseLength - 1);
            buffer[m_MaxPhraseLength - 1] = word;

            // How many are buffered?
            numSeen++;
            if (numSeen > m_MaxPhraseLength) {
                numSeen = m_MaxPhraseLength;
            }

            // Don't consider phrases that end with a stop word
            if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
                continue;
            }

            // Loop through buffer and add phrases to hashtable; the phrase
            // grows to the left, one buffered word per iteration
            StringBuilder phraseBuffer = new StringBuilder();
            for (int i = 1; i <= numSeen; i++) {
                if (i > 1) {
                    phraseBuffer.insert(0, ' ');
                }
                phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

                // Don't consider phrases that begin with a stop word
                if ((i > 1) &&
                        (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
                    continue;
                }

                // Only consider phrases with minimum length
                if (i >= m_MinPhraseLength) {
                    String orig = phraseBuffer.toString();

                    // Create internal representation: the pseudo phrase, or
                    // the vocabulary ID. The pseudo phrase is only computed
                    // when it is actually used.
                    String id;
                    if (m_vocabulary.equals("none")) {
                        id = pseudoPhrase(orig);
                    } else {
                        id = (String) m_Vocabulary.getID(orig);
                    }

                    if (id != null) {
                        Counter count = hash.get(id);
                        if (count == null) {
                            hash.put(id, new Counter());
                        } else {
                            count.increment();
                        }
                    }
                }
            }
        }
    }
    return hash;
}
/**
 * Expects an empty hashtable and fills it with the candidate phrases of
 * the given text.
 *
 * <p>Candidates are found exactly as in {@code getPhrasesForDictionary}:
 * word sequences of between {@code m_MinPhraseLength} and
 * {@code m_MaxPhraseLength} words within one line that neither begin nor
 * end with a stop word, identified by their pseudo phrase
 * ({@code m_vocabulary} equals "none") or by their vocabulary ID. For each
 * identifier a vector is stored with
 * <ol start="0">
 *   <li>a counter holding the position of the first occurrence (word index
 *       of the first word of the phrase, counting from 1),</li>
 *   <li>a counter for the number of occurrences,</li>
 *   <li>the original spelling: the spelling of the first occurrence, or the
 *       vocabulary's spelling of the descriptor when a vocabulary is
 *       used,</li>
 *   <li>only if {@code m_STDEVfeature} is set: a vector with one position
 *       counter per occurrence.</li>
 * </ol>
 *
 * <p>Phrases whose occurrence counter is below {@code m_MinNumOccur} are
 * removed again before the method returns.
 *
 * @param hash the (empty) table to fill
 * @param str  the content of the document, one sentence per line
 * @return the position counter after the last word, i.e. the number of
 *         words in the text plus one
 */
private int getPhrases(HashMap<String,FastVector> hash, String str) {
    String[] buffer = new String[m_MaxPhraseLength];

    // Extracting strings of a predefined length from "str":
    StringTokenizer tok = new StringTokenizer(str, "\n");
    int pos = 1;
    while (tok.hasMoreTokens()) {
        String phrase = tok.nextToken();
        // Words buffered for the current line (at most m_MaxPhraseLength)
        int numSeen = 0;
        StringTokenizer wordTok = new StringTokenizer(phrase, " ");
        while (wordTok.hasMoreTokens()) {
            String word = wordTok.nextToken();

            // Store word in buffer: shift left by one, append at the end
            System.arraycopy(buffer, 1, buffer, 0, m_MaxPhraseLength - 1);
            buffer[m_MaxPhraseLength - 1] = word;

            // How many are buffered?
            numSeen++;
            if (numSeen > m_MaxPhraseLength) {
                numSeen = m_MaxPhraseLength;
            }

            // Don't consider phrases that end with a stop word
            if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
                pos++;
                continue;
            }

            // Loop through buffer and add phrases to hashtable; the phrase
            // grows to the left, one buffered word per iteration
            StringBuilder phraseBuffer = new StringBuilder();
            for (int i = 1; i <= numSeen; i++) {
                if (i > 1) {
                    phraseBuffer.insert(0, ' ');
                }
                phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

                // Don't consider phrases that begin with a stop word
                if ((i > 1) &&
                        (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
                    continue;
                }

                // Final restriction:
                // Only consider phrases with minimum length
                if (i >= m_MinPhraseLength) {
                    // orig = the detected phrase in its original spelling
                    String orig = phraseBuffer.toString();

                    // Create internal representation:
                    // either a pseudo phrase or the vocabulary ID
                    String id;
                    if (m_vocabulary.equals("none")) {
                        id = pseudoPhrase(orig);
                    } else {
                        // Match against the Vocabulary
                        id = (String) m_Vocabulary.getID(orig);
                    }

                    if (id != null) {
                        // If a vocabulary is used, take the spelling of the
                        // descriptor, else keep the spelling of the document
                        if (!m_vocabulary.equals("none")) {
                            orig = m_Vocabulary.getOrig(id);
                        }

                        // Position of the first word of this phrase
                        int firstWordPos = pos + 1 - i;

                        FastVector vec = hash.get(id);
                        if (vec == null) {
                            // First occurrence: create the info vector,
                            // sized for the elements stored below
                            vec = new FastVector(m_STDEVfeature ? 4 : 3);
                            vec.addElement(new Counter(firstWordPos)); //0
                            vec.addElement(new Counter()); //1
                            vec.addElement(orig); //2
                            if (m_STDEVfeature) {
                                FastVector app = new FastVector();
                                app.addElement(new Counter(firstWordPos));
                                vec.addElement(app); //3
                            }
                            hash.put(id, vec);
                        } else {
                            // Further occurrence: update number of
                            // occurrences
                            ((Counter) vec.elementAt(1)).increment();
                            if (m_STDEVfeature) {
                                // The positions vector already is element 3
                                // of vec; only the new position is recorded
                                // (it must not be appended to vec again).
                                FastVector app = (FastVector) vec.elementAt(3);
                                app.addElement(new Counter(firstWordPos));
                            }
                        }
                    }
                }
            }
            pos++;
        }
    }

    // Remove the phrases that occur less than m_MinNumOccur times
    Iterator<String> phrases = hash.keySet().iterator();
    while (phrases.hasNext()) {
        FastVector info = hash.get(phrases.next());
        if (((Counter) info.elementAt(1)).value() < m_MinNumOccur) {
            phrases.remove();
        }
    }
    return pos;
}
/**
 * Splits a string at a given single-character separator (ALY).
 *
 * <p>Each character of {@code str} is compared, ignoring case, with
 * {@code separator}; a separator longer than one character therefore never
 * matches. Empty pieces before or between separators are kept, an empty
 * piece at the very end is dropped.
 *
 * @param str       the string to split
 * @param separator the separator, expected to be one character long
 * @return the pieces in order of appearance; empty if {@code str} is empty
 */
private static String[] split(String str,String separator) {
    ArrayList<String> lst = new ArrayList<String>();
    StringBuilder word = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
        String letter = str.substring(i, i + 1);
        if (letter.equalsIgnoreCase(separator)) {
            lst.add(word.toString());
            word.setLength(0);
        } else {
            word.append(str.charAt(i));
        }
    }
    // Compare by content, not by reference, to detect a pending last piece
    if (word.length() > 0) {
        lst.add(word.toString());
    }
    return lst.toArray(new String[lst.size()]);
}
/**
 * Reads the given keyphrases (one per line) and counts how often each of
 * them is listed.
 *
 * <p>Each line is trimmed, cut off at its first "/" when it has text on
 * both sides of a slash (e.g. "Monocytes/*immunology/microbiology" becomes
 * "Monocytes"), and converted into its pseudo phrase. The identifier of the
 * phrase is the pseudo phrase itself when {@code m_vocabulary} equals
 * "none", otherwise the ID the vocabulary returns for it; phrases with an
 * empty pseudo phrase or without an ID are ignored.
 *
 * @param str     the keyphrases, separated by newlines
 * @param forEval only controls whether repeated phrases are reported in the
 *                log when debugging is on
 * @return a map from phrase identifier to its counter, or {@code null} if
 *         no phrase was accepted
 */
private HashMap<String, Counter> getGivenKeyphrases(String str,
        boolean forEval) {
    HashMap<String, Counter> hash = new HashMap<String, Counter>();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
        String orig = tok.nextToken().trim();

        // This is often the case with Mesh Terms,
        // where a term is accompanied by another specifying term
        // e.g. Monocytes/*immunology/microbiology
        // we ignore everything after the "/" symbol.
        // (substring instead of split("/")[0]: split returns an empty
        // array for a token that consists of slashes only.)
        if (orig.matches(".+?/.+?")) {
            orig = orig.substring(0, orig.indexOf('/'));
        }

        orig = pseudoPhrase(orig);
        if (orig.length() == 0) {
            continue;
        }

        String id;
        if (m_vocabulary.equals("none")) {
            id = orig;
        } else {
            id = (String) m_Vocabulary.getID(orig);
        }
        if (id == null) {
            continue;
        }

        Counter seen = hash.get(id);
        if (seen == null) {
            hash.put(id, new Counter());
        } else {
            // Listed more than once: the phrase is counted again (the log
            // text below says "Skipping", but the counter is incremented).
            seen.increment();
            if (forEval && m_Debug) {
                log.info("Skipping the phrase " + orig + ", which appears twice in the author-assigned keyphrase set.");
            }
        }
    }

    if (hash.size() == 0) {
        return null;
    } else {
        return hash;
    }
}
/**
 * Generates the pseudo phrase from a string.
 * A pseudo phrase is a version of a phrase
 * that only contains non-stopwords,
 * which are stemmed and sorted into alphabetical order.
 *
 * <p>Before stemming, the string is lower-cased and cleaned up:
 * <ul>
 *   <li>if it has text on both sides of a "/", everything from the first
 *       "/" on is dropped;</li>
 *   <li>if it has text on both sides of a "(", everything from the first
 *       "(" on is dropped;</li>
 *   <li>if it has text on both sides of an apostrophe, only the text
 *       between the first apostrophe and the next one (or the end of the
 *       string) is kept;</li>
 *   <li>"-" and "&amp;" become spaces, "*" and ":" are removed, and ", " and
 *       ". " become a single space.</li>
 * </ul>
 *
 * @param str the phrase to convert
 * @return the stemmed non-stopwords of the phrase in sorted order, joined
 *         by single spaces
 */
public String pseudoPhrase(String str) {
    str = str.toLowerCase();

    // This is often the case with Mesh Terms,
    // where a term is accompanied by another specifying term
    // e.g. Monocytes/*immunology/microbiology
    // we ignore everything after the "/" symbol.
    // (substring is used instead of split(...)[n] here and below: split
    // drops trailing empty pieces, so the indexed access failed for inputs
    // such as "///", "(((" or "a''".)
    if (str.matches(".+?/.+?")) {
        str = str.substring(0, str.indexOf('/'));
    }

    // removes scope notes in brackets
    // should be replaced with a cleaner solution
    if (str.matches(".+?\\(.+?")) {
        str = str.substring(0, str.indexOf('('));
    }

    // keeps the part that follows the first apostrophe
    if (str.matches(".+?\\'.+?")) {
        int first = str.indexOf('\'');
        int second = str.indexOf('\'', first + 1);
        if (second < 0) {
            str = str.substring(first + 1);
        } else {
            str = str.substring(first + 1, second);
        }
    }

    // Remove some non-alphanumeric characters
    str = str.replace('-', ' ');
    str = str.replace('&', ' ');
    str = str.replace("*", "");
    str = str.replace(", ", " ");
    str = str.replace(". ", " ");
    str = str.replace(":", "");
    str = str.trim();

    // Drop the stop words
    String[] words = str.split(" ");
    StringBuilder nostop = new StringBuilder();
    for (int i = 0; i < words.length; i++) {
        if (!m_Stopwords.isStopword(words[i])) {
            if (nostop.length() > 0) {
                nostop.append(' ');
            }
            nostop.append(words[i]);
        }
    }

    // Stem string, then bring the stems into alphabetical order
    String stemmed = m_Stemmer.stemString(nostop.toString());
    String[] pseudophrase = sort(stemmed.split(" "));
    return join(pseudophrase);
}
/**
 * Joins an array of strings to a single string, separating the elements
 * by single spaces.
 *
 * <p>No separator is written as long as the result is still empty, so
 * empty elements at the beginning of the array leave no leading space.
 *
 * @param str the strings to join
 * @return the joined string; empty if the array is empty
 */
private static String join(String[] str) {
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < str.length; i++) {
        // Emptiness is tested on the content; comparing references with
        // != "" treated an empty string that is not the interned literal
        // as non-empty.
        if (result.length() > 0) {
            result.append(' ');
        }
        result.append(str[i]);
    }
    return result.toString();
}
/**
 * Exchanges the elements at two positions of a string array in place.
 *
 * @param loc1 index of the first element
 * @param loc2 index of the second element
 * @param a    the array whose elements are exchanged
 */
public static void swap (int loc1, int loc2, String [] a) {
    final String displaced = a[loc2];
    a[loc2] = a[loc1];
    a[loc1] = displaced;
}
/**
 * Sorts an array of Strings into alphabetic order, in place.
 *
 * <p>Strings are compared by their upper-case forms; strings that are
 * identical when converted to upper case are ordered by their plain
 * comparison, so a String that starts with an upper case letter precedes
 * the otherwise identical String that has only lower case letters.
 *
 * @param a the array to sort; it is modified
 * @return the same array {@code a}, now sorted
 */
public static String[] sort (String [] a) {
    // Fully qualified names: these classes are not imported by this file.
    java.util.Arrays.sort(a, new java.util.Comparator<String>() {
        public int compare(String left, String right) {
            int ignoringCase = left.toUpperCase().compareTo(right.toUpperCase());
            if (ignoringCase != 0) {
                return ignoringCase;
            }
            // identical when converted to all same case
            return left.compareTo(right);
        }
    });
    return a;
}
/**
 * Command line entry point for testing this class. Runs the filter in
 * batch mode when the flag {@code -b} is given and in normal mode
 * otherwise; the message of any exception is written to the log.
 *
 * @param argv should contain arguments to the filter: use -h for help
 */
public static void main(String [] argv) {
    try {
        final boolean batchMode = Utils.getFlag('b', argv);
        final KEAFilter filter = new KEAFilter(new StopwordsEnglish());
        if (!batchMode) {
            Filter.filterFile(filter, argv);
        } else {
            Filter.batchFilterFile(filter, argv);
        }
    } catch (Exception ex) {
        log.info(ex.getMessage());
    }
}
}