/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.kea.filter;
import java.util.Enumeration;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
/**
* This filter splits the text in selected string
* attributes into phrases. The resulting
* string attributes contain these phrases
* separated by '\n' characters.
*
* Phrases are identified according to the
* following definitions:
*
* A phrase is a sequence of words interrupted
* only by sequences of whitespace characters,
* where each sequence of whitespace characters
* contains at most one '\n'.
*
* A word is a sequence of letters or digits
* (a sequence made up only of digits also
* counts as a word), with the following
* exceptions:
*
* a) '.', '@', '_', '&', '/', '-' are allowed
* if surrounded by letters or digits,
*
* b) '\'' is allowed if preceded by a letter
* or digit,
*
* c) '-', '/' are also allowed if succeeded by
* whitespace characters followed by another
* word. In that case the whitespace characters
* will be deleted.
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version 1.0
*/
public class KEAPhraseFilter extends Filter implements OptionHandler {
/** Class-wide SLF4J logger; declared final so it cannot be reassigned. */
private static final Logger log = LoggerFactory.getLogger(KEAPhraseFilter.class);
/** Serialization version identifier. */
private static final long serialVersionUID = 1L;
/** Stores which columns to select as a funky range */
protected Range m_SelectCols = new Range();
/**
 * Determines whether internal periods are disallowed; when true, a '.'
 * is never accepted as part of a word.
 */
protected boolean m_DisallowInternalPeriods = false;
/**
* Returns a string describing this filter
*
* @return a description of the filter suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
    // Short, human-readable summary of what the filter does.
    final String description =
        "This filter splits the text contained "
            + "by the selected string attributes into phrases.";
    return description;
}
/**
* Returns an enumeration describing the available options
*
* @return an enumeration of all the available options
*/
public Enumeration<Option> listOptions() {
    // One entry per command-line switch read by setOptions().
    final Vector<Option> optionList = new Vector<Option>(3);

    // -R takes one argument: the attribute range to process.
    final Option rangeOption = new Option(
        "\tSpecify list of attributes to process. First and last are valid\n"
            + "\tindexes. (default none)",
        "R", 1, "-R <index1,index2-index4,...>");
    optionList.add(rangeOption);

    // -V is a flag (no argument): invert the attribute selection.
    final Option invertOption = new Option(
        "\tInvert matching sense",
        "V", 0, "-V");
    optionList.add(invertOption);

    // -P is a flag (no argument): disallow periods inside words.
    final Option periodOption = new Option(
        "\tDisallow internal periods",
        "P", 0, "-P");
    optionList.add(periodOption);

    return optionList.elements();
}
/**
* Returns the Capabilities of this filter.
*
* @return the capabilities of this object
* @see Capabilities
*/
public Capabilities getCapabilities() {
    // Start from what the superclass reports and switch on what this
    // filter accepts.
    final Capabilities caps = super.getCapabilities();

    // Attribute side: all attribute capabilities, plus missing values.
    caps.enableAllAttributes();
    caps.enable(Capability.MISSING_VALUES);

    // Class side: a nominal class attribute, or no class attribute at all.
    final Capability[] classCapabilities = {
        Capability.NOMINAL_CLASS,
        Capability.NO_CLASS
    };
    for (Capability classCapability : classCapabilities) {
        caps.enable(classCapability);
    }
    return caps;
}
/**
* Parses a given list of options controlling the behaviour of this object.
* Valid options are:<p>
*
* -R index1,index2-index4,...<br>
* Specify list of attributes to process. First and last are valid indexes.
* (default none)<p>
*
* -V<br>
* Invert matching sense <p>
*
* -P<br>
* Disallow internal periods <p>
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
    // -R <range>: the current selection is only replaced when a
    // non-empty range string was supplied.
    final String rangeSpec = Utils.getOption('R', options);
    if (rangeSpec.length() > 0) {
        setAttributeIndices(rangeSpec);
    }

    // -V and -P are plain flags; each is read and applied in turn.
    final boolean invert = Utils.getFlag('V', options);
    setInvertSelection(invert);
    final boolean disallowPeriods = Utils.getFlag('P', options);
    setDisallowInternalPeriods(disallowPeriods);

    // If an input format is already in place, run setInputFormat() again
    // with it.
    final Instances currentFormat = getInputFormat();
    if (currentFormat != null) {
        setInputFormat(currentFormat);
    }
}
/**
* Gets the current settings of the filter.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
    // Fixed-size result: at most "-V", "-P", "-R" and the range string.
    final String[] result = new String[4];
    int next = 0;

    if (getInvertSelection()) {
        result[next] = "-V";
        next++;
    }
    if (getDisallowInternalPeriods()) {
        result[next] = "-P";
        next++;
    }

    final String ranges = getAttributeIndices();
    if (!ranges.equals("")) {
        result[next] = "-R";
        result[next + 1] = ranges;
        next += 2;
    }

    // Unused trailing slots are filled with empty strings, never null.
    for (int slot = next; slot < result.length; slot++) {
        result[slot] = "";
    }
    return result;
}
/**
* Sets the format of the input instances.
*
* @param instanceInfo an Instances object containing the input
* instance structure (any instances contained in the object are
* ignored - only the structure is required).
* @return true if the outputFormat may be collected immediately
*/
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    super.setInputFormat(instanceInfo);

    // The output format is set to the very same structure as the input.
    setOutputFormat(instanceInfo);

    // Give the column range its upper bound: the index of the last attribute.
    final int lastAttributeIndex = instanceInfo.numAttributes() - 1;
    m_SelectCols.setUpper(lastAttributeIndex);

    // Always reports that the output format can be collected right away.
    return true;
}
/**
* Input an instance for filtering. Ordinarily the instance is processed
* and made available for output immediately. Some filters require all
* instances be read before producing output.
*
* @param instance the input instance
* @return true if the filtered instance may now be
* collected with output().
* @exception Exception if the input instance was not of the correct
* format or if there was a problem with the filtering.
*/
public boolean input(Instance instance) throws Exception {
    // Refuse to work before an input format has been set.
    final boolean formatMissing = (null == getInputFormat());
    if (formatMissing) {
        throw new Exception("No input instance format defined");
    }

    // First instance after a finished batch: reset the queue and clear
    // the new-batch flag before converting.
    if (m_NewBatch) {
        m_NewBatch = false;
        resetQueue();
    }

    // Conversion pushes the result itself, so output is always available.
    convertInstance(instance);
    return true;
}
/**
* Signify that this batch of input to the filter is finished. If
* the filter requires all instances prior to filtering, output()
* may now be called to retrieve the filtered instances. Any
* subsequent instances filtered should be filtered based on setting
* obtained from the first batch (unless the inputFormat has been
* re-assigned or new options have been set). This default
* implementation assumes all instance processing occurs during
* inputFormat() and input().
*
* @return true if there are instances pending output
* @exception NullPointerException if no input structure has been defined,
* @exception Exception if there was a problem finishing the batch.
*/
public boolean batchFinished() throws Exception {
    if (null == getInputFormat()) {
        throw new NullPointerException("No input instance format defined");
    }

    // Mark the next input() call as the start of a fresh batch.
    m_NewBatch = true;

    // Report whether any converted instances are still waiting for output.
    final int pending = numPendingOutput();
    return pending != 0;
}
/**
* Main method for testing this class.
*
* @param argv should contain arguments to the filter: use -h for help
*/
public static void main(String [] argv) {
    try {
        // -b selects batch filtering; without it the plain file filter runs.
        final boolean batchMode = Utils.getFlag('b', argv);
        final KEAPhraseFilter filter = new KEAPhraseFilter();
        if (!batchMode) {
            Filter.filterFile(filter, argv);
        } else {
            Filter.batchFilterFile(filter, argv);
        }
    } catch (Exception failure) {
        // Any failure is logged with its stack trace; nothing is rethrown.
        log.error(failure.getMessage(), failure);
    }
}
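/**
* Converts an instance by splitting the values of its selected string
* attributes into phrases (see tokenize). All other values, including
* missing values and string attributes outside the selected range, are
* copied unchanged. The resulting instance is passed to push().
*/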
private void convertInstance(Instance instance) throws Exception {
    final int attributeCount = instance.numAttributes();
    final double[] values = new double[attributeCount];

    for (int attr = 0; attr < attributeCount; attr++) {
        // Only present string values need special handling; everything
        // else (non-string attributes, missing values) is copied as is.
        final boolean presentString =
            instance.attribute(attr).isString() && !instance.isMissing(attr);
        if (!presentString) {
            values[attr] = instance.value(attr);
            continue;
        }

        // Selected columns are split into phrases; unselected string
        // columns keep their text. Either way the text is registered with
        // the output format's attribute and its index stored as the value.
        final String original = instance.stringValue(attr);
        final String stored =
            m_SelectCols.isInRange(attr) ? tokenize(original) : original;
        final int stringIndex =
            getOutputFormat().attribute(attr).addStringValue(stored);
        values[attr] = (double) stringIndex;
    }

    // Build the converted instance with the original weight and queue it.
    final Instance converted = new Instance(instance.weight(), values);
    converted.setDataset(getOutputFormat());
    push(converted);
}
/**
 * Splits a text into phrases.
 *
 * The text is scanned left to right for words. A word is a run of letters
 * and digits; '.', '@', '_', '&', '/' and '-' are kept inside a word when
 * the characters directly before and after them are letters or digits
 * ('.' only while internal periods are allowed), and '\'' is kept when the
 * character directly before it is a letter or digit.
 *
 * Words belonging to the same phrase are joined by a single space, or by
 * '-' or '/' when that character directly followed the previous word.
 * A '\n' is written to close the current phrase when
 * <ul>
 * <li>a word is directly followed by a character other than whitespace,
 *     '-' or '/',</li>
 * <li>a non-whitespace character that does not start a word is met
 *     between words, or</li>
 * <li>a '\n' is met after another '\n' has already been seen since the
 *     last word.</li>
 * </ul>
 * Because of the first rule the result may end with a '\n'.
 *
 * @param str the text to split; must not be null
 * @return the phrases found in str, separated by '\n'
 * @throws NullPointerException if str is null
 */
public String tokenize (String str) {
    // The buffer never leaves this method, so the unsynchronized
    // StringBuilder is sufficient.
    final StringBuilder resultStr = new StringBuilder();
    final int length = str.length();
    int j = 0;
    // True while no word has been written for the current phrase yet.
    boolean phraseStart = true;
    // True once a '\n' has been seen since the last word.
    boolean seenNewLine = false;
    // Remember which joiner directly followed the previous word.
    boolean haveSeenHyphen = false;
    boolean haveSeenSlash = false;

    while (j < length) {
        // --- Scan the longest word that starts at j. ---
        boolean isWord = false;
        final int startj = j;
        while (j < length) {
            final char ch = str.charAt(j);
            if (Character.isLetterOrDigit(ch)) {
                // Digits alone are enough to make a word.
                isWord = true;
                j++;
            } else if ((!m_DisallowInternalPeriods && (ch == '.'))
                    || (ch == '@')
                    || (ch == '_')
                    || (ch == '&')
                    || (ch == '/')
                    || (ch == '-')) {
                // Internal punctuation must sit between two letters/digits.
                if ((j > 0) && (j + 1 < length)
                        && Character.isLetterOrDigit(str.charAt(j - 1))
                        && Character.isLetterOrDigit(str.charAt(j + 1))) {
                    j++;
                } else {
                    break;
                }
            } else if (ch == '\'') {
                // An apostrophe only needs a letter/digit in front of it.
                if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                    j++;
                } else {
                    break;
                }
            } else {
                break;
            }
        }

        if (isWord) {
            // --- Emit the word, preceded by the joiner if the phrase
            // already has content. ---
            if (!phraseStart) {
                if (haveSeenHyphen) {
                    resultStr.append('-');
                } else if (haveSeenSlash) {
                    resultStr.append('/');
                } else {
                    resultStr.append(' ');
                }
            }
            resultStr.append(str, startj, j);
            if (j == length) {
                break;
            }

            // Reset the per-word state, then classify the character that
            // stopped the word.
            phraseStart = false;
            seenNewLine = false;
            haveSeenHyphen = false;
            haveSeenSlash = false;
            final char next = str.charAt(j);
            if (Character.isWhitespace(next)) {
                if (next == '\n') {
                    seenNewLine = true;
                }
            } else if (next == '-') {
                haveSeenHyphen = true;
            } else if (next == '/') {
                haveSeenSlash = true;
            } else {
                // Any other character ends the phrase immediately.
                phraseStart = true;
                resultStr.append('\n');
            }
            j++;
            continue;
        }

        if (j == length) {
            break;
        }

        // --- No word starts here: consume one separator character. ---
        final char separator = str.charAt(j);
        if (separator == '\n') {
            // A second newline since the last word closes the phrase.
            if (seenNewLine && !phraseStart) {
                resultStr.append('\n');
                phraseStart = true;
            }
            seenNewLine = true;
        } else if (!Character.isWhitespace(separator)) {
            // Non-whitespace junk between words closes the phrase;
            // other whitespace is simply skipped.
            if (!phraseStart) {
                resultStr.append('\n');
                phraseStart = true;
            }
        }
        j++;
    }
    return resultStr.toString();
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String invertSelectionTipText() {
    // GUI hint for the invertSelection property.
    final String tip =
        "If set to false, the specified attributes will be processed;"
            + " If set to true, specified attributes won't be processed.";
    return tip;
}
/**
* Get whether the supplied columns are to be processed
*
* @return true if the supplied columns won't be processed
*/
public boolean getInvertSelection() {
    // The invert flag is read from the column range object.
    final boolean inverted = this.m_SelectCols.getInvert();
    return inverted;
}
/**
* Set whether selected columns should be processed. If true the
* selected columns won't be processed.
*
* @param invert the new invert setting
*/
public void setInvertSelection(boolean invert) {
    // Forwarded to the column range object, which holds the invert flag.
    this.m_SelectCols.setInvert(invert);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String disallowInternalPeriodsTipText() {
    // GUI hint for the disallowInternalPeriods property.
    final String tip = "If set to false, internal periods are allowed.";
    return tip;
}
/**
* Get whether internal periods are disallowed
*
* @return true if internal periods are disallowed
*/
public boolean getDisallowInternalPeriods() {
    // Plain accessor for the flag that tokenize() consults for '.'.
    return this.m_DisallowInternalPeriods;
}
/**
* Set whether internal periods are disallowed. If true, a '.' is
* never accepted as part of a word.
*
* @param disallow the new disallow setting
*/
public void setDisallowInternalPeriods(boolean disallow) {
    // Plain mutator for the flag that tokenize() consults for '.'.
    this.m_DisallowInternalPeriods = disallow;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String attributeIndicesTipText() {
    // GUI hint for the attributeIndices property.
    final String tip =
        "Specify range of attributes to act on."
            + " This is a comma separated list of attribute indices, with"
            + " \"first\" and \"last\" valid values. Specify an inclusive"
            + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
    return tip;
}
/**
* Get the current range selection.
*
* @return a string containing a comma separated list of ranges
*/
public String getAttributeIndices() {
    // The selection is returned as the range object's own range string.
    final String ranges = this.m_SelectCols.getRanges();
    return ranges;
}
/**
* Set which attributes are to be processed
*
* @param rangeList a string representing the list of attributes. Since
* the string will typically come from a user, attributes are indexed from
* 1. <br>
* eg: first-3,5,6-last
*/
public void setAttributeIndices(String rangeList) {
    // The range string is handed to the range object unchanged.
    this.m_SelectCols.setRanges(rangeList);
}
/**
* Set which attributes are to be processed
*
* @param attributes an array containing indexes of attributes to select.
* Since the array will typically come from a program, attributes are indexed
* from 0.
*/
public void setAttributeIndicesArray(int [] attributes) {
    // Convert the index array to a range string, then reuse the
    // string-based setter.
    final String rangeList = Range.indicesToRangeList(attributes);
    setAttributeIndices(rangeList);
}
}