/*
* This file is part of ALOE.
*
* ALOE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* ALOE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with ALOE. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
*/
package etc.aloe.filters;
import java.io.Serializable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import weka.core.Capabilities.Capability;
import weka.core.*;
import weka.filters.SimpleStreamFilter;
import weka.filters.UnsupervisedFilter;
/**
 * Abstract class representing a Weka filter that detects occurrences of regular
 * expressions in a specific string field.
 *
 * <p>Subclasses supply the expressions through {@link #getRegexFeatures()}.
 * For every expression one attribute is appended to the output format holding
 * the number of matches found in the configured string attribute. When
 * {@link #setCountRegexLengths(boolean)} is enabled, a second attribute named
 * {@code <feature name>_L} is appended directly after it, holding the length
 * of the longest match.
 *
 * @author Michael Brooks <mjbrooks@uw.edu>
 */
public abstract class AbstractRegexFilter extends SimpleStreamFilter
        implements UnsupervisedFilter {

    /** Whether a "longest match length" attribute is emitted per regex. */
    private boolean countRegexLengths = false;

    /**
     * Set to true to add features for the length of the regex match. Defaults
     * to false.
     *
     * @param countRegexLengths true to emit an additional {@code _L} attribute
     * for every regex feature
     */
    public void setCountRegexLengths(boolean countRegexLengths) {
        this.countRegexLengths = countRegexLengths;
    }

    /**
     * A mapping entity between names (feature names) and regular expressions.
     * The expression is compiled once, at construction time.
     */
    protected static class NamedRegex implements Serializable {

        private final String name;
        private final String regex;
        private final Pattern pattern;

        /**
         * Creates a named regex compiled without any flags.
         *
         * @param name the feature (attribute) name
         * @param regex the regular expression source
         */
        public NamedRegex(String name, String regex) {
            this(name, regex, 0);
        }

        /**
         * Creates a named regex compiled with the given flags.
         *
         * @param name the feature (attribute) name
         * @param regex the regular expression source
         * @param flags match flags as accepted by
         * {@link Pattern#compile(String, int)}
         */
        public NamedRegex(String name, String regex, int flags) {
            this.name = name;
            this.regex = regex;
            this.pattern = Pattern.compile(regex, flags);
        }

        /**
         * @return the feature name
         */
        public String getName() {
            return name;
        }

        /**
         * @return the regular expression source this entry was built from
         */
        public String getRegex() {
            return regex;
        }

        /**
         * @return the compiled pattern
         */
        public Pattern getPattern() {
            return pattern;
        }
    }

    /** Name of the attribute whose text is searched. */
    private String stringAttributeName;

    /** Index of that attribute; negative until the output format is determined. */
    private int stringAttributeIndex = -1;

    /**
     * Supplies the regex features this filter generates. The order of the
     * returned array determines the order of the appended attributes.
     *
     * @return the named regular expressions to evaluate
     */
    protected abstract NamedRegex[] getRegexFeatures();

    /**
     * @return the name of the attribute that is searched
     */
    public String getStringAttributeName() {
        return stringAttributeName;
    }

    /**
     * Sets the name of the attribute that is searched. Must be set before the
     * output format is determined.
     *
     * @param stringAttributeName the attribute name
     */
    public void setStringAttributeName(String stringAttributeName) {
        this.stringAttributeName = stringAttributeName;
    }

    /**
     * Enables all attribute and class types, and additionally allows data
     * without a class attribute.
     *
     * @return the capabilities of this filter
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.enableAllAttributes();
        result.enableAllClasses();
        // The filter doesn't need a class to be set.
        result.enable(Capability.NO_CLASS);
        return result;
    }

    /**
     * Builds the output format: a copy of the input header followed by one
     * attribute per regex feature (and, if length counting is enabled, an
     * extra {@code _L} attribute right after each of them). Also records the
     * index of the searched attribute for use by {@link #process(Instance)}.
     *
     * @param inputFormat the format of the incoming data
     * @return the format of the data this filter produces
     * @throws IllegalStateException if the string attribute name has not been
     * set, or no attribute of that name exists in the input format
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
        if (stringAttributeName == null) {
            throw new IllegalStateException("String attribute name not set");
        }

        Attribute stringAttr = inputFormat.attribute(stringAttributeName);
        if (stringAttr == null) {
            // The lookup yields null for an unknown name; fail with a useful
            // message instead of a bare NullPointerException.
            throw new IllegalStateException("String attribute '" + stringAttributeName
                    + "' not found in input format");
        }
        stringAttributeIndex = stringAttr.index();

        Instances outputFormat = new Instances(inputFormat, 0);

        // Add the new columns. There is one for each regex feature.
        for (NamedRegex feature : getRegexFeatures()) {
            String name = feature.getName();
            outputFormat.insertAttributeAt(new Attribute(name), outputFormat.numAttributes());
            if (countRegexLengths) {
                outputFormat.insertAttributeAt(new Attribute(name + "_L"),
                        outputFormat.numAttributes());
            }
        }
        return outputFormat;
    }

    /**
     * Converts one input instance into an output instance: the original
     * values are copied over and the regex match counts (plus longest match
     * lengths, if enabled) are appended.
     *
     * @param instance the instance to convert
     * @return a new sparse instance laid out according to the output format
     * @throws IllegalStateException if the output format has not been
     * determined yet, i.e. the string attribute index is unknown
     */
    @Override
    protected Instance process(Instance instance) throws Exception {
        if (stringAttributeIndex < 0) {
            throw new IllegalStateException("String attribute not set");
        }

        // A missing value carries no text. Weka reports it through the "?"
        // placeholder, which must not be fed to the regexes, so search an
        // empty string instead (every count and length is then 0).
        String stringValue = instance.isMissing(stringAttributeIndex)
                ? ""
                : instance.stringValue(stringAttributeIndex);

        NamedRegex[] regexFeatures = getRegexFeatures();
        int numOldValues = instance.numAttributes();

        // Each feature occupies one slot (count), or two (count, max length).
        int slotsPerFeature = countRegexLengths ? 2 : 1;
        double[] newValues = new double[numOldValues + regexFeatures.length * slotsPerFeature];

        // Copy all attributes from input to output
        Instances inputFormat = getInputFormat();
        for (int i = 0; i < inputFormat.numAttributes(); i++) {
            if (inputFormat.attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    newValues[i] = instance.value(i);
                }
            } else if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                Attribute outputAttr = outputFormatPeek().attribute(i);
                if (outputAttr.numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputAttr.addStringValue("Hack to defeat SparseInstance bug");
                }
                newValues[i] = outputAttr.addStringValue(instance.stringValue(i));
            }
        }

        // Evaluate every regex against the text and fill in the new slots.
        for (int i = 0; i < regexFeatures.length; i++) {
            Matcher matcher = regexFeatures[i].getPattern().matcher(stringValue);
            int count = 0;
            int maxLength = 0;
            while (matcher.find()) {
                count++;
                maxLength = Math.max(maxLength, matcher.group().length());
            }

            int index = numOldValues + slotsPerFeature * i;
            newValues[index] = count;
            if (countRegexLengths) {
                newValues[index + 1] = maxLength;
            }
        }

        return new SparseInstance(instance.weight(), newValues);
    }

    /**
     * @return a short description of this filter
     */
    @Override
    public String globalInfo() {
        return "Generates a set of attributes from a string attribute. Each new attribute is defined by a regular expression.";
    }

    /**
     * Combines an array of string fragments into a regex-compatible string
     * using the alternative symbol: "|" All fragments are escaped.
     *
     * @param fragments the literal strings to combine
     * @return a regular expression matching any one of the fragments
     */
    protected String toRegex(String[] fragments) {
        return toRegex(fragments, true);
    }

    /**
     * Combines an array of string fragments into a regex-compatible string
     * using the alternative symbol: "|" If escape is true, escapes all special
     * characters in the fragments.
     *
     * @param fragments the strings (or, if escape is false, regex fragments)
     * to combine
     * @param escape true to quote each fragment so it is matched literally
     * @return the fragments joined by "|"; an empty string for an empty array
     */
    protected String toRegex(String[] fragments, boolean escape) {
        StringBuilder builder = new StringBuilder();
        for (String fragment : fragments) {
            String alternative = escape ? Pattern.quote(fragment) : fragment;
            // The separator is only written once something has been appended.
            if (builder.length() > 0) {
                builder.append("|");
            }
            builder.append(alternative);
        }
        return builder.toString();
    }
}