/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * SubstringLabeler.java * Copyright (C) 2011-2012 University of Waikato, Hamilton, New Zealand * */ package weka.gui.beans; import java.awt.BorderLayout; import java.beans.EventSetDescriptor; import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import javax.swing.JPanel; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Environment; import weka.core.EnvironmentHandler; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Range; import weka.core.SerializedObject; import weka.core.Utils; import weka.filters.unsupervised.attribute.Add; import weka.gui.Logger; /** * A bean that finds matches in string attribute values (using either substring * or regular expression matches) and labels the instance (sets the value of * a new attribute) according to the supplied label for the matching rule. The new * label attribute can be either multivalued nominal (if each match rule specified * has an explicit label associated with it) or, binary numeric/nominal to indicate * that one of the match rules has matched or not matched. * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 8106 $ * */ @KFStep(category = "Tools", toolTipText = "Label instances according to substring matches in String attributes") public class SubstringLabeler extends JPanel implements BeanCommon, Visible, Serializable, InstanceListener, TrainingSetListener, TestSetListener, DataSourceListener, EventConstraints, EnvironmentHandler, DataSource { /** * For serialization */ private static final long serialVersionUID = 6297059699297260134L; /** * Inner class encapsulating the logic for matching * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) */ protected static class Match { /** The substring literal/regex to use for matching */ protected String m_match = ""; protected String m_label = ""; /** True if a regular expression match is to be used */ protected boolean m_regex; /** True if case should be ignored when matching */ protected boolean m_ignoreCase; /** Precompiled regex pattern */ protected Pattern m_regexPattern; /** The attributes to apply the match-replace rule to */ protected String m_attsToApplyTo = ""; protected String m_matchS; protected String m_labelS; protected int[] m_selectedAtts; protected String m_statusMessagePrefix; protected Logger m_logger; /** * Constructor */ public Match() { } /** * Constructor * * @param setup an internally encoded representation of * all the match information for this rule */ public Match(String setup) { parseFromInternal(setup); } /** * Constructor * * @param match the match string * @param regex true if this is a regular expression match * @param ignoreCase true if case is to be ignored * @param selectedAtts the attributes to apply the rule to */ public Match(String match, boolean regex, boolean ignoreCase, String selectedAtts) { m_match = match; m_regex = regex; m_ignoreCase = ignoreCase; m_attsToApplyTo = selectedAtts; } protected void parseFromInternal(String setup) { String[] parts = setup.split("@@MR@@"); if (parts.length < 4 || parts.length > 5) { throw new IllegalArgumentException("Malformed match definition: " + setup); } m_attsToApplyTo = parts[0].trim(); m_regex = parts[1].trim().toLowerCase().equals("t"); m_ignoreCase = parts[2].trim().toLowerCase().equals("t"); m_match = parts[3].trim(); if (m_match == null || m_match.length() == 0) { throw new IllegalArgumentException("Must provide something to match!"); } if (parts.length == 5) { m_label = parts[4].trim(); } } /** * Set the string/regex to use for matching * * @param match the match string */ public void setMatch(String match) { m_match = match; } /** * Get the string/regex to use for matching * * @return the match string */ public String getMatch() { return m_match; } /** * Set the label to assign if this rule matches, or * empty string if binary flag attribute is being created. * * @param label the label string or empty string */ public void setLabel(String label) { m_label = label; } /** * Get the label to assign if this rule matches, or * empty string if binary flag attribute is being created. * * @return the label string or empty string */ public String getLabel() { return m_label; } /** * Set whether this is a regular expression match or not * * @param regex true if this is a regular expression match */ public void setRegex(boolean regex) { m_regex = regex; } /** * Get whether this is a regular expression match or not * * @return true if this is a regular expression match */ public boolean getRegex() { return m_regex; } /** * Set whether to ignore case when matching * * @param ignore true if case is to be ignored */ public void setIgnoreCase(boolean ignore) { m_ignoreCase = ignore; } /** * Get whether to ignore case when matching * * @return true if case is to be ignored */ public boolean getIgnoreCase() { return m_ignoreCase; } /** * Set the attributes to apply the rule to * * @param a the attributes to apply the rule to. */ public void setAttsToApplyTo(String a) { m_attsToApplyTo = a; } /** * Get the attributes to apply the rule to * * @return the attributes to apply the rule to. */ public String getAttsToApplyTo() { return m_attsToApplyTo; } /** * Initialize this match rule by substituting any * environment variables in the attributes, match and label * strings. Sets up the attribute indices to apply to and * validates that the selected attributes are all String * attributes * * @param env the environment variables * @param structure the structure of the incoming instances */ public void init(Environment env, Instances structure) { m_matchS = m_match; m_labelS = m_label; String attsToApplyToS = m_attsToApplyTo; try { m_matchS = env.substitute(m_matchS); m_labelS = env.substitute(m_labelS); attsToApplyToS = env.substitute(attsToApplyToS); } catch (Exception ex) {} if (m_regex) { String match = m_matchS; if (m_ignoreCase) { match = match.toLowerCase(); } // precompile regular expression for speed m_regexPattern = Pattern.compile(match); } // Try a range first for the attributes String tempRangeS = attsToApplyToS; tempRangeS = tempRangeS.replace("/first", "first").replace("/last", "last"); Range tempR = new Range(); tempR.setRanges(attsToApplyToS); try { tempR.setUpper(structure.numAttributes() - 1); m_selectedAtts = tempR.getSelection(); } catch (IllegalArgumentException ex) { // probably contains attribute names then m_selectedAtts = null; } if (m_selectedAtts == null) { // parse the comma separated list of attribute names Set<Integer> indexes = new HashSet<Integer>(); String[] attParts = m_attsToApplyTo.split(","); for (String att : attParts) { att = att.trim(); if (att.toLowerCase().equals("/first")) { indexes.add(0); } else if (att.toLowerCase().equals("/last")) { indexes.add((structure.numAttributes() - 1)); } else { // try and find attribute if (structure.attribute(att) != null) { indexes.add(new Integer(structure.attribute(att).index())); } else { if (m_logger != null) { String msg = m_statusMessagePrefix + "Can't find attribute '" + att + "in the incoming instances - ignoring"; m_logger.logMessage(msg); } } } } m_selectedAtts = new int[indexes.size()]; int c = 0; for (Integer i : indexes) { m_selectedAtts[c++] = i.intValue(); } } // validate the types of the selected atts Set<Integer> indexes = new HashSet<Integer>(); for (int i = 0; i < m_selectedAtts.length; i++) { if (structure.attribute(m_selectedAtts[i]).isString()) { indexes.add(m_selectedAtts[i]); } else { if (m_logger != null) { String msg = m_statusMessagePrefix + "Attribute '" + structure.attribute(m_selectedAtts[i]).name() + "is not a string attribute - " + "ignoring"; m_logger.logMessage(msg); } } } // final array m_selectedAtts = new int[indexes.size()]; int c = 0; for (Integer i : indexes) { m_selectedAtts[c++] = i.intValue(); } } /** * Apply this rule to the supplied instance * * @param inst the instance to apply to * * @return the label (or empty string) if this rule * matches (empty string is used to indicate a match * in the case that a binary flag attribute is being * created), or null if the rule doesn't match. */ public String apply(Instance inst) { for (int i = 0; i < m_selectedAtts.length; i++) { if (!inst.isMissing(m_selectedAtts[i])) { String value = inst.stringValue(m_selectedAtts[i]); String result = apply(value); if (result != null) { // first match is good enough return result; } } } return null; } /** * Apply this rule to the supplied string * * @param source the string to apply to * @return the label (or empty string) if this rule * matches (empty string is used to indicate a match * in the case that a binary flag attribute is being * created), or null if the rule doesn't match. */ protected String apply(String source) { String result = source; String match = m_matchS; boolean ruleMatches = false; if (m_ignoreCase) { result = result.toLowerCase(); match = match.toLowerCase(); } if (result != null && result.length() > 0) { if (m_regex) { if (m_regexPattern.matcher(result).matches()) { //if (result.matches(match)) { ruleMatches = true; } } else { ruleMatches = (result.indexOf(match) >= 0); } } return (ruleMatches) ? m_label : null; } /** * Return a textual description of this match rule * * @return a textual description of this match rule */ public String toString() { // return a nicely formatted string for display // that shows all the details StringBuffer buff = new StringBuffer(); buff.append((m_regex) ? "Regex: " : "Substring: "); buff.append(m_match).append(" "); buff.append((m_ignoreCase) ? "[ignore case]" : "").append(" "); if (m_label != null && m_label.length() > 0) { buff.append("Label: ").append(m_label).append(" "); } buff.append("[Atts: " + m_attsToApplyTo + "]"); return buff.toString(); } protected String toStringInternal() { // return a string in internal format that is // easy to parse all the data out of StringBuffer buff = new StringBuffer(); buff.append(m_attsToApplyTo).append("@@MR@@"); buff.append((m_regex) ? "t" : "f").append("@@MR@@"); buff.append((m_ignoreCase) ? "t" : "f").append("@@MR@@"); buff.append(m_match).append("@@MR@@"); buff.append(m_label); return buff.toString(); } } /** Environment variables */ protected transient Environment m_env; /** Internally encoded list of match rules */ protected String m_matchDetails = ""; /** Temporary list of match-replace rules */ protected transient List<Match> m_matchRules; /** Logging */ protected transient Logger m_log; /** Busy indicator */ protected transient boolean m_busy; /** Component talking to us */ protected Object m_listenee; /** Downstream steps listening to instance events */ protected ArrayList<InstanceListener> m_instanceListeners = new ArrayList<InstanceListener>(); /** Downstream steps listening to data set events */ protected ArrayList<DataSourceListener> m_dataListeners = new ArrayList<DataSourceListener>(); /** * Whether to make the binary match/non-match attribute * a nominal (rather than numeric) binary attribute. */ protected boolean m_nominalBinary; /** * For multi-valued labeled rules, whether or not to consume * non-matching instances or output them with missing value * for the match attribute. */ protected boolean m_consumeNonMatchingInstances; /** * Whether the match rules all have labels or not. If not, then the * new attribute is a binary match/no-match one */ protected boolean m_hasLabels; /** Add filter for adding the new attribute */ protected Add m_addFilter; /** Name of the new attribute */ protected String m_attName = "Match"; /** The output structure */ protected Instances m_outputStructure; /** Instance event to use */ protected InstanceEvent m_ie = new InstanceEvent(this); /** * Default visual filters */ protected BeanVisual m_visual = new BeanVisual("SubstringLabeler", BeanVisual.ICON_PATH+"DefaultFilter.gif", BeanVisual.ICON_PATH+"DefaultFilter_animated.gif"); /** * Constructor */ public SubstringLabeler() { useDefaultVisual(); setLayout(new BorderLayout()); add(m_visual, BorderLayout.CENTER); m_env = Environment.getSystemWide(); } /** * Help information suitable for displaying in the GUI. * * @return a description of this component */ public String globalInfo() { return "Matches substrings in String attributes using " + "either literal or regular expression matches. " + "The value of a new attribute is set to reflect" + " the status of the match. The new attribute can " + "be either binary (in which case values indicate " + "match or no match) or multi-valued nominal, " + "in which case a label must be associated with each " + "distinct matching rule. In the case of labeled matches, " + "the user can opt to have non matching instances output " + "with missing value set for the new attribute or not" + " output at all (i.e. consumed by the step)."; } /** * Set internally encoded list of match rules * * @param details the list of match rules */ public void setMatchDetails(String details) { m_matchDetails = details; } /** * Get the internally encoded list of match rules * * @return the match rules */ public String getMatchDetails() { return m_matchDetails; } /** * Set whether the new attribute created should be a nominal binary * attribute rather than a numeric binary attribute. * * @param nom true if the attribute should be a nominal binary one */ public void setNominalBinary(boolean nom) { m_nominalBinary = nom; } /** * Get whether the new attribute created should be a nominal binary * attribute rather than a numeric binary attribute. * * @return true if the attribute should be a nominal binary one */ public boolean getNominalBinary() { return m_nominalBinary; } /** * Set whether instances that do not match any of the rules should be * "consumed" rather than output with a missing value set for the new * attribute. * * @param consume true if non matching instances should be consumed by * the component. */ public void setConsumeNonMatching(boolean consume) { m_consumeNonMatchingInstances = consume; } /** * Get whether instances that do not match any of the rules should be * "consumed" rather than output with a missing value set for the new * attribute. * * @return true if non matching instances should be consumed by * the component. */ public boolean getConsumeNonMatching() { return m_consumeNonMatchingInstances; } public void setMatchAttributeName(String name) { m_attName = name; } public String getMatchAttributeName() { return m_attName; } /** * Add a datasource listener * * @param dsl the datasource listener to add */ public void addDataSourceListener(DataSourceListener dsl) { m_dataListeners.add(dsl); } /** * Remove a datasource listener * * @param dsl the datasource listener to remove */ public void removeDataSourceListener(DataSourceListener dsl) { m_dataListeners.remove(dsl); } /** * Add an instance listener * * @param dsl the instance listener to add */ public void addInstanceListener(InstanceListener dsl) { m_instanceListeners.add(dsl); } /** * Remove an instance listener * * @param dsl the instance listener to remove */ public void removeInstanceListener(InstanceListener dsl) { m_instanceListeners.remove(dsl); } /** * Set environment variables to use */ public void setEnvironment(Environment env) { m_env = env; } /** * Returns true if, at the current time, the named event could be * generated. * * @param eventName the name of the event in question * @return true if the named event could be generated */ public boolean eventGeneratable(String eventName) { if (m_listenee == null) { return false; } if (!eventName.equals("instance") && !eventName.equals("dataSet")) { return false; } if (m_listenee instanceof DataSource) { if (m_listenee instanceof EventConstraints) { EventConstraints ec = (EventConstraints)m_listenee; return ec.eventGeneratable(eventName); } } if (m_listenee instanceof TrainingSetProducer) { if (m_listenee instanceof EventConstraints) { EventConstraints ec = (EventConstraints)m_listenee; if (!eventName.equals("dataSet")) { return false; } if (!ec.eventGeneratable("trainingSet")) { return false; } } } if (m_listenee instanceof TestSetProducer) { if (m_listenee instanceof EventConstraints) { EventConstraints ec = (EventConstraints)m_listenee; if (!eventName.equals("dataSet")) { return false; } if (!ec.eventGeneratable("testSet")) { return false; } } } return true; } /** * Use the default visual representation */ public void useDefaultVisual() { m_visual.loadIcons(BeanVisual.ICON_PATH+"DefaultFilter.gif", BeanVisual.ICON_PATH+"DefaultFilter_animated.gif"); m_visual.setText("SubstringLabeler"); } /** * Set a new visual representation * * @param newVisual a <code>BeanVisual</code> value */ public void setVisual(BeanVisual newVisual) { m_visual = newVisual; } /** * Get the visual representation * * @return a <code>BeanVisual</code> value */ public BeanVisual getVisual() { return m_visual; } /** * Set a custom (descriptive) name for this bean * * @param name the name to use */ public void setCustomName(String name) { m_visual.setText(name); } /** * Get the custom (descriptive) name for this bean (if one has been set) * * @return the custom name (or the default name) */ public String getCustomName() { return m_visual.getText(); } /** * Stop any processing that the bean might be doing. */ public void stop() { if (m_listenee != null) { if (m_listenee instanceof BeanCommon) { ((BeanCommon)m_listenee).stop(); } } if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Stopped"); } m_busy = false; } /** * Returns true if. at this time, the bean is busy with some * (i.e. perhaps a worker thread is performing some calculation). * * @return true if the bean is busy. */ public boolean isBusy() { return m_busy; } /** * Set a logger * * @param logger a <code>weka.gui.Logger</code> value */ public void setLog(Logger logger) { m_log = logger; } /** * Returns true if, at this time, * the object will accept a connection via the named event * * @param esd the EventSetDescriptor for the event in question * @return true if the object will accept a connection */ public boolean connectionAllowed(EventSetDescriptor esd) { return connectionAllowed(esd.getName()); } /** * Returns true if, at this time, * the object will accept a connection via the named event * * @param eventName the name of the event * @return true if the object will accept a connection */ public boolean connectionAllowed(String eventName) { if (!eventName.equals("instance") && !eventName.equals("dataSet") && !eventName.equals("trainingSet") && !eventName.equals("testSet")) { return false; } if (m_listenee != null) { return false; } return true; } /** * Notify this object that it has been registered as a listener with * a source for receiving events described by the named event * This object is responsible for recording this fact. * * @param eventName the event * @param source the source with which this object has been registered as * a listener */ public void connectionNotification(String eventName, Object source) { if (connectionAllowed(eventName)) { m_listenee = source; } } /** * Notify this object that it has been deregistered as a listener with * a source for named event. This object is responsible * for recording this fact. * * @param eventName the event * @param source the source with which this object has been registered as * a listener */ public void disconnectionNotification(String eventName, Object source) { if (source == m_listenee) { m_listenee = null; } } /** * Make the output instances structure * * @param inputStructure the incoming instances structure * @throws Exception if a problem occurs */ protected void makeOutputStructure(Instances inputStructure) throws Exception { m_matchRules = new ArrayList<Match>(); if (m_matchDetails != null && m_matchDetails.length() > 0) { String[] matchParts = m_matchDetails.split("@@match-rule@@"); for (String p : matchParts) { Match m = new Match(p.trim()); m.m_statusMessagePrefix = statusMessagePrefix(); m.m_logger = m_log; m.init(m_env, inputStructure); m_matchRules.add(m); } int labelCount = 0; //StringBuffer labelList = new StringBuffer(); HashSet<String> uniqueLabels = new HashSet<String>(); FastVector labelVec = new FastVector(); for (Match m : m_matchRules) { if (m.getLabel() != null && m.getLabel().length() > 0) { if (!uniqueLabels.contains(m.getLabel())) { /* if (labelCount > 0) { labelList.append(","); } */ // labelList.append(m.getLabel()); uniqueLabels.add(m.getLabel()); labelVec.addElement(m.getLabel()); } labelCount++; } } if (labelCount > 0) { if (labelCount == m_matchRules.size()) { m_hasLabels = true; } else { throw new Exception("Can't have only some rules with a label!"); } } m_outputStructure = (Instances)(new SerializedObject(inputStructure).getObject()); Attribute newAtt = null; if (m_hasLabels) { newAtt = new Attribute(m_attName, labelVec); } else if (getNominalBinary()) { labelVec.addElement("0"); labelVec.addElement("1"); newAtt = new Attribute(m_attName, labelVec); } else { newAtt = new Attribute(m_attName); } m_outputStructure.insertAttributeAt(newAtt, m_outputStructure.numAttributes()); /* // make the output structure m_addFilter = new Add(); m_addFilter.setAttributeName(m_attName); if (m_hasLabels) { m_addFilter.setNominalLabels(labelList.toString()); } else if (getNominalBinary()) { m_addFilter.setNominalLabels("0,1"); } m_addFilter.setInputFormat(inputStructure); m_outputStructure = Filter.useFilter(inputStructure, m_addFilter); */ return; } m_outputStructure = new Instances(inputStructure); } /** * Accept and process an instance event * * @param e the instance event to process */ public void acceptInstance(InstanceEvent e) { m_busy = true; if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) { Instances structure = e.getStructure(); try { makeOutputStructure(structure); } catch (Exception ex) { String msg = statusMessagePrefix() + "ERROR: unable to create output instances structure."; if (m_log != null) { m_log.statusMessage(msg); m_log.logMessage("[SubstringLabeler] " + ex.getMessage()); } stop(); ex.printStackTrace(); m_busy = false; return; } if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Processing stream..."); } m_ie.setStructure(m_outputStructure); notifyInstanceListeners(m_ie); } else { Instance inst = e.getInstance(); Instance out = null; if (inst != null) { out = makeOutputInstance(inst, false); } if (inst == null || out != null || e.getStatus() == InstanceEvent.BATCH_FINISHED) { // consumed // notify listeners m_ie.setInstance(out); m_ie.setStatus(e.getStatus()); notifyInstanceListeners(m_ie); } if (e.getStatus() == InstanceEvent.BATCH_FINISHED || inst == null) { // we're done if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Finished"); } } } m_busy = false; } /** * Process and input instance and return an output instance * * @param inputI the incoming instance * @param batch whether this is being processed as part of a * batch of instances * * @return the output instance */ protected Instance makeOutputInstance(Instance inputI, boolean batch) { int newAttIndex = m_outputStructure.numAttributes() - 1; Instance result = inputI; if (m_matchRules.size() > 0) { String label = null; for (Match m : m_matchRules) { label = m.apply(inputI); if (label != null) { break; } } double[] vals = new double[m_outputStructure.numAttributes()]; for (int i = 0; i < inputI.numAttributes(); i++) { if (!inputI.attribute(i).isString()) { vals[i] = inputI.value(i); } else { if (!batch) { vals[i] = 0; String v = inputI.stringValue(i); m_outputStructure.attribute(i).setStringValue(v); } else { String v = inputI.stringValue(i); vals[i] = m_outputStructure.attribute(i).addStringValue(v); } } } if (label != null) { if (m_hasLabels) { vals[newAttIndex] = m_outputStructure.attribute(m_attName).indexOfValue(label); } else { vals[newAttIndex] = 1; } } else { // non match if (m_hasLabels) { if (!getConsumeNonMatching()) { vals[newAttIndex] = Utils.missingValue(); } else { return null; } } else { vals[newAttIndex] = 0; } } result = new DenseInstance(1.0, vals); result.setDataset(m_outputStructure); } return result; } /** * Accept and process a data set event * * @param e the data set event to process */ public void acceptDataSet(DataSetEvent e) { m_busy = true; if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Processing batch..."); } try { makeOutputStructure(new Instances(e.getDataSet(), 0)); } catch (Exception ex) { String msg = statusMessagePrefix() + "ERROR: unable to create output instances structure."; if (m_log != null) { m_log.statusMessage(msg); m_log.logMessage("[SubstringLabeler] " + ex.getMessage()); } stop(); ex.printStackTrace(); m_busy = false; return; } Instances toProcess = e.getDataSet(); for (int i = 0; i < toProcess.numInstances(); i++) { Instance current = toProcess.instance(i); Instance result = makeOutputInstance(current, true); if (result != null) { m_outputStructure.add(result); } } if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Finished."); } // notify listeners DataSetEvent d = new DataSetEvent(this, m_outputStructure); notifyDataListeners(d); m_busy = false; } /** * Accept and process a test set event * * @param e the test set event to process */ public void acceptTestSet(TestSetEvent e) { Instances test = e.getTestSet(); DataSetEvent d = new DataSetEvent(this, test); acceptDataSet(d); } /** * Accept and process a training set event * * @parame e the training set event to process */ public void acceptTrainingSet(TrainingSetEvent e) { Instances train = e.getTrainingSet(); DataSetEvent d = new DataSetEvent(this, train); acceptDataSet(d); } @SuppressWarnings("unchecked") private void notifyDataListeners(DataSetEvent e) { List<DataSourceListener> l; synchronized (this) { l = (List<DataSourceListener>) m_dataListeners.clone(); } if (l.size() > 0) { for (DataSourceListener ds : l) { ds.acceptDataSet(e); } } } @SuppressWarnings("unchecked") private void notifyInstanceListeners(InstanceEvent e) { List<InstanceListener> l; synchronized (this) { l = (List<InstanceListener>) m_instanceListeners.clone(); } if (l.size() > 0) { for (InstanceListener il : l) { il.acceptInstance(e); } } } protected String statusMessagePrefix() { return getCustomName() + "$" + hashCode() + "|"; } }