/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Appender.java
* Copyright (C) 2011-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.gui.beans;
import java.awt.BorderLayout;
import java.beans.EventSetDescriptor;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import javax.swing.JPanel;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ArffLoader;
import weka.core.converters.ArffSaver;
import weka.core.converters.SerializedInstancesLoader;
import weka.gui.Logger;
/**
* A bean that appends multiple incoming data connections into a single
* data set. The incoming connections can be either all instance connections
* or all batch-oriented connections (i.e. data set, training set and test set).
* Instance and batch connections can't be mixed. An amalgamated output
* is created that is a combination of all the incoming attributes. Missing values
* are used to fill columns that don't exist in a particular incoming data set.
* If all incoming connections are instance connections, then the outgoing
* connection must be an instance connection (and vice versa for incoming
* batch connections).
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 8034 $
*/
@KFStep(category = "Tools", toolTipText = "Append multiple sets of instances")
public class Appender extends JPanel implements BeanCommon, Visible,
Serializable, DataSource, DataSourceListener, TrainingSetListener,
TestSetListener, InstanceListener, EventConstraints {
/**
* For serialization
*/
private static final long serialVersionUID = 9177433051794199463L;
/** Logging */
protected transient Logger m_log;
/** Upstream components sending us data */
protected Set<String> m_listeneeTypes = new HashSet<String>();
protected Map<Object, Object> m_listenees = new HashMap<Object, Object>();
/**
* Used to keep track
* of how many have sent us complete data sets (batch) or structure available
* events (incremental) so far + store headers from
* each
*/
protected transient Map<Object, Instances> m_completed;
/** Handles on temp files used to store batches of instances in batch mode */
protected transient Map<Object, File> m_tempBatchFiles;
/** Used to hold the final header in the case of incremental operation */
protected transient Instances m_completeHeader;
/**
* Holds savers used for incrementally saving incoming instance streams.
* After we've seen the structure from each incoming connection we can
* create the final output structure, pull any saved instances from the temp
* files and discard these savers as they will no longer be needed.
*/
protected transient Map<Object, ArffSaver> m_incrementalSavers;
/** Instance event to use for incremental mode */
protected InstanceEvent m_ie = new InstanceEvent(this);
/** Keeps track of how many incoming instance streams have finished */
protected int m_finishedCount;
/** For printing status updates in incremental mode */
protected transient int m_incrementalCounter;
/** True if we are busy */
protected boolean m_busy;
/**
* Default visual for data sources
*/
protected BeanVisual m_visual =
new BeanVisual("Appender",
BeanVisual.ICON_PATH+"DefaultFilter.gif",
BeanVisual.ICON_PATH+"DefaultFilter_animated.gif");
/** Downstream steps listening to batch data events */
protected ArrayList<DataSourceListener> m_dataListeners =
new ArrayList<DataSourceListener>();
/** Downstream steps listening to instance events */
protected ArrayList<InstanceListener> m_instanceListeners =
new ArrayList<InstanceListener>();
/**
* Constructs a new Appender.
*/
public Appender() {
useDefaultVisual();
setLayout(new BorderLayout());
add(m_visual, BorderLayout.CENTER);
}
/**
* Returns true if, at the current time, the named event could be
* generated.
*
* @param eventName the name of the event in question
* @return true if the named event could be generated
*/
public boolean eventGeneratable(String eventName) {
if (!m_listeneeTypes.contains(eventName)) {
return false;
}
for (Object listenee : m_listenees.values()) {
if (listenee instanceof EventConstraints) {
if (!((EventConstraints)listenee).eventGeneratable(eventName)) {
return false;
}
}
}
return true;
}
/**
* Accept and process an instance event
*
* @param e an <code>InstanceEvent</code> value
*/
public synchronized void acceptInstance(InstanceEvent e) {
m_busy = true;
if (m_completed == null) {
m_completed = new HashMap<Object, Instances>();
// until we have a header from each incoming connection, we'll have
// to store instances to temp files. If sequential start points are
// being used, or the operation of the flow results in all instances
// from one input path getting passed in before any subsequent input
// paths are processed, then this will be inefficient. Parallel start
// points will be most efficient
m_incrementalSavers = new HashMap<Object, ArffSaver>();
m_finishedCount = 0;
m_incrementalCounter = 0;
}
if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) {
// reset if we get a new start of stream from one of streams that
// we've seen a FORMAT_AVAILABLE from previously
if (m_completed.containsKey(e.getSource())) {
if (m_log != null) {
String msg = statusMessagePrefix() + "Resetting appender.";
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + msg + " New start of stream detected before " +
"all incoming streams have finished!");
}
m_completed = new HashMap<Object, Instances>();
m_incrementalSavers = new HashMap<Object, ArffSaver>();
m_incrementalCounter = 0;
m_completeHeader = null;
m_finishedCount = 0;
}
m_completed.put(e.getSource(), e.getStructure());
if (m_completed.size() == m_listenees.size()) {
// create mondo header...
try {
if (m_log != null) {
String msg = statusMessagePrefix() + "Making output header";
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + msg);
}
m_completeHeader = makeOutputHeader();
// notify listeners of output format
m_ie.setStructure(m_completeHeader);
notifyInstanceListeners(m_ie);
// now check for any buffered instances...
if (m_incrementalSavers.size() > 0) {
// read in and convert these instances now
for (ArffSaver s : m_incrementalSavers.values()) {
// finish off the saving process first
s.writeIncremental(null);
File tmpFile = s.retrieveFile();
ArffLoader loader = new ArffLoader();
loader.setFile(tmpFile);
Instances tempStructure = loader.getStructure();
Instance tempLoaded = loader.getNextInstance(tempStructure);
while (tempLoaded != null) {
Instance converted = makeOutputInstance(m_completeHeader, tempLoaded);
m_ie.setStatus(InstanceEvent.INSTANCE_AVAILABLE);
m_ie.setInstance(converted);
notifyInstanceListeners(m_ie);
m_incrementalCounter++;
if (m_incrementalCounter % 10000 == 0) {
if (m_log != null) {
m_log.statusMessage(statusMessagePrefix()
+ "Processed " + m_incrementalCounter + " instances");
}
}
tempLoaded = loader.getNextInstance(tempStructure);
}
}
m_incrementalSavers.clear();
}
} catch (Exception e1) {
String msg = statusMessagePrefix() + "ERROR: unable to create output instances structure.";
if (m_log != null) {
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + e1.getMessage());
}
stop();
e1.printStackTrace();
m_busy = false;
return;
}
}
m_busy = false;
return;
}
if (e.getStatus() == InstanceEvent.BATCH_FINISHED ||
e.getStatus() == InstanceEvent.INSTANCE_AVAILABLE) {
// get the instance (if available)
Instance currentI = e.getInstance();
if (m_completeHeader == null) {
if (currentI != null) {
// save this instance to a temp file
ArffSaver saver = m_incrementalSavers.get(e.getSource());
if (saver == null) {
saver = new ArffSaver();
try {
File tmpFile = File.createTempFile("weka", ".arff");
saver.setFile(tmpFile);
saver.setRetrieval(weka.core.converters.Saver.INCREMENTAL);
saver.setInstances(new Instances(currentI.dataset(), 0));
m_incrementalSavers.put(e.getSource(), saver);
} catch (IOException e1) {
stop();
e1.printStackTrace();
String msg = statusMessagePrefix() + "ERROR: unable to save instance to temp file";
if (m_log != null) {
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + e1.getMessage());
}
m_busy = false;
return;
}
}
try {
saver.writeIncremental(currentI);
if (e.getStatus() == InstanceEvent.BATCH_FINISHED) {
m_finishedCount++;
}
} catch (IOException e1) {
stop();
e1.printStackTrace();
String msg = statusMessagePrefix() + "ERROR: unable to save instance to temp file";
if (m_log != null) {
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + e1.getMessage());
}
m_busy = false;
return;
}
}
} else {
if (currentI != null) {
int code = InstanceEvent.INSTANCE_AVAILABLE;
if (e.getStatus() == InstanceEvent.BATCH_FINISHED) {
m_finishedCount++;
if (m_finishedCount == m_listenees.size()) {
// We're all done!
code = InstanceEvent.BATCH_FINISHED;
}
}
// convert instance and output immediately
Instance newI = makeOutputInstance(m_completeHeader, currentI);
m_ie.setStatus(code);
m_ie.setInstance(newI);
notifyInstanceListeners(m_ie);
m_incrementalCounter++;
if (m_incrementalCounter % 10000 == 0) {
if (m_log != null) {
m_log.statusMessage(statusMessagePrefix()
+ "Processed " + m_incrementalCounter + " instances");
}
}
if (code == InstanceEvent.BATCH_FINISHED) {
if (m_log != null) {
m_log.statusMessage(statusMessagePrefix() + "Finished");
}
m_completed = null;
m_incrementalSavers = null;
m_incrementalCounter = 0;
m_completeHeader = null;
m_finishedCount = 0;
}
}
}
}
m_busy = false;
}
/**
* Accept and process a test set event
*
* @param e a <code>TestSetEvent</code> value
*/
public void acceptTestSet(TestSetEvent e) {
DataSetEvent de = new DataSetEvent(e.getSource(), e.getTestSet());
acceptDataSet(de);
}
/**
* Accept and process a training set event
*
* @param e a <code>TrainingSetEvent</code> value
*/
public void acceptTrainingSet(TrainingSetEvent e) {
DataSetEvent de = new DataSetEvent(e.getSource(), e.getTrainingSet());
acceptDataSet(de);
}
/**
* Accept and process a data set event
*
* @param e a <code>DataSetEvent</code> value
*/
public synchronized void acceptDataSet(DataSetEvent e) {
m_busy = true;
if (m_completed == null) {
// new batch of batches
m_completed = new HashMap<Object, Instances>();
m_tempBatchFiles = new HashMap<Object, File>();
}
// who is this that's sent us data?
Object source = e.getSource();
if (m_completed.containsKey(source)) {
// Can't accept more than one data set from a particular source
if (m_log != null && !e.isStructureOnly()) {
String msg = statusMessagePrefix() + "Resetting appender.";
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + msg + " New batch for an incoming connection " +
"detected before " +
"all incoming connections have sent data!");
}
m_completed = new HashMap<Object, Instances>();
m_tempBatchFiles = new HashMap<Object, File>();
}
Instances header = new Instances(e.getDataSet(), 0);
m_completed.put(source, header);
// write these instances (serialized) to a tmp file.
try {
File tmpF = File.createTempFile("weka", SerializedInstancesLoader.FILE_EXTENSION);
tmpF.deleteOnExit();
ObjectOutputStream oos =
new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(tmpF)));
oos.writeObject(e.getDataSet());
oos.flush();
oos.close();
m_tempBatchFiles.put(source, tmpF);
} catch (IOException e1) {
stop();
e1.printStackTrace();
String msg = statusMessagePrefix() + "ERROR: unable to save batch instances to temp file";
if (m_log != null) {
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + e1.getMessage());
}
m_busy = false;
return;
}
// check to see if we've had one from everyone.
// Not much we can do if one source fails somewhere - won't know this fact...
if (m_completed.size() == m_listenees.size()) {
// process all headers and create mongo header for new output.
// missing values will fill columns that don't exist in particular data
// sets
try {
Instances output = makeOutputHeader();
if (m_log != null) {
String msg = statusMessagePrefix() + "Making output header";
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + msg);
}
for (File f : m_tempBatchFiles.values()) {
ObjectInputStream ois =
new ObjectInputStream(new BufferedInputStream(new FileInputStream(f)));
Instances temp = (Instances)ois.readObject();
ois.close();
// copy each instance over
for (int i = 0; i < temp.numInstances(); i++) {
Instance converted = makeOutputInstance(output, temp.instance(i));
output.add(converted);
}
}
DataSetEvent d = new DataSetEvent(this, output);
notifyDataListeners(d);
} catch (Exception ex) {
stop();
ex.printStackTrace();
String msg = statusMessagePrefix() + "ERROR: unable to output appended data set";
if (m_log != null) {
m_log.statusMessage(msg);
m_log.logMessage("[Appender] " + ex.getMessage());
}
}
// finished
m_completed = null;
m_tempBatchFiles = null;
if (m_log != null) {
m_log.statusMessage(statusMessagePrefix() + "Finished");
}
}
m_busy = false;
}
private Instance makeOutputInstance(Instances output,
Instance source) {
double[] newVals = new double[output.numAttributes()];
for (int i = 0; i < newVals.length; i++) {
newVals[i] = Utils.missingValue();
}
for (int i = 0; i < source.numAttributes(); i++) {
if (!source.isMissing(i)) {
Attribute s = source.attribute(i);
int outputIndex = output.attribute(s.name()).index();
if (s.isNumeric()) {
newVals[outputIndex] = source.value(s);
} else if (s.isString()) {
String sVal = source.stringValue(s);
newVals[outputIndex] =
output.attribute(outputIndex).addStringValue(sVal);
} else if (s.isRelationValued()) {
Instances rVal = source.relationalValue(s);
newVals[outputIndex] =
output.attribute(outputIndex).addRelation(rVal);
} else if (s.isNominal()) {
String nomVal = source.stringValue(s);
newVals[outputIndex] =
output.attribute(outputIndex).indexOfValue(nomVal);
}
}
}
Instance newInst = new DenseInstance(source.weight(), newVals);
newInst.setDataset(output);
return newInst;
}
private Instances makeOutputHeader() throws Exception {
// process each header in turn...
Map<String, Attribute> attLookup = new HashMap<String, Attribute>();
List<Attribute> attList = new ArrayList<Attribute>();
Map<String, Set<String>> nominalLookups = new HashMap<String, Set<String>>();
for (Instances h : m_completed.values()) {
for (int i = 0; i < h.numAttributes(); i++) {
Attribute a = h.attribute(i);
if (!attLookup.containsKey(a.name())) {
attLookup.put(a.name(), a);
attList.add(a);
if (a.isNominal()) {
TreeSet<String> nVals = new TreeSet<String>();
for (int j = 0; j < a.numValues(); j++) {
nVals.add(a.value(j));
}
nominalLookups.put(a.name(), nVals);
}
} else {
Attribute storedVersion = attLookup.get(a.name());
if (storedVersion.type() != a.type()) {
// mismatched types between headers - can't continue
throw new Exception("Conflicting types for attribute " +
"name '" + a.name() + "' between incoming " +
"instance sets");
}
if (storedVersion.isNominal()) {
Set<String> storedVals = nominalLookups.get(a.name());
for (int j = 0; j < a.numValues(); j++) {
storedVals.add(a.value(j));
}
}
}
}
}
ArrayList<Attribute> finalAttList = new ArrayList<Attribute>();
for (Attribute a : attList) {
Attribute newAtt = null;
if (a.isDate()) {
newAtt = new Attribute(a.name(), a.getDateFormat());
} else if (a.isNumeric()) {
newAtt = new Attribute(a.name());
} else if (a.isRelationValued()) {
newAtt = new Attribute(a.name(), a.relation());
} else if (a.isNominal()) {
Set<String> vals = nominalLookups.get(a.name());
List<String> newVals = new ArrayList<String>();
for (String v : vals) {
newVals.add(v);
}
newAtt = new Attribute(a.name(), newVals);
} else if (a.isString()) {
newAtt = new Attribute(a.name(), (List<String>)null);
// transfer all string values
/* for (int i = 0; i < a.numValues(); i++) {
newAtt.addStringValue(a.value(i));
} */
}
finalAttList.add(newAtt);
}
Instances outputHeader = new Instances("Appended_" + m_listenees.size()
+ "_sets", finalAttList, 0);
return outputHeader;
}
/**
* Add a data source listener
*
* @param dsl a <code>DataSourceListener</code> value
*/
public synchronized void addDataSourceListener(DataSourceListener dsl) {
m_dataListeners.add(dsl);
}
/**
* Remove a data source listener
*
* @param dsl a <code>DataSourceListener</code> value
*/
public synchronized void removeDataSourceListener(DataSourceListener dsl) {
m_dataListeners.remove(dsl);
}
/**
* Add an instance listener
*
* @param tsl an <code>InstanceListener</code> value
*/
public synchronized void addInstanceListener(InstanceListener tsl) {
m_instanceListeners.add(tsl);
}
/**
* Remove an instance listener
*
* @param tsl an <code>InstanceListener</code> value
*/
public synchronized void removeInstanceListener(InstanceListener tsl) {
m_instanceListeners.remove(tsl);
}
/**
* Use the default visual representation
*/
public void useDefaultVisual() {
m_visual.loadIcons(BeanVisual.ICON_PATH+"DefaultFilter.gif",
BeanVisual.ICON_PATH+"DefaultFilter_animated.gif");
m_visual.setText("Appender");
}
/**
* Set a new visual representation
*
* @param newVisual a <code>BeanVisual</code> value
*/
public void setVisual(BeanVisual newVisual) {
m_visual = newVisual;
}
/**
* Get the visual representation
*
* @return a <code>BeanVisual</code> value
*/
public BeanVisual getVisual() {
return m_visual;
}
/**
* Set a custom (descriptive) name for this bean
*
* @param name the name to use
*/
public void setCustomName(String name) {
m_visual.setText(name);
}
/**
* Get the custom (descriptive) name for this bean (if one has been set)
*
* @return the custom name (or the default name)
*/
public String getCustomName() {
return m_visual.getText();
}
/**
* Stop any processing that the bean might be doing.
*/
public void stop() {
// tell any upstream listenees to stop
if (m_listenees != null && m_listenees.size() > 0) {
for (Object l : m_listenees.values()) {
if (l instanceof BeanCommon) {
((BeanCommon)l).stop();
}
}
}
m_busy = false;
}
/**
* Returns true if. at this time, the bean is busy with some
* (i.e. perhaps a worker thread is performing some calculation).
*
* @return true if the bean is busy.
*/
public boolean isBusy() {
return m_busy;
}
/**
* Set a logger
*
* @param logger a <code>weka.gui.Logger</code> value
*/
public void setLog(Logger logger) {
m_log = logger;
}
/**
* Returns true if, at this time,
* the object will accept a connection via the named event
*
* @param esd the EventSetDescriptor for the event in question
* @return true if the object will accept a connection
*/
public boolean connectionAllowed(EventSetDescriptor esd) {
return connectionAllowed(esd.getName());
}
/**
* Returns true if, at this time,
* the object will accept a connection via the named event
*
* @param eventName the name of the event
* @return true if the object will accept a connection
*/
public boolean connectionAllowed(String eventName) {
if (!eventName.equals("dataSet") && !eventName.equals("trainingSet") &&
!eventName.equals("testSet") && !eventName.equals("instance")) {
return false;
}
if (m_listeneeTypes.size() == 0) {
return true;
}
if (m_listeneeTypes.contains("instance") && !eventName.equals("instance")) {
return false;
}
if (!m_listeneeTypes.contains("instance") && eventName.equals("instance")) {
return false;
}
return true;
}
/**
* Notify this object that it has been registered as a listener with
* a source for recieving events described by the named event
* This object is responsible for recording this fact.
*
* @param eventName the event
* @param source the source with which this object has been registered as
* a listener
*/
public void connectionNotification(String eventName, Object source) {
if (connectionAllowed(eventName)) {
m_listeneeTypes.add(eventName);
m_listenees.put(source, source);
}
}
/**
* Notify this object that it has been deregistered as a listener with
* a source for named event. This object is responsible
* for recording this fact.
*
* @param eventName the event
* @param source the source with which this object has been registered as
* a listener
*/
public void disconnectionNotification(String eventName, Object source) {
m_listenees.remove(source);
if (m_listenees.size() == 0) {
m_listeneeTypes.clear();
}
}
private String statusMessagePrefix() {
return getCustomName() + "$" + hashCode() + "|";
}
@SuppressWarnings("unchecked")
private void notifyInstanceListeners(InstanceEvent e) {
List<InstanceListener> l;
synchronized (this) {
l = (List<InstanceListener>) m_instanceListeners.clone();
}
if (l.size() > 0) {
for (InstanceListener il : l) {
il.acceptInstance(e);
}
}
}
@SuppressWarnings("unchecked")
private void notifyDataListeners(DataSetEvent e) {
List<DataSourceListener> l;
synchronized (this) {
l = (List<DataSourceListener>) m_dataListeners.clone();
}
if (l.size() > 0) {
for (DataSourceListener ds : l) {
ds.acceptDataSet(e);
}
}
}
}