/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.tools;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.DoubleSparseArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.example.table.PolynominalMapping;
import com.rapidminer.example.table.SparseDataRow;
import com.rapidminer.operator.Annotations;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.Ontology;

/** Writes example sets to and reads them back from streams.
 *
 *  TODO: Implement sparse counterpart.
 *
 *  @author Simon Fischer
 */
public class ExampleSetToStream {
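    // Round-trip usage sketch (illustrative only, not part of the class): "exampleSet" is an
    // existing ExampleSet, and "out"/"in" are streams supplied by the caller, e.g. backed by
    // a file or socket. Writer and reader must be constructed with the same version, since
    // the encoding of strings and annotations depends on it.
    //
    //     ExampleSetToStream converter = new ExampleSetToStream(ExampleSetToStream.CURRENT_VERSION);
    //     converter.write(exampleSet, out);           // header (roles, mappings, annotations) + data
    //     ExampleSet restored = converter.read(in);   // rebuilds attributes, roles and annotations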
    /** Original version, used for the RapidMiner 5 beta. */
    public static final int VERSION_1 = 1;

    /** Fixes a problem with long strings in DataOutput.writeUTF(), which restricts the length
     *  to 65k bytes. Used since the RapidMiner 5.0 final release, revision 7197. */
    public static final int VERSION_2 = 2;

    /** Adds support for {@link Annotations}.
     *  Used since revision 7430. */
    public static final int VERSION_3 = 3;

    /** Current version of the stream protocol. To add a new version:
     *  - Add a constant here, and redirect the constant CURRENT_VERSION to the new constant.
     *  - Add the SVN revision to the comment of the new version.
     *  - In {@link SerializationType}, add a new enum constant for the new version and make it the default.
     */
    public static final int CURRENT_VERSION = VERSION_3;

    private static final Charset STRING_CHARSET = Charset.forName("UTF-8");

    public enum ColumnType {
        NOMINAL_BYTE, NOMINAL_SHORT, NOMINAL_INTEGER, DOUBLE, INTEGER;
    }

    public static class Header {

        private final Annotations annotations;
        private final List<AttributeRole> allRoles;
        private final boolean sparse;

        protected Header(Annotations annotations, List<AttributeRole> allRoles, boolean sparse) {
            super();
            this.allRoles = allRoles;
            this.sparse = sparse;
            this.annotations = annotations;
        }

        public List<AttributeRole> getAllRoles() {
            return allRoles;
        }

        public boolean isSparse() {
            return sparse;
        }

        public Annotations getAnnotations() {
            return annotations;
        }
    }

    private int version;

    public ExampleSetToStream(int version) {
        this.version = version;
        if (version != CURRENT_VERSION) {
            LogService.getRoot().warning("Using deprecated example set stream version " + version);
        }
    }

    /** Writes header and data of the example set to the stream. */
    public void write(ExampleSet exampleSet, OutputStream outputStream) throws IOException {
        DataOutputStream out = new DataOutputStream(outputStream);
        List<AttributeRole> allRoles = new LinkedList<AttributeRole>();
        Iterator<AttributeRole> i = exampleSet.getAttributes().allAttributeRoles();
        while (i.hasNext()) {
            allRoles.add(i.next());
        }
        boolean sparse = false;
        // TODO: Remove ugly instanceof check
        if ((exampleSet.size() > 0) && (exampleSet.getExample(0).getDataRow() instanceof SparseDataRow)) {
            sparse = true;
        }
        writeHeader(exampleSet.getAnnotations(), allRoles, out, sparse);
        writeData(exampleSet, out, allRoles, sparse);
        out.flush();
    }

    /** Writes nominals and integers as integers, all other values as doubles.
     *
     *  Iterates over all examples and all attributes:
     *  - For the non-sparse representation, each attribute value is sent as the data type
     *    corresponding to its {@link ColumnType}. For {@link ColumnType#INTEGER}, missing
     *    values are sent as Integer.MIN_VALUE+1 followed by a "true" (boolean); the value
     *    Integer.MIN_VALUE+1 itself is sent as Integer.MIN_VALUE+1 followed by a "false"
     *    (boolean). For nominal columns, missings are encoded as -1 since nominal indices
     *    are always non-negative.
     *  - For the sparse representation, only non-default attribute values are sent, each
     *    prefixed by an int specifying the attribute index. An attribute index of -1
     *    signals the end of an example.
     */
    private void writeData(ExampleSet exampleSet, DataOutputStream out, List<AttributeRole> allRoles, boolean sparse) throws IOException {
        out.writeInt(exampleSet.size());
        ColumnType[] columnTypes = convertToColumnTypes(allRoles);
        for (Example example : exampleSet) {
            int attributeIndex = 0;
            for (AttributeRole role : allRoles) {
                Attribute attribute = role.getAttribute();
                double value = example.getValue(attribute);
                writeDatum(value, attributeIndex, attribute, columnTypes[attributeIndex], out, sparse);
                attributeIndex++;
            }
            if (sparse) {
                // -1 marks the end of this example's values
                out.writeInt(-1);
            }
        }
    }
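    // Sketch of the sparse wire format produced by writeData (hypothetical values, for
    // illustration): an example whose only non-default values are 3.5 in column 0 (DOUBLE)
    // and nominal index 2 in column 4 (NOMINAL_BYTE) is sent as
    //
    //     int 0, double 3.5     // attribute index, then the value in its column type
    //     int 4, byte 2
    //     int -1                // end-of-example marker
    //
    // Default values are never written; the reader fills them in from the per-attribute
    // defaults transmitted in the header.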
    /** Writes the annotations and attribute meta data, including nominal mappings, to the
     *  stream, in the following order:
     *  - the example set annotations ({@link #writeAnnotations(DataOutput, Annotations)})
     *  - the number of attributes to come
     *  - for each attribute:
     *    - name
     *    - special name (empty string if not special!)
     *    - value type name
     *    - block type name
     *    - if nominal, the number of nominal values, and for each nominal value:
     *      - the index
     *      - the string
     *    - the annotations of the attribute
     *  After that follows a boolean indicating whether we are using the sparse format.
     *  If so, all default values are sent as doubles, one per attribute.
     */
    public void writeHeader(Annotations annotations, List<AttributeRole> allAttributes, DataOutputStream out, boolean sparse) throws IOException {
        writeAnnotations(out, annotations);
        out.writeInt(allAttributes.size());
        for (AttributeRole role : allAttributes) {
            Attribute att = role.getAttribute();
            writeString(out, att.getName());
            String specialName = role.getSpecialName();
            if (specialName != null) {
                writeString(out, specialName);
            } else {
                writeString(out, "");
            }
            writeString(out, Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(att.getValueType()));
            writeString(out, Ontology.ATTRIBUTE_BLOCK_TYPE.mapIndex(att.getBlockType()));
            if (att.isNominal()) {
                NominalMapping mapping = att.getMapping();
                out.writeInt(mapping.size());
                for (String value : mapping.getValues()) {
                    out.writeInt(mapping.mapString(value));
                    writeString(out, value);
                }
            }
            writeAnnotations(out, att.getAnnotations());
        }
        out.writeBoolean(sparse);
        if (sparse) {
            for (AttributeRole role : allAttributes) {
                out.writeDouble(role.getAttribute().getDefault());
            }
        }
    }

    /** Reads an example set as written by {@link #write(ExampleSet, OutputStream)}. */
    public ExampleSet read(InputStream inputStream) throws IOException {
        DataInputStream in = new DataInputStream(inputStream);

        // Extract header information
        Header header = readHeader(in);
        List<AttributeRole> allAttributeRoles = header.getAllRoles();
        List<Attribute> allAttributes = new ArrayList<Attribute>();
        for (AttributeRole role : allAttributeRoles) {
            allAttributes.add(role.getAttribute());
        }
        ColumnType[] columnTypes = convertToColumnTypes(allAttributeRoles);
        boolean sparse = header.isSparse();

        // Create example table
        MemoryExampleTable exampleTable = new MemoryExampleTable(allAttributes);
        int size = in.readInt();

        // Read data
        for (int row = 0; row < size; row++) {
            if (sparse) {
                DoubleSparseArrayDataRow sparseRow = new DoubleSparseArrayDataRow(allAttributeRoles.size());
                while (true) {
                    int index = in.readInt();
                    if (index == -1) {
                        break;
                    } else {
                        sparseRow.set(allAttributes.get(index), readDatum(in, columnTypes[index]));
                    }
                }
                sparseRow.trim();
                exampleTable.addDataRow(sparseRow);
            } else {
                double[] data = new double[allAttributeRoles.size()];
                readRow(in, data, columnTypes, sparse, null);
                exampleTable.addDataRow(new DoubleArrayDataRow(data));
            }
        }

        // Create example set
        ExampleSet exampleSet = exampleTable.createExampleSet();

        // Finally, set special attributes
        for (AttributeRole role : allAttributeRoles) {
            if (role.isSpecial()) {
                Attribute att = exampleSet.getAttributes().get(role.getAttribute().getName());
                exampleSet.getAttributes().getRole(att).setSpecial(role.getSpecialName());
            }
        }
        exampleSet.getAnnotations().putAll(header.getAnnotations());
        return exampleSet;
    }
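    // Header layout sketch for a single nominal attribute "color" with values {red, blue}
    // and role "label", assuming version >= VERSION_3 (the type names shown are illustrative;
    // the actual strings come from Ontology.ATTRIBUTE_VALUE_TYPE/ATTRIBUTE_BLOCK_TYPE):
    //
    //     <example set annotations>                   // writeAnnotations
    //     int 1                                       // number of attributes
    //     string "color", string "label"              // name, special name ("" if regular)
    //     string "nominal", string "single_value"     // value type name, block type name
    //     int 2                                       // mapping size, then (index, value) pairs:
    //     int 0, string "red", int 1, string "blue"
    //     <attribute annotations>
    //     boolean sparse                              // if true: one default double per attribute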
    /** Reads meta data information as written by
     *  {@link #writeHeader(Annotations, List, DataOutputStream, boolean)}.
     */
    public Header readHeader(DataInputStream in) throws IOException {
        Annotations annotations = readAnnotations(in);
        int numAttributes = in.readInt();
        List<AttributeRole> allRoles = new LinkedList<AttributeRole>();
        for (int i = 0; i < numAttributes; i++) {
            String name = readString(in);
            String special = readString(in);
            if (special.length() == 0) {
                special = null;
            }
            String tmp = readString(in);
            int valueType = Ontology.ATTRIBUTE_VALUE_TYPE.mapName(tmp);
            if (valueType == -1) {
                throw new IOException("Unknown value type: '" + tmp + "'");
            }
            tmp = readString(in);
            int blockType = Ontology.ATTRIBUTE_BLOCK_TYPE.mapName(tmp);
            if (blockType == -1) {
                throw new IOException("Unknown block type: '" + tmp + "'");
            }
            Attribute attribute = AttributeFactory.createAttribute(name, valueType, blockType);
            AttributeRole role = new AttributeRole(attribute);
            if (special != null) {
                role.setSpecial(special);
            }
            allRoles.add(role);

            // read mapping
            if (attribute.isNominal()) {
                int numValues = in.readInt();
                if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.BINOMINAL)) {
                    // in this case we have a binominal mapping and we can keep it
                    NominalMapping mapping = attribute.getMapping();
                    for (int j = 0; j < numValues; j++) {
                        int index = in.readInt();
                        String value = readString(in);
                        mapping.setMapping(value, index);
                    }
                } else {
                    Map<Integer, String> valueMap = new HashMap<Integer, String>();
                    for (int j = 0; j < numValues; j++) {
                        int index = in.readInt();
                        String value = readString(in);
                        valueMap.put(index, value);
                    }
                    attribute.setMapping(new PolynominalMapping(valueMap));
                }
            }
            Annotations attAnnotations = readAnnotations(in);
            attribute.getAnnotations().putAll(attAnnotations);
        }
        boolean sparse = in.readBoolean();
        if (sparse) {
            for (AttributeRole role : allRoles) {
                role.getAttribute().setDefault(in.readDouble());
            }
        }
        return new Header(annotations, allRoles, sparse);
    }

    /** Extracts column types such that they have minimal memory consumption. */
    public ColumnType[] convertToColumnTypes(List<AttributeRole> allRoles) {
        ColumnType[] columnTypes = new ColumnType[allRoles.size()];
        for (int i = 0; i < columnTypes.length; i++) {
            Attribute att = allRoles.get(i).getAttribute();
            if (att.isNominal()) {
                if (att.getMapping().size() < Byte.MAX_VALUE) {
                    columnTypes[i] = ColumnType.NOMINAL_BYTE;
                } else if (att.getMapping().size() < Short.MAX_VALUE) {
                    columnTypes[i] = ColumnType.NOMINAL_SHORT;
                } else {
                    columnTypes[i] = ColumnType.NOMINAL_INTEGER;
                }
            } else if (att.isNumerical()) {
                if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(att.getValueType(), Ontology.INTEGER)) {
                    columnTypes[i] = ColumnType.INTEGER;
                } else {
                    columnTypes[i] = ColumnType.DOUBLE;
                }
            } else {
                columnTypes[i] = ColumnType.DOUBLE;
            }
        }
        return columnTypes;
    }
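    // For instance (following the thresholds above): a nominal attribute with 100 distinct
    // values becomes NOMINAL_BYTE (100 < Byte.MAX_VALUE = 127), one with 300 values becomes
    // NOMINAL_SHORT, a numerical attribute whose value type is a subtype of Ontology.INTEGER
    // becomes INTEGER, and everything else falls back to DOUBLE.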
    /** Writes a single datum with the given index. The data type is specified by the parameter
     *  columnType. If sparse is true, the value is prefixed by the given attributeIndex, and
     *  default values are not written at all.
     */
    public final void writeDatum(double value, int attributeIndex, Attribute attribute, ColumnType columnType, DataOutput out, boolean sparse) throws IOException {
        if (sparse) {
            if (Tools.isDefault(attribute.getDefault(), value)) {
                return;
            } else {
                out.writeInt(attributeIndex);
            }
        }
        switch (columnType) {
        case DOUBLE:
            out.writeDouble(value);
            break;
        case INTEGER:
            if (Double.isNaN(value)) {
                out.writeInt(Integer.MIN_VALUE + 1);
                out.writeBoolean(true);
            } else {
                out.writeInt((int) value);
                if ((int) value == Integer.MIN_VALUE + 1) {
                    out.writeBoolean(false);
                }
            }
            break;
        // For the nominal values, we *can* use -1 to encode missings since all values are guaranteed to be non-negative
        case NOMINAL_BYTE:
            if (Double.isNaN(value)) {
                out.writeByte(-1);
            } else {
                out.writeByte((byte) value);
            }
            break;
        case NOMINAL_INTEGER:
            if (Double.isNaN(value)) {
                out.writeInt(-1);
            } else {
                out.writeInt((int) value);
            }
            break;
        case NOMINAL_SHORT:
            if (Double.isNaN(value)) {
                out.writeShort(-1);
            } else {
                out.writeShort((short) value);
            }
            break;
        default:
            // cannot happen
            throw new RuntimeException("Illegal type: " + columnType);
        }
    }

    /** Reads a single datum in non-sparse representation of the given type and returns it as a double. */
    private final double readDatum(DataInput in, ColumnType columnType) throws IOException {
        switch (columnType) {
        case DOUBLE:
            return in.readDouble();
        case INTEGER:
            int iValue = in.readInt();
            if (iValue == Integer.MIN_VALUE + 1) {
                boolean isMissing = in.readBoolean();
                if (isMissing) {
                    return Double.NaN;
                } else {
                    return iValue;
                }
            } else {
                return iValue;
            }
        case NOMINAL_BYTE:
            byte bValue = in.readByte();
            if (bValue == -1) {
                return Double.NaN;
            } else {
                return bValue;
            }
        case NOMINAL_INTEGER:
            iValue = in.readInt();
            if (iValue == -1) {
                return Double.NaN;
            } else {
                return iValue;
            }
        case NOMINAL_SHORT:
            short sValue = in.readShort();
            if (sValue == -1) {
                return Double.NaN;
            } else {
                return sValue;
            }
        default:
            // cannot happen
            throw new RuntimeException("Illegal type: " + columnType);
        }
    }
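    // Round-trip sketch of the INTEGER missing-value escape: writing Double.NaN emits
    // Integer.MIN_VALUE+1 followed by boolean true, which readDatum maps back to Double.NaN;
    // writing the actual value Integer.MIN_VALUE+1 emits the same int followed by boolean
    // false, which readDatum returns verbatim. All other integers are sent as a single int
    // with no trailing boolean.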
    /** Reads a single row from the stream. If sparse is true, data is initialized from
     *  sparseDefaults before the non-default values are read, so sparseDefaults must not
     *  be null in that case.
     */
    public void readRow(DataInputStream in, double[] data, ColumnType[] columnTypes, boolean sparse, double[] sparseDefaults) throws IOException {
        if (sparse) {
            System.arraycopy(sparseDefaults, 0, data, 0, sparseDefaults.length);
            while (true) {
                int index = in.readInt();
                if (index == -1) {
                    break;
                } else {
                    data[index] = readDatum(in, columnTypes[index]);
                }
            }
        } else {
            for (int attIndex = 0; attIndex < columnTypes.length; attIndex++) {
                data[attIndex] = readDatum(in, columnTypes[attIndex]);
            }
        }
    }

    private void writeString(DataOutput out, String value) throws IOException {
        switch (version) {
        case VERSION_1:
            out.writeUTF(value);
            break;
        case VERSION_2:
        case VERSION_3:
            byte[] bytes = value.getBytes(STRING_CHARSET);
            out.writeInt(bytes.length);
            out.write(bytes);
            break;
        default:
            throw new RuntimeException("Unknown stream version: " + version);
        }
    }

    private String readString(DataInput in) throws IOException {
        switch (version) {
        case VERSION_1:
            return in.readUTF();
        case VERSION_2:
        case VERSION_3:
            int length = in.readInt();
            byte[] bytes = new byte[length];
            in.readFully(bytes);
            return new String(bytes, STRING_CHARSET);
        default:
            throw new RuntimeException("Unknown stream version: " + version);
        }
    }

    public int getVersion() {
        return version;
    }

    /** Writes one integer for the size, then, for each annotation:
     *  - one string ({@link #writeString(DataOutput, String)}) for the key
     *  - one string ({@link #writeString(DataOutput, String)}) for the value
     */
    public void writeAnnotations(DataOutput out, Annotations annotations) throws IOException {
        if (version < VERSION_3) {
            LogService.getRoot().warning("Ignoring annotations in example set stream version " + version);
        } else {
            if (annotations == null) {
                out.writeInt(0);
            } else {
                out.writeInt(annotations.size());
                for (String key : annotations.getKeys()) {
                    writeString(out, key);
                    writeString(out, annotations.getAnnotation(key));
                }
            }
        }
    }

    public Annotations readAnnotations(DataInput in) throws IOException {
        if (version < VERSION_3) {
            return new Annotations();
        } else {
            Annotations result = new Annotations();
            int size = in.readInt();
            for (int i = 0; i < size; i++) {
                result.setAnnotation(readString(in), readString(in));
            }
            return result;
        }
    }
}