/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.io; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRow; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.RandomGenerator; import com.rapidminer.tools.Tools; import com.rapidminer.tools.att.AttributeSet; import com.rapidminer.tools.math.sampling.OrderedSamplingWithoutReplacement; /** * This operator can read stata files. Currently only stata * files of version 113 or 114 are supported. * * @rapidminer.index stata * @author Tobias Malbrecht * @version $Id: StataExampleSource.java,v 1.1 2008/06/03 22:26:50 tobiasmalbrecht Exp $ */ public class StataExampleSource extends BytewiseExampleSource { /** The parameter name for "Determines which attribute properties should be used for attribute naming." */ public static final String PARAMETER_ATTRIBUTE_NAMING_MODE = "attribute_naming_mode"; /** The parameter name for "Specifies how to handle attributes with value labels, i.e. whether to ignore the labels or how to use them." */ public static final String PARAMETER_HANDLE_VALUE_LABELS = "handle_value_labels"; /** The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; /** The parameter name for "The exact number of samples which should be read (-1 = all; if not -1, sample_ratio will not have any effect)" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; /** The parameter name for "Use the given random seed instead of global random numbers (only for permutation, -1: use global)." */ public static final String PARAMETER_LOCAL_RANDOM_SEED = "local_random_seed"; /** File suffix for stata files. */ private static final String STATA_FILE_SUFFIX = "dta"; /** Only use variable name as attribute name. */ public static final int USE_VAR_NAME = 0; /** Only use variable label as attribute name. */ public static final int USE_VAR_LABEL = 1; /** Use variable name with label in parentheses as attribute name. */ public static final int USE_VAR_NAME_LABELED = 2; /** Use variable label with name in parentheses as attribute name. */ public static final int USE_VAR_LABEL_NAMED = 3; /** String descriptions of attribute naming modes. */ public static final String[] ATTRIBUTE_NAMING_MODES = { "name", "label", "name (label)", "label (name)" }; /** Force attributes to be numeric even if value labels exist. */ public static final int FORCE_NUMERIC = 0; /** Ignore existing value labels but let attribute be nominal. */ public static final int IGNORE = 1; /** Use existing value labels for labeled values. */ public static final int USE_ADDITIONALLY = 2; /** Use existing value labels and set all values without labels to unknown. */ public static final int USE_EXCLUSIVELY = 3; /** String descriptions of value label handling modes. */ public static final String[] HANDLE_VALUE_LABELS_MODES = { "force numeric", "ignore", "use additionally", "use exclusively" }; /** File format constants... */ private static final int CODE_STRING_TERMINATOR = 0x0; private static final int CODE_DS_FORMAT_VERSION_113 = 0x71; private static final int CODE_DS_FORMAT_VERSION_114 = 0x72; private static final int CODE_BYTEORDER_HILO = 0x01; private static final int CODE_BYTEORDER_LOHI = 0x02; private static final int CODE_FILETYPE = 0x01; private static final int LENGTH_HEADER = 109; private static final int INDEX_HEADER_DS_FORMAT = 0; private static final int INDEX_HEADER_BYTEORDER = 1; private static final int INDEX_HEADER_FILETYPE = 2; private static final int INDEX_HEADER_NUMBER_OF_ATTRIBUTES = 4; private static final int INDEX_HEADER_NUMBER_OF_EXAMPLES = 6; private static final int CODE_TYPE_BYTE = 0xfb; private static final int CODE_TYPE_INT = 0xfc; private static final int CODE_TYPE_LONG = 0xfd; private static final int CODE_TYPE_FLOAT = 0xfe; private static final int CODE_TYPE_DOUBLE = 0xff; private static final int LENGTH_TYPE_BYTE = 1; private static final int LENGTH_TYPE_INT = 2; private static final int LENGTH_TYPE_LONG = 4; private static final int LENGTH_TYPE_FLOAT = 4; private static final int LENGTH_TYPE_DOUBLE = 8; private static final int LENGTH_ATTRIBUTE_NAME = 33; private static final int LENGTH_ATTRIBUTE_FORMAT_VERSION_113 = 12; private static final int LENGTH_ATTRIBUTE_FORMAT_VERSION_114 = 49; private static final int LENGTH_ATTRIBUTE_VALUE_LABEL_IDENTIFIER = 33; private static final int LENGTH_ATTRIBUTE_LABEL = 81; private static final int LENGTH_EXPANSION_FIELD_HEADER = 5; private static final int INDEX_EXPANSION_FIELD_HEADER_TYPE = 0; private static final int INDEX_EXPANSION_FIELD_HEADER_LENGTH = 1; private static final int LENGTH_VALUE_LABEL_HEADER = 40; private static final int INDEX_VALUE_LABEL_HEADER_LENGTH = 0; private static final int INDEX_VALUE_LABEL_HEADER_NAME = 4; private static final int LENGTH_VALUE_LABEL_HEADER_NAME = 33; private static final int INDEX_VALUE_LABEL_TABLE_NUMBER_OF_ENTRIES = 0; private static final int INDEX_VALUE_LABEL_TABLE_TEXT_LENGTH = 4; private static final int INDEX_VALUE_LABEL_TABLE_OFFSETS = 8; private static final byte CODE_MAXIMUM_NONMISSING_BYTE = 100; private static final int CODE_MAXIMUM_NONMISSING_INT = 32740; private static final int CODE_MAXIMUM_NONMISSING_LONG = 2147483620; private static final double CODE_MAXIMUM_NONMISSING_FLOAT = 1.701e+38; private static final double CODE_MAXIMUM_NONMISSING_DOUBLE = 8.988e+307; public StataExampleSource(OperatorDescription description) { super(description); } protected String getFileSuffix() { return STATA_FILE_SUFFIX; } protected ExampleSet readFile(File file, DataRowFactory dataRowFactory) throws IOException, UndefinedParameterError { int attributeNamingMode = getParameterAsInt(PARAMETER_ATTRIBUTE_NAMING_MODE); int handleValueLabelsMode = getParameterAsInt(PARAMETER_HANDLE_VALUE_LABELS); double sampleRatio = getParameterAsDouble(PARAMETER_SAMPLE_RATIO); int sampleSize = getParameterAsInt(PARAMETER_SAMPLE_SIZE); RandomGenerator randomGenerator = RandomGenerator.getRandomGenerator(getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED)); FileInputStream fileReader = new FileInputStream(file); byte[] buffer = new byte[500]; boolean reverseEndian = false; // read and check header read(fileReader, buffer, LENGTH_HEADER); int dataSetFormat = 0x000000FF & buffer[INDEX_HEADER_DS_FORMAT]; if (dataSetFormat != CODE_DS_FORMAT_VERSION_113 && dataSetFormat != CODE_DS_FORMAT_VERSION_114) { throw new IOException("Unsupported data set format"); } if (buffer[INDEX_HEADER_FILETYPE] != CODE_FILETYPE) { throw new IOException(GENERIC_ERROR_MESSAGE); } byte byteOrder = buffer[INDEX_HEADER_BYTEORDER]; if (byteOrder != CODE_BYTEORDER_LOHI && byteOrder != CODE_BYTEORDER_HILO) { throw new IOException(GENERIC_ERROR_MESSAGE); } reverseEndian = (byteOrder == CODE_BYTEORDER_LOHI) ? true : false; int numberOfAttributes = extract2ByteInt(buffer, INDEX_HEADER_NUMBER_OF_ATTRIBUTES, reverseEndian); int numberOfExamples = extractInt(buffer, INDEX_HEADER_NUMBER_OF_EXAMPLES, reverseEndian); // read descriptors byte[] attributeTypes = new byte[numberOfAttributes]; read(fileReader, buffer, numberOfAttributes); for (int i = 0; i < numberOfAttributes; i++) { attributeTypes[i] = buffer[i]; } String[] attributeNames = new String[numberOfAttributes]; for (int i = 0; i < numberOfAttributes; i++) { read(fileReader, buffer, LENGTH_ATTRIBUTE_NAME); String attributeNameString = new String(buffer, 0, LENGTH_ATTRIBUTE_NAME); attributeNames[i] = attributeNameString.substring(0, attributeNameString.indexOf(CODE_STRING_TERMINATOR)).trim(); } // read sort list read(fileReader, buffer, 2 * (numberOfAttributes + 1)); // read format list for (int i = 0; i < numberOfAttributes; i++) { if (dataSetFormat == CODE_DS_FORMAT_VERSION_113) { read(fileReader, buffer, LENGTH_ATTRIBUTE_FORMAT_VERSION_113); } else if (dataSetFormat == CODE_DS_FORMAT_VERSION_114) { read(fileReader, buffer, LENGTH_ATTRIBUTE_FORMAT_VERSION_114); } } // read value label identifiers String[] valueLabelsIdentifiers = new String[numberOfAttributes]; boolean[] labeled = new boolean[numberOfAttributes]; for (int i = 0; i < numberOfAttributes; i++) { read(fileReader, buffer, LENGTH_ATTRIBUTE_VALUE_LABEL_IDENTIFIER); labeled[i] = buffer[0] != 0; String valueLabelsIdentifierString = new String(buffer, 0, LENGTH_ATTRIBUTE_VALUE_LABEL_IDENTIFIER); valueLabelsIdentifiers[i] = valueLabelsIdentifierString.substring(0, valueLabelsIdentifierString.indexOf(CODE_STRING_TERMINATOR)).trim(); if (valueLabelsIdentifiers[i].equals("")) { valueLabelsIdentifiers[i] = null; } } // read attribute labels String[] attributeLabels = new String[numberOfAttributes]; for (int i = 0; i < numberOfAttributes; i++) { read(fileReader, buffer, LENGTH_ATTRIBUTE_LABEL); String attributeLabelString = new String(buffer, 0, LENGTH_ATTRIBUTE_LABEL); attributeLabels[i] = attributeLabelString.substring(0, attributeLabelString.indexOf(CODE_STRING_TERMINATOR)).trim(); if (attributeLabels[i].equals("")) { attributeLabels[i] = null; } } // read expansion fields for (;;) { read(fileReader, buffer, LENGTH_EXPANSION_FIELD_HEADER); int expansionFieldContentsLength = extractInt(buffer, INDEX_EXPANSION_FIELD_HEADER_LENGTH, reverseEndian); if (buffer[INDEX_EXPANSION_FIELD_HEADER_TYPE] == 0 && expansionFieldContentsLength == 0) { break; } else { read(fileReader, buffer, expansionFieldContentsLength); } } // create attributes LinkedHashMap<String, List<Attribute>> attributeValueLabelIdentifiersMap = new LinkedHashMap<String, List<Attribute>>(); AttributeSet attributeSet = new AttributeSet(numberOfAttributes); for (int i = 0; i < numberOfAttributes; i++) { int valueType = Ontology.ATTRIBUTE_VALUE; switch (0x000000FF & (int) attributeTypes[i]) { case CODE_TYPE_BYTE: valueType = Ontology.INTEGER; break; case CODE_TYPE_INT: valueType = Ontology.INTEGER; break; case CODE_TYPE_LONG: valueType = Ontology.INTEGER; break; case CODE_TYPE_FLOAT: valueType = Ontology.NUMERICAL; break; case CODE_TYPE_DOUBLE: valueType = Ontology.NUMERICAL; break; default: valueType = Ontology.NOMINAL; } if (labeled[i]) { if (handleValueLabelsMode != FORCE_NUMERIC) { valueType = Ontology.NOMINAL; } } String attributeName = null; switch (attributeNamingMode) { case USE_VAR_NAME: attributeName = attributeNames[i]; break; case USE_VAR_LABEL: attributeName = attributeLabels[i] == null ? attributeNames[i] : attributeLabels[i]; break; case USE_VAR_NAME_LABELED: attributeName = attributeLabels[i] == null ? attributeNames[i] : attributeNames[i] + " (" + attributeLabels[i] + ")"; break; case USE_VAR_LABEL_NAMED: attributeName = attributeLabels[i] == null ? attributeNames[i] : attributeLabels[i] + " (" + attributeNames[i] + ")"; break; default: attributeName = attributeNames[i]; } Attribute attribute = AttributeFactory.createAttribute(attributeName, valueType); attributeSet.addAttribute(attribute); if (attributeValueLabelIdentifiersMap.get(valueLabelsIdentifiers[i]) == null) { attributeValueLabelIdentifiersMap.put(valueLabelsIdentifiers[i], new LinkedList<Attribute>()); } if (valueLabelsIdentifiers[i] != null) { attributeValueLabelIdentifiersMap.get(valueLabelsIdentifiers[i]).add(attribute); } } // initialize sampling functionality OrderedSamplingWithoutReplacement sampling = null; if (sampleSize != -1) { sampling = new OrderedSamplingWithoutReplacement(randomGenerator, numberOfExamples, sampleSize); } else { sampling = new OrderedSamplingWithoutReplacement(randomGenerator, numberOfExamples, sampleRatio); } // read data MemoryExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes()); for (int j = 0; j < numberOfExamples; j++) { DataRow dataRow = dataRowFactory.create(numberOfAttributes); for (int i = 0; i < numberOfAttributes; i++) { Attribute attribute = attributeSet.getAttribute(i); double value = Double.NaN; switch (0x000000FF & (int) attributeTypes[i]) { case CODE_TYPE_BYTE: read(fileReader, buffer, LENGTH_TYPE_BYTE); byte byteValue = (byte) buffer[0]; value = byteValue > CODE_MAXIMUM_NONMISSING_BYTE ? Double.NaN : byteValue; break; case CODE_TYPE_INT: read(fileReader, buffer, LENGTH_TYPE_INT); int intValue = extract2ByteInt(buffer, 0, reverseEndian); value = intValue > CODE_MAXIMUM_NONMISSING_INT ? Double.NaN : intValue; break; case CODE_TYPE_LONG: read(fileReader, buffer, LENGTH_TYPE_LONG); int longValue = extractInt(buffer, 0, reverseEndian); value = longValue > CODE_MAXIMUM_NONMISSING_LONG ? Double.NaN : longValue; break; case CODE_TYPE_FLOAT: read(fileReader, buffer, LENGTH_TYPE_FLOAT); float floatValue = extractFloat(buffer, 0, reverseEndian); value = floatValue > CODE_MAXIMUM_NONMISSING_FLOAT ? Double.NaN : floatValue; break; case CODE_TYPE_DOUBLE: read(fileReader, buffer, LENGTH_TYPE_DOUBLE); double doubleValue = extractDouble(buffer, 0, reverseEndian); value = doubleValue > CODE_MAXIMUM_NONMISSING_DOUBLE ? Double.NaN : doubleValue; break; default: int length = (int) 0x000000FF & attributeTypes[i]; read(fileReader, buffer, length); String stringValue = new String(buffer, 0, length); int stringTerminatorIndex = stringValue.indexOf(CODE_STRING_TERMINATOR); if (stringTerminatorIndex < 0 || stringTerminatorIndex >= length) { value = attribute.getMapping().mapString(stringValue.trim()); } else { value = attribute.getMapping().mapString(stringValue.substring(0, stringTerminatorIndex).trim()); } } dataRow.set(attribute, value); } // add data to table if (sampling == null) { table.addDataRow(dataRow); } else { if (sampling.acceptElement()) { table.addDataRow(dataRow); } } } // read value labels int readLength = -1; LinkedHashMap<Attribute, LinkedHashMap<Double, String>> valueMappingsMap = new LinkedHashMap<Attribute, LinkedHashMap<Double, String>>(); do { readLength = readWithoutLengthCheck(fileReader, buffer, LENGTH_VALUE_LABEL_HEADER); if (readLength > 0) { int length = extractInt(buffer, INDEX_VALUE_LABEL_HEADER_LENGTH, reverseEndian); String valueLabelIdentifierString = new String(buffer, INDEX_VALUE_LABEL_HEADER_NAME, LENGTH_VALUE_LABEL_HEADER_NAME); String valueLabelIdentifier = valueLabelIdentifierString.substring(0, valueLabelIdentifierString.indexOf(CODE_STRING_TERMINATOR)).trim(); LinkedHashMap<Double, String> valueMap = new LinkedHashMap<Double, String>(); if (length > 500) { buffer = new byte[length]; } read(fileReader, buffer, length); int numberOfEntries = extractInt(buffer, INDEX_VALUE_LABEL_TABLE_NUMBER_OF_ENTRIES, reverseEndian); int textLength = extractInt(buffer, INDEX_VALUE_LABEL_TABLE_TEXT_LENGTH, reverseEndian); int[] offset = new int[numberOfEntries]; for (int i = 0; i < numberOfEntries; i++) { offset[i] = extractInt(buffer, INDEX_VALUE_LABEL_TABLE_OFFSETS + i * LENGTH_INT_32, reverseEndian); } double[] values = new double[numberOfEntries]; for (int i = 0; i < numberOfEntries; i++) { values[i] = extractInt(buffer, INDEX_VALUE_LABEL_TABLE_OFFSETS + numberOfEntries * LENGTH_INT_32 + i * LENGTH_INT_32, reverseEndian); } String[] nominalValues = new String[numberOfEntries]; for (int i = 0; i < numberOfEntries; i++) { nominalValues[i] = extractString(buffer, INDEX_VALUE_LABEL_TABLE_OFFSETS + 2 * numberOfEntries * LENGTH_INT_32 + offset[i], textLength - offset[i]); int stringTerminatorIndex = nominalValues[i].indexOf(CODE_STRING_TERMINATOR); if (stringTerminatorIndex < 0) { valueMap.put(values[i], nominalValues[i].trim()); } else { valueMap.put(values[i], nominalValues[i].substring(0, nominalValues[i].indexOf(CODE_STRING_TERMINATOR)).trim()); } } for (Attribute attribute : attributeValueLabelIdentifiersMap.get(valueLabelIdentifier)) { valueMappingsMap.put(attribute, valueMap); } } } while (readLength >= 0); fileReader.close(); // add value labels to data if (handleValueLabelsMode != FORCE_NUMERIC) { Attribute[] attributes = table.getAttributes(); LinkedHashMap[] attributeValueMaps = new LinkedHashMap[numberOfAttributes]; for (int i = 0; i < attributes.length; i++) { attributeValueMaps[i] = valueMappingsMap.get(attributes[i]); } for (Iterator<DataRow> iterator = table.getDataRowReader(); iterator.hasNext(); ) { DataRow dataRow = iterator.next(); for (int i = 0; i < attributes.length; i++) { if (labeled[i] && attributeValueMaps[i] != null) { double originalValue = dataRow.get(attributes[i]); double value = Double.NaN; switch (handleValueLabelsMode) { case IGNORE: value = attributes[i].getMapping().mapString(Tools.formatIntegerIfPossible(originalValue)); break; case USE_ADDITIONALLY: { String nominalValue = (String) attributeValueMaps[i].get(originalValue); if (nominalValue != null) { value = attributes[i].getMapping().mapString(nominalValue); } else { value = attributes[i].getMapping().mapString(Tools.formatIntegerIfPossible(originalValue)); } } break; case USE_EXCLUSIVELY: { String nominalValue = (String) attributeValueMaps[i].get(originalValue); if (nominalValue != null) { value = attributes[i].getMapping().mapString(nominalValue); } else { value = Double.NaN; } } break; } dataRow.set(attributes[i], value); } } } } // create example set ExampleSet exampleSet = table.createExampleSet(); return exampleSet; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeCategory(PARAMETER_ATTRIBUTE_NAMING_MODE, "Determines which variable properties should be used for attribute naming.", ATTRIBUTE_NAMING_MODES, USE_VAR_NAME); type.setExpert(false); types.add(type); type = new ParameterTypeCategory(PARAMETER_HANDLE_VALUE_LABELS, "Specifies how to handle attributes with value labels, i.e. whether to ignore the labels or how to use them.", HANDLE_VALUE_LABELS_MODES, USE_ADDITIONALLY); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d); type.setExpert(false); types.add(type); type = new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = all; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1); type.setExpert(true); types.add(type); type = new ParameterTypeInt(PARAMETER_LOCAL_RANDOM_SEED, "Use the given random seed instead of global random numbers (for sampling by ratio, -1: use global).", -1, Integer.MAX_VALUE, -1); type.setExpert(true); types.add(type); return types; } }