MappingInfo.java example

Explorer
android-ml-weka-master
- src
  - weka
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MappingInfo.java
 *    Copyright (C) 2008-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.pmml;

import java.io.Serializable;
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.gui.Logger;

/**
 * Class that maintains the mapping between incoming data set structure
 * and that of the mining schema.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com
 * @version $Revision: 8034 $
 */
public class MappingInfo implements Serializable {
  
  /** For serialization */
  
  /** 
   * Index for incoming nominal values that are not defined in the mining
   * schema.
   */
  public static final int UNKNOWN_NOMINAL_VALUE = -1;
  
  /** 
   * Map the incoming attributes to the mining schema attributes.
   * Each entry holds the index of the incoming attribute that
   * corresponds to this mining schema attribute.  
   */
  private int[] m_fieldsMap = null;
  
  /** 
   * Map indexes for nominal values in incoming structure to those
   * in the mining schema. There will be as many entries as there are
   * attributes in this array. Non-nominal attributes will have
   * null entries. Each non-null entry is an array of integer indexes.
   * Each entry in a given array (for a given attribute) holds the index of
   * the mining schema value that corresponds to this incoming value.
   * UNKNOWN_NOMINAL_VALUE is used as the index for those incoming values 
   * that are not defined in the mining schema. 
   */
  private int[][] m_nominalValueMaps = null;
  
  /** Holds a textual description of the fields mapping */
  private String m_fieldsMappingText = null;
  
  /** For logging */
  private Logger m_log = null;
  
  public MappingInfo(Instances dataSet, MiningSchema miningSchema,
                     Logger log) throws Exception {
    m_log = log;
    //miningSchema.convertStringAttsToNominal();
    Instances fieldsI = miningSchema.getMiningSchemaAsInstances();
 
    m_fieldsMap = new int[fieldsI.numAttributes()];
    m_nominalValueMaps = new int[fieldsI.numAttributes()][];
    
    for (int i = 0; i < fieldsI.numAttributes(); i++) {
      String schemaAttName = fieldsI.attribute(i).name();
      boolean found = false;
      for (int j = 0; j < dataSet.numAttributes(); j++) {
        if (dataSet.attribute(j).name().equals(schemaAttName)) {
          Attribute miningSchemaAtt = fieldsI.attribute(i);
          Attribute incomingAtt = dataSet.attribute(j);
          // check type match
          if (miningSchemaAtt.type() != incomingAtt.type()) {
            if (miningSchemaAtt.isString() && incomingAtt.isNominal()) {
              // don't worry about String attributes in the mining schema
              // (as long as the corresponding incoming is a String or nominal),
              // since values for the String attributes are more than likely revealed
              // by FieldRef elements in the actual model itself
            } else {
              throw new Exception("[MappingInfo] type mismatch for field " +
                  schemaAttName + ". Mining schema type " + 
                  miningSchemaAtt.toString() + ". Incoming type " +
                  incomingAtt.toString() + ".");
            }
          }

          // check nominal values (number, names...)
          if (miningSchemaAtt.numValues() != incomingAtt.numValues()) {
            String warningString = "[MappingInfo] WARNING: incoming nominal attribute "
              + incomingAtt.name() + " does not have the same "
              + "number of values as the corresponding mining "
              + "schema attribute.";
            if (m_log != null) {
              m_log.logMessage(warningString);
            } else {
              System.err.println(warningString);
            }
          }
          if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) {
            int[] valuesMap = new int[incomingAtt.numValues()];
            for (int k = 0; k < incomingAtt.numValues(); k++) {
              String incomingNomVal = incomingAtt.value(k);
              int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal);
              if (indexInSchema < 0) {
                String warningString = "[MappingInfo] WARNING: incoming nominal attribute "
                  + incomingAtt.name() + " has value "
                  + incomingNomVal + " that doesn't occur in the mining schema.";
                if (m_log != null) {
                  m_log.logMessage(warningString);
                } else {
                  System.err.println(warningString);
                }
                valuesMap[k] = UNKNOWN_NOMINAL_VALUE;
              } else {
                valuesMap[k] = indexInSchema;
              }
            }
            m_nominalValueMaps[i] = valuesMap;
          }

          /*if (miningSchemaAtt.isNominal()) {
            for (int k = 0; k < miningSchemaAtt.numValues(); k++) {
              if (!miningSchemaAtt.value(k).equals(incomingAtt.value(k))) {
                throw new Exception("[PMMLUtils] value " + k + " (" + 
                                    miningSchemaAtt.value(k) + ") does not match " +
                                    "incoming value (" + incomingAtt.value(k) +
                                    ") for attribute " + miningSchemaAtt.name() +
                                    ".");

              }
            }
          }*/
          found = true;
          m_fieldsMap[i] = j;
        }
      }
      if (!found) {
        throw new Exception("[MappingInfo] Unable to find a match for mining schema "
            + "attribute " + schemaAttName + " in the "
            + "incoming instances!");
      }
    }

    // check class attribute (if set)
   if (fieldsI.classIndex() >= 0) {
      if (dataSet.classIndex() < 0) {
	// first see if we can find a matching class
	String className = fieldsI.classAttribute().name();
	Attribute classMatch = dataSet.attribute(className);
	if (classMatch == null) {
	  throw new Exception("[MappingInfo] Can't find match for target field " + className
	      + "in incoming instances!");
	}
	dataSet.setClass(classMatch);
      } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) {
        throw new Exception("[MappingInfo] class attribute in mining schema does not match "
            + "class attribute in incoming instances!");
      }
    }
    
    // Set up the textual description of the mapping
    fieldsMappingString(fieldsI, dataSet);
  }
  
  private void fieldsMappingString(Instances miningSchemaI, Instances incomingI) {
    StringBuffer result = new StringBuffer();
    
    int maxLength = 0;
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      if (miningSchemaI.attribute(i).name().length() > maxLength) {
        maxLength = miningSchemaI.attribute(i).name().length();
      }
    }
    maxLength += 12; // length of " (nominal)"/" (numeric)"
    
    int minLength = 13; // "Mining schema".length()                                                        
    String headerS = "Mining schema";
    String sep = "-------------";

    if (maxLength < minLength) {
      maxLength = minLength;
    }
    
    headerS = PMMLUtils.pad(headerS, " ", maxLength, false);
    sep = PMMLUtils.pad(sep, "-", maxLength, false);
    
    sep += "\t    ----------------\n";
    headerS += "\t    Incoming fields\n";
    result.append(headerS);
    result.append(sep);
    
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      Attribute temp = miningSchemaI.attribute(i);
      String attName = "("
        + ((temp.isNumeric())
           ? "numeric)"
           : "nominal)")
        + " " + temp.name();
      attName = PMMLUtils.pad(attName, " ", maxLength, false);
      attName +=  "\t--> ";
      result.append(attName);
      
      Attribute incoming = incomingI.attribute(m_fieldsMap[i]);
      String fieldName = "" + (m_fieldsMap[i] + 1) + " ("
        + ((incoming.isNumeric())
            ? "numeric)"
            : "nominal)");
      fieldName += " " + incoming.name();
      result.append(fieldName + "\n");
    }
    
    m_fieldsMappingText = result.toString();
  }
  
  /**
   * Convert an <code>Instance</code> to an array of values that matches the
   * format of the mining schema. First maps raw attribute values and then
   * applies rules for missing values, outliers etc.
   *
   * @param inst the <code>Instance</code> to convert
   * @param miningSchema the mining schema
   * incoming instance attributes
   * @return an array of doubles that are values from the incoming Instances,
   * correspond to the format of the mining schema and have had missing values,
   * outliers etc. dealt with.
   * @throws Exception if something goes wrong
   */
  public double[] instanceToSchema(Instance inst, 
                                   MiningSchema miningSchema) throws Exception {
    Instances miningSchemaI = miningSchema.getMiningSchemaAsInstances();
    
    // allocate enough space for both mining schema fields and any derived fields
    double[] result = new double[miningSchema.getFieldsAsInstances().numAttributes()];

    // Copy over the values
    for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
      //if (miningSchemaI.attribute(i).isNumeric()) {
      result[i] = inst.value(m_fieldsMap[i]);
      if (miningSchemaI.attribute(i).isNominal() || 
          miningSchemaI.attribute(i).isString()) {
        // If not missing, look up the index of this incoming categorical value in
        // the mining schema
        if (!Utils.isMissingValue(inst.value(m_fieldsMap[i]))) {
          int[] valueMap = m_nominalValueMaps[i];
          int index = valueMap[(int)inst.value(m_fieldsMap[i])];
          String incomingAttValue = 
            inst.attribute(m_fieldsMap[i]).value((int)inst.value(m_fieldsMap[i]));
          /*int index = miningSchemaI.attribute(i).indexOfValue(incomingAttValue); */
          if (index >= 0) {
            result[i] = index;
          } else {
            // set this to "unknown" (-1) for nominal valued attributes
            result[i] = UNKNOWN_NOMINAL_VALUE;
            String warningString = "[MappingInfo] WARNING: Can't match nominal value "
              + incomingAttValue;
            if (m_log != null) {
              m_log.logMessage(warningString);
            } else {
              System.err.println(warningString);
            }
          }
        }
      }
    }

    // Now deal with missing values and outliers...
    miningSchema.applyMissingAndOutlierTreatments(result);
    //    printInst(result);
    
    // now fill in any derived values
    ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema.getDerivedFields();
    for (int i = 0; i < derivedFields.size(); i++) {
      DerivedFieldMetaInfo temp = derivedFields.get(i);
//      System.err.println("Applying : " + temp);
      double r = temp.getDerivedValue(result);
      result[i + miningSchemaI.numAttributes()] = r;
    }
    
    /*System.err.print("==> ");
    for (int i = 0; i < result.length; i++) {
      System.err.print(" " + result[i]);
    }
    System.err.println();*/
    
    return result;
  }
  
  /**
   * Get a textual description of them mapping between mining schema
   * fields and incoming data fields.
   * 
   * @return a description of the fields mapping as a String
   */
  public String getFieldsMappingString() {
    if (m_fieldsMappingText == null) {
      return "No fields mapping constructed!";
    }
    return m_fieldsMappingText;
  }
}