/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.EOFException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;

@SuppressWarnings("deprecation")
public class TfUtils implements Serializable
{
	private static final long serialVersionUID = 526252850872633125L;

	protected enum ColumnTypes {
		SCALE, NOMINAL, ORDINAL, DUMMYCODED;

		protected byte toID() {
			switch(this) {
				case SCALE:   return 1;
				case NOMINAL: return 2;
				case ORDINAL: return 3;
				// Ideally, dummycoded columns should be of a different type.
				// Treating them as SCALE is incorrect, semantically.
				case DUMMYCODED: return 1;
				default:
					throw new RuntimeException("Invalid Column Type: " + this);
			}
		}
	}

	//transform methods
	public static final String TXMETHOD_IMPUTE    = "impute";
	public static final String TXMETHOD_RECODE    = "recode";
	public static final String TXMETHOD_BIN       = "bin";
	public static final String TXMETHOD_DUMMYCODE = "dummycode";
	public static final String TXMETHOD_SCALE     = "scale";
	public static final String TXMETHOD_OMIT      = "omit";
	public static final String TXMETHOD_MVRCD     = "mvrcd";

	//transform meta data constants (frame-based transform)
	public static final String TXMTD_MVPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"MV";
	public static final String TXMTD_NDPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"ND";

	//transform meta data constants (old file-based transform)
	public static final String TXMTD_SEP                 = ",";
	public static final String TXMTD_COLTYPES            = "coltypes.csv";
	public static final String TXMTD_COLNAMES            = "column.names";
	public static final String TXMTD_DC_COLNAMES         = "dummycoded.column.names";
	public static final String TXMTD_RCD_MAP_SUFFIX      = ".map";
	public static final String TXMTD_RCD_DISTINCT_SUFFIX = ".ndistinct";
	public static final String TXMTD_BIN_FILE_SUFFIX     = ".bin";
	public static final String TXMTD_MV_FILE_SUFFIX      = ".impute";

	public static final String JSON_ATTRS  = "attributes";
	public static final String JSON_MTHD   = "methods";
	public static final String JSON_CONSTS = "constants";
	public static final String JSON_NBINS  = "numbins";

	protected static final String MODE_FILE_SUFFIX  = ".mode";
	protected static final String SCALE_FILE_SUFFIX = ".scale";
	protected static final String DCD_FILE_NAME     = "dummyCodeMaps.csv";
	protected static final String DCD_NAME_SEP      = "_";

	private OmitAgent _oa = null;
	private MVImputeAgent _mia = null;
	private RecodeAgent _ra = null;
	private BinAgent _ba = null;
	private DummycodeAgent _da = null;

	private long _numRecordsInPartFile;  // Total number of records in the data file
	private long _numValidRecords;       // (_numRecordsInPartFile - #of omitted records)
	private long _numTransformedRows;    // Number of rows after applying transformations
	private long _numTransformedColumns; // Number of columns after applying transformations

	private String _headerLine = null;
	private boolean _hasHeader;
	private Pattern _delim = null;
	private String _delimString = null;
	private String[] _NAstrings = null;
	private String[] _outputColumnNames = null;
	private int _numInputCols = -1;

	private String _tfMtdDir = null;
	private String _spec = null;
	private String _offsetFile = null;
	private String _tmpDir = null;
	private String _outputPath = null;

	public TfUtils(JobConf job, boolean minimal)
		throws IOException, JSONException
	{
		if( !InfrastructureAnalyzer.isLocalMode(job) ) {
			ConfigurationManager.setCachedJobConf(job);
		}
		_NAstrings = TfUtils.parseNAStrings(job);
		_spec = job.get(MRJobConfiguration.TF_SPEC);
		_oa = new OmitAgent(new JSONObject(_spec), null, -1);
	}

	// called from GenTFMtdMapper, ApplyTf (Hadoop)
	public TfUtils(JobConf job)
		throws IOException, JSONException
	{
		if( !InfrastructureAnalyzer.isLocalMode(job) ) {
			ConfigurationManager.setCachedJobConf(job);
		}

		boolean hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
		String[] naStrings = TfUtils.parseNAStrings(job);
		long numCols = UtilFunctions.parseToLong( job.get(MRJobConfiguration.TF_NUM_COLS) ); // #cols input data
		String spec = job.get(MRJobConfiguration.TF_SPEC);
		String offsetFile = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
		String tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);
		String outputPath
			= FileOutputFormat.getOutputPath(job).toString();

		JSONObject jspec = new JSONObject(spec);
		init(job.get(MRJobConfiguration.TF_HEADER), hasHeader, job.get(MRJobConfiguration.TF_DELIM),
			naStrings, jspec, numCols, offsetFile, tmpPath, outputPath);
	}

	// called from GenTfMtdReducer
	public TfUtils(JobConf job, String tfMtdDir)
		throws IOException, JSONException
	{
		this(job);
		_tfMtdDir = tfMtdDir;
	}

	// called from GenTFMtdReducer and ApplyTf (Spark)
	public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
			long ncol, String tfMtdDir, String offsetFile, String tmpPath)
		throws IOException, JSONException
	{
		init(headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
		_tfMtdDir = tfMtdDir;
	}

	protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err)
		throws IOException
	{
		// check non-existing file
		if( !fs.exists(path) ) {
			if( err )
				throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
			else
				return false;
		}

		// check for empty file
		if( MapReduceTool.isFileEmpty(fs, path.toString()) ) {
			if( err )
				throw new EOFException("Empty input file " + path.toString() + ".");
			else
				return false;
		}

		return true;
	}

	public static String getPartFileName(JobConf job) throws IOException {
		FileSystem fs = FileSystem.get(job);
		Path thisPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE)).makeQualified(fs);
		return thisPath.toString();
	}

	public static boolean isPartFileWithHeader(JobConf job) throws IOException {
		FileSystem fs = FileSystem.get(job);
		String thisfile = getPartFileName(job);
		Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
		return thisfile.equals(smallestFilePath.toString());
	}

	/**
	 * Prepare NA strings so that they can be sent to workers via JobConf.
	 * A "dummy" string is added at the end to handle the case of empty strings.
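	 * For example (illustrative values only, since the actual value of
	 * DataExpression.DELIM_NA_STRING_SEP is defined elsewhere): if the separator were "::",
	 * prepNAStrings("NA::?") would return "NA::?::dummy", and parseNAStrings on that result
	 * would yield {"NA", "?", "dummy"}.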
	 * @param na NA string
	 * @return NA string concatenated with NA string separator concatenated with "dummy"
	 */
	public static String prepNAStrings(String na) {
		return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
	}

	public static String[] parseNAStrings(String na)
	{
		if ( na == null )
			return null;

		String[] tmp = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP)).split(na, -1);
		return tmp; //Arrays.copyOf(tmp, tmp.length-1);
	}

	public static String[] parseNAStrings(JobConf job) {
		return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
	}

	private void createAgents(JSONObject spec, String[] naStrings) throws IOException, JSONException {
		_oa  = new OmitAgent(spec, _outputColumnNames, _numInputCols);
		_mia = new MVImputeAgent(spec, null, naStrings, _numInputCols);
		_ra  = new RecodeAgent(spec, _outputColumnNames, _numInputCols);
		_ba  = new BinAgent(spec, _outputColumnNames, _numInputCols);
		_da  = new DummycodeAgent(spec, _outputColumnNames, _numInputCols);
	}

	private void parseColumnNames() {
		_outputColumnNames = _delim.split(_headerLine, -1);
		for(int i=0; i < _outputColumnNames.length; i++)
			_outputColumnNames[i] = UtilFunctions.unquote(_outputColumnNames[i]);
	}

	private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
			long numCols, String offsetFile, String tmpPath, String outputPath)
		throws IOException, JSONException
	{
		_numRecordsInPartFile = 0;
		_numValidRecords = 0;
		_numTransformedRows = 0;
		_numTransformedColumns = 0;

		//TODO: fix hard-wired header propagation to meta data column names
		_headerLine = headerLine;
		_hasHeader = hasHeader;
		_delimString = delim;
		_delim = Pattern.compile(Pattern.quote(delim));
		_NAstrings = naStrings;
		_numInputCols = (int)numCols;
		_offsetFile = offsetFile;
		_tmpDir = tmpPath;
		_outputPath = outputPath;

		parseColumnNames();
		createAgents(spec, naStrings);
	}

	public void incrValid() { _numValidRecords++; }
	public long getValid() { return _numValidRecords; }
	public long getTotal() { return _numRecordsInPartFile; }
	public long getNumTransformedRows() { return _numTransformedRows; }
	public long getNumTransformedColumns() { return _numTransformedColumns; }

	public String getHeader() { return _headerLine; }
	public boolean hasHeader() { return _hasHeader; }
	public String getDelimString() { return _delimString; }
	public Pattern getDelim() { return _delim; }
	public String[] getNAStrings() { return _NAstrings; }
	public long getNumCols() { return _numInputCols; }
	public String getSpec() { return _spec; }
	public String getTfMtdDir() { return _tfMtdDir; }
	public String getOffsetFile() { return _offsetFile; }
	public String getTmpDir() { return _tmpDir; }
	public String getOutputPath() { return _outputPath; }
	public String getName(int colID) { return _outputColumnNames[colID-1]; }

	public void setValid(long n) { _numValidRecords = n; }
	public void incrTotal() { _numRecordsInPartFile++; }
	public void setTotal(long n) { _numRecordsInPartFile = n; }

	public OmitAgent getOmitAgent() { return _oa; }
	public MVImputeAgent getMVImputeAgent() { return _mia; }
	public RecodeAgent getRecodeAgent() { return _ra; }
	public BinAgent getBinAgent() { return _ba; }
	public DummycodeAgent getDummycodeAgent() { return _da; }

	/**
	 * Function that checks if the given string is one of NA strings.
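	 * For example, isNA(new String[]{"NA", "?"}, "?") returns true, while isNA(null, w)
	 * returns false for any w because no NA strings are configured.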
	 *
	 * @param NAstrings array of NA strings
	 * @param w string to check
	 * @return true if w is an NA string
	 */
	public static boolean isNA(String[] NAstrings, String w) {
		if(NAstrings == null)
			return false;

		for(String na : NAstrings) {
			if(w.equals(na))
				return true;
		}
		return false;
	}

	public String[] getWords(Text line) {
		return getWords(line.toString());
	}

	public String[] getWords(String line) {
		return getDelim().split(line.trim(), -1);
	}

	/**
	 * Process a given row to construct transformation metadata.
	 *
	 * @param line string to break into words
	 * @return string array of words from the line
	 * @throws IOException if IOException occurs
	 */
	public String[] prepareTfMtd(String line) throws IOException {
		String[] words = getWords(line);
		if(!getOmitAgent().omit(words, this)) {
			getMVImputeAgent().prepare(words);
			getRecodeAgent().prepare(words, this);
			getBinAgent().prepare(words, this);
			incrValid();
		}
		incrTotal();
		return words;
	}

	public void loadTfMetadata() throws IOException {
		JobConf job = ConfigurationManager.getCachedJobConf();
		loadTfMetadata(job, false);
	}

	public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException {
		Path tfMtdDir = null;
		FileSystem fs = null;

		if(fromLocalFS) {
			// metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
			tfMtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
			fs = FileSystem.getLocal(job);
		}
		else {
			fs = FileSystem.get(job);
			tfMtdDir = new Path(getTfMtdDir());
		}

		// load transformation metadata
		getMVImputeAgent().loadTxMtd(job, fs, tfMtdDir, this);
		getRecodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
		getBinAgent().loadTxMtd(job, fs, tfMtdDir, this);

		// associate recode maps and bin definitions with dummycoding agent,
		// as recoded and binned columns are typically dummycoded
		getDummycodeAgent().setRecodeMaps( getRecodeAgent().getRecodeMaps() );
		getDummycodeAgent().setNumBins(getBinAgent().getColList(), getBinAgent().getNumBins());
		getDummycodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
	}

	public String processHeaderLine() throws IOException {
		//TODO: fix hard-wired header propagation to meta data column names
		FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
		String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
		getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);

		// write header information (before and after transformation) to temporary path
		// these files are copied into txMtdPath, once the ApplyTf job is complete.
		DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);

		return dcdHeader;
		//_numTransformedColumns = getDelim().split(dcdHeader, -1).length;
		//return _numTransformedColumns;
	}

	public boolean omit(String[] words) {
		if(getOmitAgent() == null)
			return false;
		return getOmitAgent().omit(words, this);
	}

	/**
	 * Function to apply transformation metadata on a given row.
	 *
	 * @param words string array of words
	 * @return string array of transformed words
	 */
	public String[] apply( String[] words ) {
		words = getMVImputeAgent().apply(words);
		words = getRecodeAgent().apply(words);
		words = getBinAgent().apply(words);
		words = getDummycodeAgent().apply(words);
		_numTransformedRows++;
		return words;
	}

	public void check(String[] words) throws DMLRuntimeException
	{
		boolean checkEmptyString = ( getNAStrings() != null );
		if ( checkEmptyString ) {
			final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
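			// scan all columns; if an empty value slipped through the transformations,
			// report it with the column ID mapped via the dummycode agent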
			for(int i=0; i<words.length; i++)
				if ( words[i] != null && words[i].equals("") )
					throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
		}
	}

	public String checkAndPrepOutputString(String[] words) throws DMLRuntimeException {
		return checkAndPrepOutputString(words, new StringBuilder());
	}

	public String checkAndPrepOutputString(String[] words, StringBuilder sb) throws DMLRuntimeException
	{
		/*
		 * Check if empty strings ("") have to be handled.
		 *
		 * Unless na.strings are provided, empty strings are (implicitly) considered as value zero.
		 * When na.strings are provided, then "" is considered a missing value indicator, and the
		 * user is expected to provide an appropriate imputation method. Therefore, when na.strings
		 * are provided, "" encountered in any column (after all transformations are applied)
		 * denotes an erroneous condition.
		 */
		boolean checkEmptyString = ( getNAStrings() != null ); //&& !MVImputeAgent.isNA("", TransformationAgent.NAstrings) ) {

		//StringBuilder sb = new StringBuilder();
		sb.setLength(0);
		int i = 0;

		if ( checkEmptyString ) {
			final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";

			if ( words[0] != null ) {
				if ( words[0].equals("") )
					throw new DMLRuntimeException( msg + getDummycodeAgent().mapDcdColumnID(1));
				else
					sb.append(words[0]);
			}
			else
				sb.append("0");

			for(i=1; i<words.length; i++) {
				sb.append(_delimString);
				if ( words[i] != null ) {
					if ( words[i].equals("") )
						throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
					else
						sb.append(words[i]);
				}
				else
					sb.append("0");
			}
		}
		else {
			sb.append(words[0] != null ? words[0] : "0");
			for(i=1; i<words.length; i++) {
				sb.append(_delimString);
				sb.append(words[i] != null ? words[i] : "0");
			}
		}

		return sb.toString();
	}

	private Reader initOffsetsReader(JobConf job) throws IOException {
		Path path = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
		FileSystem fs = FileSystem.get(job);

		Path[] files = MatrixReader.getSequenceFilePaths(fs, path);
		if ( files.length != 1 )
			throw new IOException("Expecting a single file under counters file: " + path.toString());

		Reader reader = new SequenceFile.Reader(fs, files[0], job);
		return reader;
	}

	/**
	 * Function to generate custom file names (transform-part-.....) for
	 * mappers' output for ApplyTfCSV job. The idea is to find the index
	 * of (thisfile, fileoffset) in the list of all offsets from the
	 * counters/offsets file, which was generated from either GenTfMtdMR
	 * or AssignRowIDMR job.
	 *
	 * @param job job configuration
	 * @param offset file offset
	 * @return part file id (i.e., 00001, 00002, etc.)
	 * @throws IOException if IOException occurs
	 */
	public String getPartFileID(JobConf job, long offset) throws IOException
	{
		Reader reader = null;
		int id = 0;
		try {
			reader = initOffsetsReader(job);
			ByteWritable key = new ByteWritable();
			OffsetCount value = new OffsetCount();
			String thisFile = TfUtils.getPartFileName(job);
			while (reader.next(key, value)) {
				if ( thisFile.equals(value.filename) && value.fileOffset == offset )
					break;
				id++;
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}

		// left-pad the numeric index with zeros to a fixed width of 5 characters (e.g., 7 -> "00007")
		String sid = Integer.toString(id);
		char[] carr = new char[5-sid.length()];
		Arrays.fill(carr, '0');
		String ret = (new String(carr)).concat(sid);

		return ret;
	}
}