/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.transform;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR.OffsetCount;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.utils.JSONHelper;
@SuppressWarnings("deprecation")
public class TfUtils implements Serializable {
private static final long serialVersionUID = 526252850872633125L;
private OmitAgent _oa = null;
private MVImputeAgent _mia = null;
private RecodeAgent _ra = null;
private BinAgent _ba = null;
private DummycodeAgent _da = null;
private long _numRecordsInPartFile; // Total number of records in the data file
private long _numValidRecords; // (_numRecordsInPartFile - #of omitted records)
private long _numTransformedRows; // Number of rows after applying transformations
private long _numTransformedColumns; // Number of columns after applying transformations
private String _headerLine = null;
private boolean _hasHeader;
private Pattern _delim = null;
private String _delimString = null;
private String[] _NAstrings = null;
private String[] _outputColumnNames = null;
private long _numInputCols = -1;
private String _tfMtdDir = null;
private String _specFile = null;
private String _offsetFile = null;
private String _tmpDir = null;
private String _outputPath = null;
protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err)
throws IOException {
// check non-existing file
if (!fs.exists(path)) {
if ( err )
throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
else
return false;
}
// check for empty file
if (MapReduceTool.isFileEmpty(fs, path.toString())) {
if ( err )
throw new EOFException("Empty input file " + path.toString() + ".");
else
return false;
}
return true;
}
public static String getPartFileName(JobConf job) throws IOException {
FileSystem fs = FileSystem.get(job);
Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
return thisPath.toString();
}
public static boolean isPartFileWithHeader(JobConf job) throws IOException {
FileSystem fs = FileSystem.get(job);
String thisFile = getPartFileName(job);
// by convention, the header resides in the "smallest" part file path
Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
return thisFile.equals(smallestFilePath.toString());
}
public static JSONObject readSpec(FileSystem fs, String specFile) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFile))));
JSONObject obj = null;
try {
obj = JSONHelper.parse(br);
}
finally {
// close the reader even if parsing fails
br.close();
}
return obj;
}
/**
 * Prepare NA strings so that they can be sent to workers via JobConf.
 * A "dummy" string is appended at the end to handle the case of empty strings.
 *
 * @param na delimiter-separated list of user-provided NA strings
 * @return the same list with a trailing "dummy" entry appended
 */
public static String prepNAStrings(String na) {
return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
}
public static String[] parseNAStrings(String na)
{
if ( na == null )
return null;
String[] tmp = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP)).split(na, -1);
return tmp; //Arrays.copyOf(tmp, tmp.length-1);
}
public static String[] parseNAStrings(JobConf job)
{
return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
}
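/*
 * Illustrative sketch (not part of the class): how the NA-string helpers are
 * typically combined when wiring a job. The "NA"/"?" values below are made up,
 * and the actual separator is whatever DataExpression.DELIM_NA_STRING_SEP defines.
 *
 *   String naList = "NA" + DataExpression.DELIM_NA_STRING_SEP + "?";
 *   job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(naList));
 *   // on the worker side:
 *   String[] naStrings = TfUtils.parseNAStrings(job);   // {"NA", "?", "dummy"}
 */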
private void createAgents(JSONObject spec) throws IOException, JSONException {
_oa = new OmitAgent(spec);
_mia = new MVImputeAgent(spec);
_ra = new RecodeAgent(spec);
_ba = new BinAgent(spec);
_da = new DummycodeAgent(spec, _numInputCols);
}
public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da) {
_oa = oa;
_mia = mia;
_ra = ra;
_ba = ba;
_da = da;
}
private void parseColumnNames() {
_outputColumnNames = _delim.split(_headerLine, -1);
for(int i=0; i < _outputColumnNames.length; i++)
_outputColumnNames[i] = UtilFunctions.unquote(_outputColumnNames[i]);
}
private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long numCols, String offsetFile, String tmpPath, String outputPath) throws IOException, JSONException
{
_numRecordsInPartFile = 0;
_numValidRecords = 0;
_numTransformedRows = 0;
_numTransformedColumns = 0;
_headerLine = headerLine;
_hasHeader = hasHeader;
_delimString = delim;
_delim = Pattern.compile(Pattern.quote(delim));
_NAstrings = naStrings;
_numInputCols = numCols;
_offsetFile = offsetFile;
_tmpDir = tmpPath;
_outputPath = outputPath;
parseColumnNames();
createAgents(spec);
}
// minimal initialization: parse NA strings and the transformation spec, and set up only the omit agent
public TfUtils(JobConf job, boolean minimal)
throws IOException, JSONException
{
if( !InfrastructureAnalyzer.isLocalMode(job) ) {
ConfigurationManager.setCachedJobConf(job);
}
_NAstrings = TfUtils.parseNAStrings(job);
_specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);
FileSystem fs = FileSystem.get(job);
JSONObject spec = TfUtils.readSpec(fs, _specFile);
_oa = new OmitAgent(spec);
}
// called from GenTFMtdMapper, ApplyTf (Hadoop)
public TfUtils(JobConf job)
throws IOException, JSONException
{
if( !InfrastructureAnalyzer.isLocalMode(job) ) {
ConfigurationManager.setCachedJobConf(job);
}
boolean hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
String[] naStrings = TfUtils.parseNAStrings(job);
long numCols = UtilFunctions.parseToLong( job.get(MRJobConfiguration.TF_NUM_COLS) ); // #of columns in input data
String specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);
String offsetFile = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
String tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);
String outputPath = FileOutputFormat.getOutputPath(job).toString();
FileSystem fs = FileSystem.get(job);
JSONObject spec = TfUtils.readSpec(fs, specFile);
init(job.get(MRJobConfiguration.TF_HEADER), hasHeader, job.get(MRJobConfiguration.TF_DELIM), naStrings, spec, numCols, offsetFile, tmpPath, outputPath);
}
// called from GenTfMtdReducer
public TfUtils(JobConf job, String tfMtdDir) throws IOException, JSONException
{
this(job);
_tfMtdDir = tfMtdDir;
}
// called from GenTFMtdReducer and ApplyTf (Spark)
public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long ncol, String tfMtdDir, String offsetFile, String tmpPath) throws IOException, JSONException {
init (headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
_tfMtdDir = tfMtdDir;
}
public void incrValid() { _numValidRecords++; }
public long getValid() { return _numValidRecords; }
public long getTotal() { return _numRecordsInPartFile; }
public long getNumTransformedRows() { return _numTransformedRows; }
public long getNumTransformedColumns() { return _numTransformedColumns; }
public String getHeader() { return _headerLine; }
public boolean hasHeader() { return _hasHeader; }
public String getDelimString() { return _delimString; }
public Pattern getDelim() { return _delim; }
public String[] getNAStrings() { return _NAstrings; }
public long getNumCols() { return _numInputCols; }
public String getSpecFile() { return _specFile; }
public String getTfMtdDir() { return _tfMtdDir; }
public String getOffsetFile() { return _offsetFile; }
public String getTmpDir() { return _tmpDir; }
public String getOutputPath() { return _outputPath; }
public String getName(int colID) { return _outputColumnNames[colID-1]; } // colID is 1-based
public void setValid(long n) { _numValidRecords = n;}
public void incrTotal() { _numRecordsInPartFile++; }
public void setTotal(long n) { _numRecordsInPartFile = n;}
public OmitAgent getOmitAgent() { return _oa; }
public MVImputeAgent getMVImputeAgent(){ return _mia;}
public RecodeAgent getRecodeAgent() { return _ra; }
public BinAgent getBinAgent() { return _ba; }
public DummycodeAgent getDummycodeAgent() { return _da; }
/**
 * Check whether the given string is one of the configured NA strings.
 *
 * @param w string to check
 * @return true if w exactly matches one of the NA strings, false otherwise
 */
public boolean isNA(String w) {
if(_NAstrings == null)
return false;
for(String na : _NAstrings) {
if(w.equals(na))
return true;
}
return false;
}
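/*
 * Illustrative sketch: isNA() does an exact, case-sensitive match against the
 * configured NA strings. With _NAstrings = {"NA", "?", "dummy"} (made-up values):
 *   isNA("NA")  -> true
 *   isNA("na")  -> false   // matching is case-sensitive
 *   isNA("")    -> false   // unless "" is explicitly listed as an NA string
 */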
public String[] getWords(Text line)
{
return getWords(line.toString());
}
public String[] getWords(String line)
{
return getDelim().split(line.trim(), -1);
}
/**
 * Process a given row to construct transformation metadata.
 *
 * @param line a single input row as delimited text
 * @return the row tokenized into individual cells
 * @throws IOException if an agent fails while collecting metadata
 */
public String[] prepareTfMtd(String line) throws IOException {
String[] words = getWords(line);
if(!getOmitAgent().omit(words, this))
{
getMVImputeAgent().prepare(words, this);
getRecodeAgent().prepare(words, this);
getBinAgent().prepare(words, this);
incrValid();
}
incrTotal();
return words;
}
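/*
 * Illustrative sketch (hypothetical driver loop): during the metadata pass each
 * input line is fed to prepareTfMtd(), which tokenizes it, lets the impute/recode/bin
 * agents collect statistics for non-omitted rows, and maintains the valid/total counters.
 *
 *   for(String line : inputLines) {          // inputLines is a made-up variable
 *       tfUtils.prepareTfMtd(line);
 *   }
 *   long omitted = tfUtils.getTotal() - tfUtils.getValid();
 */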
public void loadTfMetadata() throws IOException
{
JobConf job = ConfigurationManager.getCachedJobConf();
loadTfMetadata(job, false);
}
public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException
{
Path tfMtdDir = null;
FileSystem fs = null;
if(fromLocalFS) {
// metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
tfMtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
fs = FileSystem.getLocal(job);
}
else {
fs = FileSystem.get(job);
tfMtdDir = new Path(getTfMtdDir());
}
// load transformation metadata
getMVImputeAgent().loadTxMtd(job, fs, tfMtdDir, this);
getRecodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
getBinAgent().loadTxMtd(job, fs, tfMtdDir, this);
// associate recode maps and bin definitions with dummycoding agent,
// as recoded and binned columns are typically dummycoded
getDummycodeAgent().setRecodeMaps( getRecodeAgent().getRecodeMaps() );
getDummycodeAgent().setNumBins(getBinAgent().getBinList(), getBinAgent().getNumBins());
getDummycodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
}
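/*
 * Illustrative sketch: in a Hadoop mapper/reducer the metadata is shipped through
 * the distributed cache and read from the local file system, whereas elsewhere it
 * is read directly from the directory returned by getTfMtdDir().
 *
 *   tfUtils.loadTfMetadata(job, true);   // job's distributed cache -> local FS
 *   tfUtils.loadTfMetadata(job, false);  // path given by getTfMtdDir()
 */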
public String processHeaderLine() throws IOException
{
FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);
// write header information (before and after transformation) to temporary path
// these files are copied into txMtdPath, once the ApplyTf job is complete.
DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);
return dcdHeader;
}
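/*
 * Illustrative sketch: processHeaderLine() is typically called once per job to
 * materialize the pre- and post-transformation headers under the temporary path;
 * the returned string is the dummycoded header.
 *
 *   String dcdHeader = tfUtils.processHeaderLine();
 *   int numOutputCols = tfUtils.getDelim().split(dcdHeader, -1).length;
 */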
public boolean omit(String[] words) {
if(getOmitAgent() == null)
return false;
return getOmitAgent().omit(words, this);
}
public String[] apply(String[] words) {
return apply(words, false);
}
/**
 * Apply transformation metadata on a given row.
 *
 * @param words row tokenized into individual cells
 * @param optimizeMaps true for the specific case of transform() invoked from CP, to avoid boxing and unboxing in the recode maps
 * @return the transformed row
 */
public String[] apply ( String[] words, boolean optimizeMaps )
{
words = getMVImputeAgent().apply(words, this);
if(optimizeMaps)
// specific case of transform() invoked from CP (to save boxing and unboxing)
words = getRecodeAgent().cp_apply(words, this);
else
words = getRecodeAgent().apply(words, this);
words = getBinAgent().apply(words, this);
words = getDummycodeAgent().apply(words, this);
_numTransformedRows++;
return words;
}
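/*
 * Illustrative sketch (hypothetical apply loop): a row is dropped if the omit agent
 * flags it; otherwise the impute/recode/bin/dummycode agents are applied in order.
 *
 *   String[] words = tfUtils.getWords(line);   // line is a made-up variable
 *   if( !tfUtils.omit(words) ) {
 *       words = tfUtils.apply(words);
 *       String out = tfUtils.checkAndPrepOutputString(words);
 *   }
 */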
public void check(String []words) throws DMLRuntimeException
{
boolean checkEmptyString = ( getNAStrings() != null );
if ( checkEmptyString )
{
final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
for(int i=0; i<words.length; i++)
if ( words[i] != null && words[i].equals(""))
throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
}
}
public String checkAndPrepOutputString(String []words) throws DMLRuntimeException
{
return checkAndPrepOutputString(words, new StringBuilder());
}
public String checkAndPrepOutputString(String []words, StringBuilder sb) throws DMLRuntimeException
{
/*
* Check if empty strings ("") have to be handled.
*
* Unless na.strings are provided, empty strings are (implicitly) considered as value zero.
* When na.strings are provided, then "" is considered a missing value indicator, and the
* user is expected to provide an appropriate imputation method. Therefore, when na.strings
* are provided, "" encountered in any column (after all transformations are applied)
* denotes an erroneous condition.
*/
boolean checkEmptyString = ( getNAStrings() != null );
sb.setLength(0);
int i = 0;
if ( checkEmptyString )
{
final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
if ( words[0] != null ) {
if ( words[0].equals("") )
throw new DMLRuntimeException( msg + getDummycodeAgent().mapDcdColumnID(1));
else
sb.append(words[0]);
}
else {
sb.append("0");
}
for(i=1; i<words.length; i++)
{
sb.append(_delimString);
if ( words[i] != null ) {
if ( words[i].equals("") )
throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
else
sb.append(words[i]);
}
else {
sb.append("0");
}
}
}
else
{
sb.append(words[0] != null ? words[0] : "0");
for(i=1; i<words.length; i++)
{
sb.append(_delimString);
sb.append(words[i] != null ? words[i] : "0");
}
}
return sb.toString();
}
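/*
 * Illustrative example of the empty-string semantics above, with a comma delimiter
 * and made-up values: for words = {"1.0", null, "2.0"}
 *   - without na.strings, the output is "1.0,0,2.0" (null cells become "0");
 *   - with na.strings configured, an empty string "" in any cell raises a
 *     DMLRuntimeException, since it denotes an unimputed missing value.
 */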
private Reader initOffsetsReader(JobConf job) throws IOException
{
Path path=new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
FileSystem fs = FileSystem.get(job);
Path[] files = MatrixReader.getSequenceFilePaths(fs, path);
if ( files.length != 1 )
throw new IOException("Expecting a single file under counters file: " + path.toString());
Reader reader = new SequenceFile.Reader(fs, files[0], job);
return reader;
}
/**
 * Generate custom file names (transform-part-.....) for the mappers'
 * output in the ApplyTfCSV job. The idea is to find the index of
 * (thisfile, fileoffset) in the list of all offsets from the
 * counters/offsets file, which was generated by either the GenTfMtdMR
 * or the AssignRowIDMR job.
 *
 * @param job job configuration
 * @param offset byte offset of the current split within its input file
 * @return zero-padded index identifying this (file, offset) pair
 * @throws IOException if the offsets file cannot be read
 */
public String getPartFileID(JobConf job, long offset) throws IOException
{
Reader reader = initOffsetsReader(job);
ByteWritable key=new ByteWritable();
OffsetCount value=new OffsetCount();
String thisFile = TfUtils.getPartFileName(job);
int id = 0;
while (reader.next(key, value)) {
if ( thisFile.equals(value.filename) && value.fileOffset == offset )
break;
id++;
}
reader.close();
String sid = Integer.toString(id);
char[] carr = new char[5-sid.length()];
Arrays.fill(carr, '0');
String ret = (new String(carr)).concat(sid);
return ret;
}
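/*
 * Illustrative sketch: getPartFileID() maps (input file, byte offset) to a
 * zero-padded sequence number, e.g. the 8th entry in the offsets file yields
 * "00007", which callers can use to build a name such as transform-part-00007
 * (the exact file name is assembled by the caller).
 */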
}