/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.EOFException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;

@SuppressWarnings("deprecation")
public class TfUtils implements Serializable
{
	private static final long serialVersionUID = 526252850872633125L;

	protected enum ColumnTypes {
		SCALE, NOMINAL, ORDINAL, DUMMYCODED;

		protected byte toID() {
			switch(this) {
				case SCALE:   return 1;
				case NOMINAL: return 2;
				case ORDINAL: return 3;
				// Ideally, dummycoded columns should be of a different type.
				// Treating them as SCALE is incorrect, semantically.
				case DUMMYCODED: return 1;
				default:
					throw new RuntimeException("Invalid Column Type: " + this);
			}
		}
	}

	//transform methods
	public static final String TXMETHOD_IMPUTE    = "impute";
	public static final String TXMETHOD_RECODE    = "recode";
	public static final String TXMETHOD_BIN       = "bin";
	public static final String TXMETHOD_DUMMYCODE = "dummycode";
	public static final String TXMETHOD_SCALE     = "scale";
	public static final String TXMETHOD_OMIT      = "omit";
	public static final String TXMETHOD_MVRCD     = "mvrcd";

	//transform meta data constants (frame-based transform)
	public static final String TXMTD_MVPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"MV";
	public static final String TXMTD_NDPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"ND";

	//transform meta data constants (old file-based transform)
	public static final String TXMTD_SEP                 = ",";
	public static final String TXMTD_COLTYPES            = "coltypes.csv";
	public static final String TXMTD_COLNAMES            = "column.names";
	public static final String TXMTD_DC_COLNAMES         = "dummycoded.column.names";
	public static final String TXMTD_RCD_MAP_SUFFIX      = ".map";
	public static final String TXMTD_RCD_DISTINCT_SUFFIX = ".ndistinct";
	public static final String TXMTD_BIN_FILE_SUFFIX     = ".bin";
	public static final String TXMTD_MV_FILE_SUFFIX      = ".impute";

	public static final String JSON_ATTRS  = "attributes";
	public static final String JSON_MTHD   = "methods";
	public static final String JSON_CONSTS = "constants";
	public static final String JSON_NBINS  = "numbins";

	protected static final String MODE_FILE_SUFFIX  = ".mode";
	protected static final String SCALE_FILE_SUFFIX = ".scale";
	protected static final String DCD_FILE_NAME     = "dummyCodeMaps.csv";
	protected static final String DCD_NAME_SEP      = "_";

	private OmitAgent _oa = null;
	private MVImputeAgent _mia = null;
	private RecodeAgent _ra = null;
	private BinAgent _ba = null;
	private DummycodeAgent _da = null;

	private long _numRecordsInPartFile;  // Total number of records in the data file
	private long _numValidRecords;       // (_numRecordsInPartFile - #of omitted records)
	private long _numTransformedRows;    // Number of rows after applying transformations
	private long _numTransformedColumns; // Number of columns after applying transformations

	private String _headerLine = null;
	private boolean _hasHeader;
	private Pattern _delim = null;
	private String _delimString = null;
	private String[] _NAstrings = null;
	private String[] _outputColumnNames = null;
	private int _numInputCols = -1;

	private String _tfMtdDir = null;
	private String _spec = null;
	private String _offsetFile = null;
	private String _tmpDir = null;
	private String _outputPath = null;

	public TfUtils(JobConf job, boolean minimal)
		throws IOException, JSONException
	{
		if( !InfrastructureAnalyzer.isLocalMode(job) ) {
			ConfigurationManager.setCachedJobConf(job);
		}
		_NAstrings = TfUtils.parseNAStrings(job);
		_spec = job.get(MRJobConfiguration.TF_SPEC);
		_oa = new OmitAgent(new JSONObject(_spec), null, -1);
	}

	// called from GenTFMtdMapper, ApplyTf (Hadoop)
	public TfUtils(JobConf job)
		throws IOException, JSONException
	{
		if( !InfrastructureAnalyzer.isLocalMode(job) ) {
			ConfigurationManager.setCachedJobConf(job);
		}

		boolean hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
		String[] naStrings = TfUtils.parseNAStrings(job);
		long numCols = UtilFunctions.parseToLong( job.get(MRJobConfiguration.TF_NUM_COLS) ); // #cols input data
		String spec = job.get(MRJobConfiguration.TF_SPEC);
		String offsetFile = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
		String tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);
		String outputPath
			= FileOutputFormat.getOutputPath(job).toString();

		JSONObject jspec = new JSONObject(spec);
		init(job.get(MRJobConfiguration.TF_HEADER), hasHeader, job.get(MRJobConfiguration.TF_DELIM),
			naStrings, jspec, numCols, offsetFile, tmpPath, outputPath);
	}

	// called from GenTfMtdReducer
	public TfUtils(JobConf job, String tfMtdDir)
		throws IOException, JSONException
	{
		this(job);
		_tfMtdDir = tfMtdDir;
	}

	// called from GenTFMtdReducer and ApplyTf (Spark)
	public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
			long ncol, String tfMtdDir, String offsetFile, String tmpPath)
		throws IOException, JSONException
	{
		init(headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
		_tfMtdDir = tfMtdDir;
	}

	protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err)
		throws IOException
	{
		// check non-existing file
		if( !fs.exists(path) ) {
			if( err )
				throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
			else
				return false;
		}

		// check for empty file
		if( MapReduceTool.isFileEmpty(fs, path.toString()) ) {
			if( err )
				throw new EOFException("Empty input file " + path.toString() + ".");
			else
				return false;
		}

		return true;
	}

	public static String getPartFileName(JobConf job) throws IOException {
		FileSystem fs = FileSystem.get(job);
		Path thisPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE)).makeQualified(fs);
		return thisPath.toString();
	}

	public static boolean isPartFileWithHeader(JobConf job) throws IOException {
		FileSystem fs = FileSystem.get(job);
		String thisfile = getPartFileName(job);
		Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
		return thisfile.equals(smallestFilePath.toString());
	}

	/**
	 * Prepare NA strings so that they can be sent to workers via JobConf.
	 * A "dummy" string is added at the end to handle the case of empty strings.
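	 * For example (illustrative values only, since the actual value of
	 * DataExpression.DELIM_NA_STRING_SEP is defined elsewhere): if the separator were "::",
	 * prepNAStrings("NA::?") would return "NA::?::dummy", and parseNAStrings on that result
	 * would yield {"NA", "?", "dummy"}.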
	 * @param na NA string
	 * @return NA string concatenated with NA string separator concatenated with "dummy"
	 */
	public static String prepNAStrings(String na) {
		return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
	}

	public static String[] parseNAStrings(String na)
	{
		if ( na == null )
			return null;

		String[] tmp = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP)).split(na, -1);
		return tmp; //Arrays.copyOf(tmp, tmp.length-1);
	}

	public static String[] parseNAStrings(JobConf job) {
		return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
	}

	private void createAgents(JSONObject spec, String[] naStrings) throws IOException, JSONException {
		_oa  = new OmitAgent(spec, _outputColumnNames, _numInputCols);
		_mia = new MVImputeAgent(spec, null, naStrings, _numInputCols);
		_ra  = new RecodeAgent(spec, _outputColumnNames, _numInputCols);
		_ba  = new BinAgent(spec, _outputColumnNames, _numInputCols);
		_da  = new DummycodeAgent(spec, _outputColumnNames, _numInputCols);
	}

	private void parseColumnNames() {
		_outputColumnNames = _delim.split(_headerLine, -1);
		for(int i=0; i < _outputColumnNames.length; i++)
			_outputColumnNames[i] = UtilFunctions.unquote(_outputColumnNames[i]);
	}

	private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
			long numCols, String offsetFile, String tmpPath, String outputPath)
		throws IOException, JSONException
	{
		_numRecordsInPartFile = 0;
		_numValidRecords = 0;
		_numTransformedRows = 0;
		_numTransformedColumns = 0;

		//TODO: fix hard-wired header propagation to meta data column names
		_headerLine = headerLine;
		_hasHeader = hasHeader;
		_delimString = delim;
		_delim = Pattern.compile(Pattern.quote(delim));
		_NAstrings = naStrings;
		_numInputCols = (int)numCols;
		_offsetFile = offsetFile;
		_tmpDir = tmpPath;
		_outputPath = outputPath;

		parseColumnNames();
		createAgents(spec, naStrings);
	}

	public void incrValid() { _numValidRecords++; }
	public long getValid() { return _numValidRecords; }
	public long getTotal() { return _numRecordsInPartFile; }
	public long getNumTransformedRows() { return _numTransformedRows; }
	public long getNumTransformedColumns() { return _numTransformedColumns; }

	public String getHeader() { return _headerLine; }
	public boolean hasHeader() { return _hasHeader; }
	public String getDelimString() { return _delimString; }
	public Pattern getDelim() { return _delim; }
	public String[] getNAStrings() { return _NAstrings; }
	public long getNumCols() { return _numInputCols; }
	public String getSpec() { return _spec; }
	public String getTfMtdDir() { return _tfMtdDir; }
	public String getOffsetFile() { return _offsetFile; }
	public String getTmpDir() { return _tmpDir; }
	public String getOutputPath() { return _outputPath; }
	public String getName(int colID) { return _outputColumnNames[colID-1]; }

	public void setValid(long n) { _numValidRecords = n; }
	public void incrTotal() { _numRecordsInPartFile++; }
	public void setTotal(long n) { _numRecordsInPartFile = n; }

	public OmitAgent getOmitAgent() { return _oa; }
	public MVImputeAgent getMVImputeAgent() { return _mia; }
	public RecodeAgent getRecodeAgent() { return _ra; }
	public BinAgent getBinAgent() { return _ba; }
	public DummycodeAgent getDummycodeAgent() { return _da; }

	/**
	 * Function that checks if the given string is one of NA strings.
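	 * For example, isNA(new String[]{"NA", "?"}, "?") returns true, while isNA(null, w)
	 * returns false for any w because no NA strings are configured.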
	 *
	 * @param NAstrings array of NA strings
	 * @param w string to check
	 * @return true if w is an NA string
	 */
	public static boolean isNA(String[] NAstrings, String w) {
		if(NAstrings == null)
			return false;

		for(String na : NAstrings) {
			if(w.equals(na))
				return true;
		}
		return false;
	}

	public String[] getWords(Text line) {
		return getWords(line.toString());
	}

	public String[] getWords(String line) {
		return getDelim().split(line.trim(), -1);
	}

	/**
	 * Process a given row to construct transformation metadata.
	 *
	 * @param line string to break into words
	 * @return string array of words from the line
	 * @throws IOException if IOException occurs
	 */
	public String[] prepareTfMtd(String line) throws IOException {
		String[] words = getWords(line);
		if(!getOmitAgent().omit(words, this)) {
			getMVImputeAgent().prepare(words);
			getRecodeAgent().prepare(words, this);
			getBinAgent().prepare(words, this);
			incrValid();
		}
		incrTotal();
		return words;
	}

	public void loadTfMetadata() throws IOException {
		JobConf job = ConfigurationManager.getCachedJobConf();
		loadTfMetadata(job, false);
	}

	public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException {
		Path tfMtdDir = null;
		FileSystem fs = null;

		if(fromLocalFS) {
			// metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
			tfMtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
			fs = FileSystem.getLocal(job);
		}
		else {
			fs = FileSystem.get(job);
			tfMtdDir = new Path(getTfMtdDir());
		}

		// load transformation metadata
		getMVImputeAgent().loadTxMtd(job, fs, tfMtdDir, this);
		getRecodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
		getBinAgent().loadTxMtd(job, fs, tfMtdDir, this);

		// associate recode maps and bin definitions with dummycoding agent,
		// as recoded and binned columns are typically dummycoded
		getDummycodeAgent().setRecodeMaps( getRecodeAgent().getRecodeMaps() );
		getDummycodeAgent().setNumBins(getBinAgent().getColList(), getBinAgent().getNumBins());
		getDummycodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
	}

	public String processHeaderLine() throws IOException {
		//TODO: fix hard-wired header propagation to meta data column names
		FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
		String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
		getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);

		// write header information (before and after transformation) to temporary path
		// these files are copied into txMtdPath, once the ApplyTf job is complete.
		DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);

		return dcdHeader;
		//_numTransformedColumns = getDelim().split(dcdHeader, -1).length;
		//return _numTransformedColumns;
	}

	public boolean omit(String[] words) {
		if(getOmitAgent() == null)
			return false;
		return getOmitAgent().omit(words, this);
	}

	/**
	 * Function to apply transformation metadata on a given row.
	 *
	 * @param words string array of words
	 * @return string array of transformed words
	 */
	public String[] apply( String[] words ) {
		words = getMVImputeAgent().apply(words);
		words = getRecodeAgent().apply(words);
		words = getBinAgent().apply(words);
		words = getDummycodeAgent().apply(words);
		_numTransformedRows++;
		return words;
	}

	public void check(String[] words) throws DMLRuntimeException
	{
		boolean checkEmptyString = ( getNAStrings() != null );
		if ( checkEmptyString ) {
			final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
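			// scan all columns; if an empty value slipped through the transformations,
			// report it with the column ID mapped via the dummycode agent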
			for(int i=0; i<words.length; i++)
				if ( words[i] != null && words[i].equals("") )
					throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
		}
	}

	public String checkAndPrepOutputString(String[] words) throws DMLRuntimeException {
		return checkAndPrepOutputString(words, new StringBuilder());
	}

	public String checkAndPrepOutputString(String[] words, StringBuilder sb) throws DMLRuntimeException
	{
		/*
		 * Check if empty strings ("") have to be handled.
		 *
		 * Unless na.strings are provided, empty strings are (implicitly) considered as value zero.
		 * When na.strings are provided, then "" is considered a missing value indicator, and the
		 * user is expected to provide an appropriate imputation method. Therefore, when na.strings
		 * are provided, "" encountered in any column (after all transformations are applied)
		 * denotes an erroneous condition.
		 */
		boolean checkEmptyString = ( getNAStrings() != null ); //&& !MVImputeAgent.isNA("", TransformationAgent.NAstrings) ) {

		//StringBuilder sb = new StringBuilder();
		sb.setLength(0);
		int i = 0;

		if ( checkEmptyString ) {
			final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";

			if ( words[0] != null ) {
				if ( words[0].equals("") )
					throw new DMLRuntimeException( msg + getDummycodeAgent().mapDcdColumnID(1));
				else
					sb.append(words[0]);
			}
			else
				sb.append("0");

			for(i=1; i<words.length; i++) {
				sb.append(_delimString);
				if ( words[i] != null ) {
					if ( words[i].equals("") )
						throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i+1));
					else
						sb.append(words[i]);
				}
				else
					sb.append("0");
			}
		}
		else {
			sb.append(words[0] != null ? words[0] : "0");
			for(i=1; i<words.length; i++) {
				sb.append(_delimString);
				sb.append(words[i] != null ? words[i] : "0");
			}
		}

		return sb.toString();
	}

	private Reader initOffsetsReader(JobConf job) throws IOException {
		Path path = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
		FileSystem fs = FileSystem.get(job);

		Path[] files = MatrixReader.getSequenceFilePaths(fs, path);
		if ( files.length != 1 )
			throw new IOException("Expecting a single file under counters file: " + path.toString());

		Reader reader = new SequenceFile.Reader(fs, files[0], job);
		return reader;
	}

	/**
	 * Function to generate custom file names (transform-part-.....) for
	 * mappers' output for ApplyTfCSV job. The idea is to find the index
	 * of (thisfile, fileoffset) in the list of all offsets from the
	 * counters/offsets file, which was generated from either GenTfMtdMR
	 * or AssignRowIDMR job.
	 *
	 * @param job job configuration
	 * @param offset file offset
	 * @return part file id (i.e., 00001, 00002, etc.)
	 * @throws IOException if IOException occurs
	 */
	public String getPartFileID(JobConf job, long offset) throws IOException
	{
		Reader reader = null;
		int id = 0;
		try {
			reader = initOffsetsReader(job);
			ByteWritable key = new ByteWritable();
			OffsetCount value = new OffsetCount();
			String thisFile = TfUtils.getPartFileName(job);
			while (reader.next(key, value)) {
				if ( thisFile.equals(value.filename) && value.fileOffset == offset )
					break;
				id++;
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}

		// left-pad the numeric index with zeros to a fixed width of 5 characters (e.g., 7 -> "00007")
		String sid = Integer.toString(id);
		char[] carr = new char[5-sid.length()];
		Arrays.fill(carr, '0');
		String ret = (new String(carr)).concat(sid);

		return ret;
	}
}