/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.sysml.runtime.transform.TfUtils;
import org.apache.sysml.runtime.util.LocalFileUtils;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * Static helper functions for reading and writing data: closing resources,
 * splitting delimited (CSV) lines, estimating sizes, converting between
 * strings and streams, and small Hadoop file system utilities.
 */
public class IOUtilFunctions
{
    // log under this class' own name so messages are attributed correctly
    private static final Log LOG = LogFactory.getLog(IOUtilFunctions.class.getName());

    private static final char CSV_QUOTE_CHAR = '"';

    /**
     * Closes the given resource and logs (but does not propagate) any
     * exception raised while closing.
     *
     * @param io resource to close, may be null
     */
    public static void closeSilently( Closeable io ) {
        try {
            if( io != null )
                io.close();
        }
        catch (Exception ex) {
            LOG.error("Failed to close IO resource.", ex);
        }
    }

    /**
     * Closes the given record reader and logs (but does not propagate) any
     * exception raised while closing.
     *
     * @param rr record reader to close, may be null
     */
    public static void closeSilently( RecordReader<?,?> rr ) {
        try {
            if( rr != null )
                rr.close();
        }
        catch (Exception ex) {
            LOG.error("Failed to close record reader.", ex);
        }
    }

    /**
     * Parses the given string as a double value.
     *
     * @param str string to parse
     * @return parsed double value
     */
    public static double parseDoubleParallel( String str ) {
        //return FloatingDecimal.parseDouble(str);
        return Double.parseDouble(str);
    }

    /**
     * Raises an error if an empty field was found although filling of
     * empty fields is disabled.
     *
     * @param row the affected row, appended to the error message if not null
     * @param fill true if empty fields are allowed
     * @param emptyFound true if an empty field has been encountered
     * @throws IOException if an empty field was found and fill is false
     */
    public static void checkAndRaiseErrorCSVEmptyField(String row, boolean fill, boolean emptyFound)
        throws IOException
    {
        if ( !fill && emptyFound) {
            throw new IOException("Empty fields found in delimited file. "
                + "Use \"fill\" option to read delimited files with empty fields:" + ((row!=null)?row:""));
        }
    }

    /**
     * Raises an error if the number of tokens of a line does not match the
     * expected number of columns.
     *
     * @param fname file name, used for the error message only
     * @param line the original line, used for the error message only
     * @param parts tokens of the line
     * @param ncol expected number of columns
     * @throws IOException if {@code parts.length} differs from {@code ncol}
     */
    public static void checkAndRaiseErrorCSVNumColumns(String fname, String line, String[] parts, long ncol)
        throws IOException
    {
        int realncol = parts.length;
        if( realncol != ncol ) {
            throw new IOException("Invalid number of columns (" + realncol + ", expected=" + ncol + ") "
                + "found in delimited file (" + fname + ") for line: " + line);
        }
    }

    /**
     * Splits a string by a specified delimiter into all tokens, including empty.
     * NOTE: This method is meant as a faster drop-in replacement of the regular
     * string split.
     *
     * @param str string to split
     * @param delim delimiter
     * @return string array
     */
    public static String[] split(String str, String delim)
    {
        //split by whole separator required for multi-character delimiters, preserve
        //all tokens required for empty cells and in order to keep cell alignment
        return StringUtils.splitByWholeSeparatorPreserveAllTokens(str, delim);
    }

    /**
     * Splits a string by a specified delimiter into all tokens, including empty
     * while respecting the rules for quotes and escapes defined in RFC4180,
     * with robustness for various special cases. Quoted tokens are returned
     * including their enclosing quotes. A token that starts with a quote but
     * has no matching closing quote is treated like an unquoted token.
     *
     * @param str string to split
     * @param delim delimiter
     * @return string array of tokens
     */
    public static String[] splitCSV(String str, String delim)
    {
        // check for empty input
        if( str == null || str.isEmpty() )
            return new String[]{""};

        // scan string and create individual tokens
        ArrayList<String> tokens = new ArrayList<String>();
        int from = 0;
        int len = str.length();
        int dlen = delim.length();
        while( from < len ) { // for all tokens
            // slice out token and advance position
            int to = findTokenEndCSV(str, delim, from);
            tokens.add(str.substring(from, to));
            from = to + dlen;
        }

        // handle empty string at end
        if( from == len )
            tokens.add("");

        // return tokens
        return tokens.toArray(new String[0]);
    }

    /**
     * Splits a string by a specified delimiter into all tokens, including empty
     * while respecting the rules for quotes and escapes defined in RFC4180,
     * with robustness for various special cases. Tokens are written into the
     * given array, which avoids allocating a new array per line.
     *
     * NOTE: for null or empty input, a new single-element array holding the
     * empty string is returned and the given array is left untouched.
     *
     * @param str string to split
     * @param delim delimiter
     * @param tokens array for tokens, length needs to match the number of tokens
     * @return string array of tokens
     */
    public static String[] splitCSV(String str, String delim, String[] tokens)
    {
        // check for empty input
        if( str == null || str.isEmpty() )
            return new String[]{""};

        // scan string and create individual tokens
        int from = 0;
        int len = str.length();
        int dlen = delim.length();
        int pos = 0;
        while( from < len ) { // for all tokens
            // slice out token and advance position
            int to = findTokenEndCSV(str, delim, from);
            tokens[pos++] = str.substring(from, to);
            from = to + dlen;
        }

        // handle empty string at end
        if( from == len )
            tokens[pos] = "";

        // return tokens
        return tokens;
    }

    /**
     * Counts the number of tokens defined by the given delimiter, respecting
     * the rules for quotes and escapes defined in RFC4180,
     * with robustness for various special cases.
     *
     * @param str string to split
     * @param delim delimiter
     * @return number of tokens split by the given delimiter
     */
    public static int countTokensCSV(String str, String delim)
    {
        // check for empty input
        if( str == null || str.isEmpty() )
            return 1;

        // scan string and compute num tokens
        int numTokens = 0;
        int from = 0;
        int len = str.length();
        int dlen = delim.length();
        while( from < len ) { // for all tokens
            // increase counter and advance position
            from = findTokenEndCSV(str, delim, from) + dlen;
            numTokens++;
        }

        // handle empty string at end
        if( from == len )
            numTokens++;

        // return number of tokens
        return numTokens;
    }

    /**
     * Finds the end (exclusive) of the token that starts at the given position.
     * This is the single scanner shared by both splitCSV variants and
     * countTokensCSV so that they always agree on token boundaries.
     *
     * @param str non-empty string to scan
     * @param delim delimiter
     * @param from start position of the token, {@code 0 <= from < str.length()}
     * @return end position of the token in {@code [from, str.length()]}
     */
    private static int findTokenEndCSV(String str, String delim, int from)
    {
        final int len = str.length();
        final int dlen = delim.length();

        if( str.charAt(from) == CSV_QUOTE_CHAR ) {
            int close = str.indexOf(CSV_QUOTE_CHAR, from+1);
            // handle escaped inner quotes, e.g. "aa""a"; stop if the quotes
            // are unbalanced (otherwise -1 would restart the scan at position
            // 0 and either loop forever or yield an end before the start)
            while( close >= 0 && close+1 < len && str.charAt(close+1)==CSV_QUOTE_CHAR )
                close = str.indexOf(CSV_QUOTE_CHAR, close+2); // skip ""
            if( close >= 0 ) {
                int to = close + 1; // position after last "
                // handle remaining non-quoted characters "aa"a, including
                // a single trailing character at the very end of the string
                if( to<len && !str.regionMatches(to, delim, 0, dlen) )
                    to = str.indexOf(delim, to+1);
                return (to >= 0) ? to : len;
            }
            // no closing quote: fall through and treat as unquoted token
        }

        if( str.regionMatches(from, delim, 0, dlen) )
            return from; // empty string

        // default: unquoted non-empty
        int to = str.indexOf(delim, from+1);
        return (to >= 0) ? to : len;
    }

    /**
     * Returns the number of non-zero entries but avoids the expensive
     * string to double parsing. This function is guaranteed to never
     * underestimate.
     *
     * @param cols string array
     * @return number of non-zeros
     */
    public static int countNnz(String[] cols) {
        return countNnz(cols, 0, cols.length);
    }

    /**
     * Returns the number of non-zero entries but avoids the expensive
     * string to double parsing. This function is guaranteed to never
     * underestimate.
     *
     * @param cols string array
     * @param pos starting array index
     * @param len number of entries to inspect, starting at pos
     * @return number of non-zeros
     */
    public static int countNnz(String[] cols, int pos, int len) {
        int lnnz = 0;
        for( int i=pos; i<pos+len; i++ ) {
            String col = cols[i];
            // only the literals "", "0", and "0.0" are counted as zero
            lnnz += (!col.isEmpty() && !col.equals("0")
                && !col.equals("0.0")) ? 1 : 0;
        }
        return lnnz;
    }

    /**
     * Returns the serialized size in bytes of the given string value,
     * following the modified UTF-8 specification as used by Java's
     * DataInput/DataOutput.
     *
     * see java docs: docs/api/java/io/DataInput.html#modified-utf-8
     *
     * @param value string value
     * @return string size for modified UTF-8 specification
     */
    public static int getUTFSize(String value) {
        if( value == null )
            return 2;
        //size in modified UTF-8 as used by DataInput/DataOutput
        int size = 2; //length in bytes
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            //note: the null character falls into the two-byte case
            size += ( c>=0x0001 && c<=0x007F) ? 1 :
                (c >= 0x0800) ? 3 : 2;
        }
        return size;
    }

    /**
     * Wraps the UTF-8 encoded bytes of the given string into an input stream.
     *
     * @param input string value, may be null
     * @return input stream over the UTF-8 bytes, or null for null input
     * @throws IOException if the encoding is not supported
     */
    public static InputStream toInputStream(String input) throws IOException {
        if( input == null )
            return null;
        return new ByteArrayInputStream(input.getBytes("UTF-8"));
    }

    /**
     * Reads the given input stream until its end, decodes the bytes as UTF-8,
     * and closes the stream.
     *
     * @param input input stream, may be null
     * @return decoded string, or null for null input
     * @throws IOException if reading from the stream fails
     */
    public static String toString(InputStream input) throws IOException {
        if( input == null )
            return null;
        try {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] buff = new byte[LocalFileUtils.BUFFER_SIZE];
            for( int len=0; (len=input.read(buff))!=-1; )
                bos.write(buff, 0, len);
            return bos.toString("UTF-8");
        }
        finally {
            IOUtilFunctions.closeSilently(input);
        }
    }

    /**
     * Sorts file splits in place, lexicographically by their path. Input that
     * is null, empty, or whose first element is not a file split is returned
     * unchanged.
     *
     * @param splits input splits
     * @return the given array
     */
    public static InputSplit[] sortInputSplits(InputSplit[] splits) {
        // nothing to sort, and avoid indexing into an empty array
        if( splits == null || splits.length == 0 )
            return splits;

        if (splits[0] instanceof FileSplit) {
            // The splits do not always arrive in order by file name.
            // Sort the splits lexicographically by path so that the header will
            // be in the first split.
            // Note that we're assuming that the splits come in order by offset
            Arrays.sort(splits, new Comparator<InputSplit>() {
                @Override
                public int compare(InputSplit o1, InputSplit o2) {
                    Path p1 = ((FileSplit) o1).getPath();
                    Path p2 = ((FileSplit) o2).getPath();
                    return p1.toString().compareTo(p2.toString());
                }
            });
        }
        return splits;
    }

    /**
     * Counts the number of columns in a given collection of csv file splits. This primitive aborts
     * if a row with more than 0 columns is found and hence is robust against empty file splits etc.
     *
     * @param splits input splits
     * @param informat input format
     * @param job job configuration
     * @param delim delimiter
     * @return the number of columns in the collection of csv file splits, or -1 if no non-empty row was found
     * @throws IOException if IOException occurs
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat, JobConf job, String delim )
        throws IOException
    {
        LongWritable key = new LongWritable();
        Text value = new Text();
        int ncol = -1;
        for( int i=0; i<splits.length && ncol<=0; i++ ) {
            RecordReader<LongWritable, Text> reader =
                informat.getRecordReader(splits[i], job, Reporter.NULL);
            try {
                if( reader.next(key, value) ) {
                    boolean hasValue = true;
                    // skip up to two leading lines that carry the TfUtils prefixes
                    if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
                        hasValue = reader.next(key, value);
                    if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
                        hasValue = reader.next(key, value);
                    String row = value.toString().trim();
                    if( hasValue && !row.isEmpty() ) {
                        ncol = IOUtilFunctions.countTokensCSV(row, delim);
                    }
                }
            }
            finally {
                closeSilently(reader);
            }
        }
        return ncol;
    }

    /**
     * Delete the CRC files from the local file system associated with a
     * particular file and its metadata file.
     *
     * @param fs
     *            the file system
     * @param path
     *            the path to a file
     * @throws IOException
     *             thrown if error occurred attempting to delete crc files
     */
    public static void deleteCrcFilesFromLocalFileSystem(FileSystem fs, Path path) throws IOException {
        if (fs instanceof LocalFileSystem) {
            Path fnameCrc = new Path(path.getParent(), "." + path.getName() + ".crc");
            fs.delete(fnameCrc, false);
            Path fnameMtdCrc = new Path(path.getParent(), "." + path.getName() + ".mtd.crc");
            fs.delete(fnameMtdCrc, false);
        }
    }
}