/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package com.ibm.bi.dml.udf.lib;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.udf.FunctionParameter;
import com.ibm.bi.dml.udf.Matrix;
import com.ibm.bi.dml.udf.Matrix.ValueType;
import com.ibm.bi.dml.udf.PackageFunction;

/**
 * Deprecated external function that removes empty rows from a matrix stored
 * in sparse textcell ("i j v") format on HDFS. The cells are streamed through
 * once and rewritten with new contiguous row indices, assigned in order of
 * first appearance, so rows without any stored cells are dropped.
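 *
 * For illustration, a hypothetical input in which row 2 holds no cells:
 * <pre>
 *   input (i j v)     output (i j v)
 *   1 1 7.0           1 1 7.0
 *   3 2 5.0           2 2 5.0
 *   3 1 3.0           2 1 3.0
 * </pre>
 *
 * @deprecated presumably superseded by the builtin removeEmpty() operation
 */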
@Deprecated
public class RemoveEmptyRows extends PackageFunction
{
private static final long serialVersionUID = 1L;
private static final String OUTPUT_FILE = "TMP";
	private Matrix _ret; //single output matrix, (re)assigned by execute()
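
	/**
	 * @return the number of outputs of this function (always a single matrix)
	 */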
@Override
public int getNumFunctionOutputs()
{
return 1;
}
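
	/**
	 * Returns the single output matrix; the position argument is ignored.
	 */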
@Override
public FunctionParameter getFunctionOutput(int pos)
{
return _ret;
}
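
	/**
	 * Reads the input matrix in textcell format from HDFS, assigns new
	 * contiguous row indices in order of first appearance, and writes the
	 * remapped cells to a temporary HDFS output file.
	 */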
@Override
public void execute()
{
Matrix mat = (Matrix) this.getFunctionInput(0);
String fnameOld = mat.getFilePath();
		HashMap<Long,Long> keyMap = new HashMap<Long,Long>(); //old rowID -> new rowID
try
{
//prepare input
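			//copy the cached JobConf so the shared cached instance is not modified below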
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
Path path = new Path(fnameOld);
FileSystem fs = FileSystem.get(job);
if( !fs.exists(path) )
throw new IOException("File "+fnameOld+" does not exist on HDFS.");
FileInputFormat.addInputPath(job, path);
TextInputFormat informat = new TextInputFormat();
informat.configure(job);
//prepare output
String fnameNew = createOutputFilePathAndName( OUTPUT_FILE );
DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream( fnameNew, true );
			//read the input and write the row-compacted cells
			LongWritable key = new LongWritable();
			Text value = new Text();
			long ID = 1; //next new row index (textcell row indices are 1-based)
			
			try
			{
				//compute splits inside the try so the output stream is closed on failure
				InputSplit[] splits = informat.getSplits(job, 1);
				
				//for object reuse and to prevent repeated buffer re-allocations
				StringBuilder sb = new StringBuilder();
for(InputSplit split: splits)
{
RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
try
{
while( reader.next(key, value) )
{
String cellStr = value.toString().trim();
StringTokenizer st = new StringTokenizer(cellStr, " ");
							long row = Long.parseLong( st.nextToken() );
							long col = Long.parseLong( st.nextToken() );
							double lvalue = Double.parseDouble( st.nextToken() );
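							//assign a new contiguous row index on first encounter of this input row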
if( !keyMap.containsKey( row ) )
keyMap.put(row, ID++);
long rowNew = keyMap.get( row );
sb.append(rowNew);
sb.append(' ');
sb.append(col);
sb.append(' ');
sb.append(lvalue);
sb.append('\n');
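							//writeBytes is sufficient here because the buffer holds only ASCII characters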
ostream.writeBytes( sb.toString() );
sb.setLength(0);
}
}
finally
{
if( reader != null )
reader.close();
}
}
_ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
}
finally
{
if( ostream != null )
ostream.close();
}
}
catch(Exception ex)
{
			throw new RuntimeException( "Unable to execute external function RemoveEmptyRows.", ex );
}
}
}