/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.udf.lib;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.udf.FunctionParameter;
import org.apache.sysml.udf.Matrix;
import org.apache.sysml.udf.Matrix.ValueType;
import org.apache.sysml.udf.PackageFunction;

/**
 * External UDF that removes empty rows from a matrix stored in text cell
 * format on HDFS: it streams the input cells, re-numbers the distinct row
 * indices in order of first appearance, and writes the compacted cells to
 * a temporary output file.
 *
 * @deprecated Use the builtin removeEmpty(target=X, margin="rows") instead.
 */
@Deprecated
public class RemoveEmptyRows extends PackageFunction {

	private static final long serialVersionUID = 1L;
	private static final String OUTPUT_FILE = "TMP";

	private Matrix _ret;

	@Override
	public int getNumFunctionOutputs() {
		return 1;
	}

	@Override
	public FunctionParameter getFunctionOutput(int pos) {
		return _ret;
	}

	@Override
	public void execute() {
		Matrix mat = (Matrix) this.getFunctionInput(0);
		String fnameOld = mat.getFilePath();

		HashMap<Long,Long> keyMap = new HashMap<Long,Long>(); //old,new rowID

		try {
			//prepare input
			JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
			Path path = new Path(fnameOld);
			FileSystem fs = FileSystem.get(job);
			if( !fs.exists(path) )
				throw new IOException("File "+fnameOld+" does not exist on HDFS.");
			FileInputFormat.addInputPath(job, path);
			TextInputFormat informat = new TextInputFormat();
			informat.configure(job);

			//prepare output
			String fnameNew = createOutputFilePathAndName( OUTPUT_FILE );
			DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream( fnameNew, true );

			//read and write if necessary
			InputSplit[] splits = informat.getSplits(job, 1);

			LongWritable key = new LongWritable();
			Text value = new Text();
			long ID = 1;

			try {
				//for obj reuse and preventing repeated buffer re-allocations
				StringBuilder sb = new StringBuilder();

				for( InputSplit split : splits ) {
					RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
					try {
						while( reader.next(key, value) ) {
							//parse one text cell: "<row> <col> <value>"
							String cellStr = value.toString().trim();
							StringTokenizer st = new StringTokenizer(cellStr, " ");
							long row = Long.parseLong( st.nextToken() );
							long col = Long.parseLong( st.nextToken() );
							double lvalue = Double.parseDouble( st.nextToken() );

							//assign a dense new row ID on first sight of each source row
							if( !keyMap.containsKey( row ) )
								keyMap.put(row, ID++);
							long rowNew = keyMap.get( row );

							sb.append(rowNew);
							sb.append(' ');
							sb.append(col);
							sb.append(' ');
							sb.append(lvalue);
							sb.append('\n');

							ostream.writeBytes( sb.toString() );
							sb.setLength(0);
						}
					}
					finally {
						if( reader != null )
							reader.close();
					}
				}

				//output has one row per distinct input row ID
				_ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
			}
			finally {
				if( ostream != null )
					ostream.close();
			}
		}
		catch(Exception ex) {
			throw new RuntimeException( "Unable to execute external function.", ex );
		}
	}
}
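
/*
 * Usage sketch (assumptions: the DML function name "remEmptyRows" and the
 * script wiring below are illustrative, not taken from this file; external
 * function declaration follows the standard SystemML DML syntax):
 *
 *   remEmptyRows = externalFunction(Matrix[Double] A) return (Matrix[Double] B)
 *       implemented in (classname="org.apache.sysml.udf.lib.RemoveEmptyRows");
 *   B = remEmptyRows(A);
 *
 * Since this class is deprecated, the builtin is the recommended replacement:
 *
 *   B = removeEmpty(target=A, margin="rows");
 */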