/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix.mapred;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.BlockRow;
import org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes;
import org.apache.sysml.runtime.util.UtilFunctions;
/**
 * Mapper of the CSV reblock job: every delimited text line is split into cells,
 * and for each configured {@link CSVReblockInstruction} the cells are emitted as
 * one-row column blocks ({@link BlockRow}) keyed by {@link TaggedFirstSecondIndexes}.
 *
 * <p>The global row index of a line is derived from a per-mapper row offset
 * (read in {@link #configure(JobConf)} from the row-id sequence file) plus the
 * number of lines this mapper has already processed.
 */
public class CSVReblockMapper extends MapperBase implements Mapper<LongWritable, Text, TaggedFirstSecondIndexes, BlockRow>
{
    /** Row offset looked up for the first key seen by this mapper; added to {@link #num}. */
    private long rowOffset=0;
    /** True until the first record has been seen, i.e. until {@link #rowOffset} is resolved. */
    private boolean first=true;
    /** Number of lines processed so far by this mapper (a skipped header line is not counted). */
    private long num=0;
    /** File offset to row count, restricted to this mapper's matrix and input file. */
    private final HashMap<Long, Long> offsetMap=new HashMap<Long, Long>();
    /** Cell delimiter, taken from the first reblock instruction in {@link #configure(JobConf)}. */
    private String _delim=" ";
    /** Whether the input declares a header line (taken from the first reblock instruction). */
    private boolean ignoreFirstLine=false;
    /** Whether this mapper's input file is the one registered as holding the header. */
    private boolean headerFile=false;
    /** Reusable output buffer, allocated once in {@link #configure(JobConf)}. */
    private IndexedBlockRow idxRow = null;

    /**
     * Reusable pair of output key (tagged block indexes) and output value (block row),
     * so that no new objects need to be allocated per emitted block.
     */
    public static class IndexedBlockRow
    {
        private final BlockRow row;
        private final TaggedFirstSecondIndexes outIndexes;

        public IndexedBlockRow() {
            row = new BlockRow();
            row.data = new MatrixBlock();
            outIndexes=new TaggedFirstSecondIndexes();
        }

        /** @return the reusable output value */
        public BlockRow getRow() { return row; }

        /** @return the reusable output key */
        public TaggedFirstSecondIndexes getIndexes() { return outIndexes; }
    }

    /**
     * Cuts one parsed line into column blocks of width {@code bclen} and emits each
     * of them; a narrower trailing block is emitted if {@code cells.length} is not
     * a multiple of {@code bclen}. Column block indexes start at 1.
     *
     * @param row        reusable key/value buffer; overwritten for every emitted block
     * @param cells      the cells of the current line
     * @param rowOffset  row offset of the current mapper
     * @param num        number of lines already processed before this one
     * @param outTag     tag set on every emitted key
     * @param brlen      number of rows per block
     * @param bclen      number of columns per block
     * @param fill       whether empty cells may be replaced by {@code fillValue}
     * @param fillValue  value stored for empty cells
     * @param out        collector receiving the blocks
     * @return the passed-in {@code row} buffer
     * @throws IOException if the collector fails; empty cells are checked through
     *         {@code IOUtilFunctions.checkAndRaiseErrorCSVEmptyField}, which is expected
     *         to reject them when {@code fill} is false (confirm against its implementation)
     */
    public static IndexedBlockRow processRow(IndexedBlockRow row, String[] cells, long rowOffset, long num, byte outTag, int brlen, int bclen, boolean fill, double fillValue, OutputCollector<TaggedFirstSecondIndexes, BlockRow> out) throws IOException
    {
        final TaggedFirstSecondIndexes indexes = row.getIndexes();
        final BlockRow blockRow = row.getRow();

        // 1-based global row position of the current line
        final long globalRow = rowOffset+num+1;
        final long rowIndex = UtilFunctions.computeBlockIndex(globalRow, brlen);
        indexes.setTag(outTag);
        blockRow.indexInBlock = UtilFunctions.computeCellInBlock(globalRow, brlen);

        // full-width column blocks
        int start=0;
        long col=0;
        final int numFullBlocks = cells.length/bclen;
        for(; col<numFullBlocks; col++)
        {
            indexes.setIndexes(rowIndex, col+1);
            fillBlock(blockRow, cells, start, bclen, fill, fillValue);
            out.collect(indexes, blockRow);
            start+=bclen;
        }

        // trailing partial column block; the indexes are advanced even if there is none,
        // which leaves the reusable key in the same state as before this refactoring
        indexes.setIndexes(rowIndex, col+1);
        final int lastBclen = cells.length%bclen;
        if(lastBclen!=0)
        {
            fillBlock(blockRow, cells, start, lastBclen, fill, fillValue);
            out.collect(indexes, blockRow);
        }

        return row;
    }

    /**
     * Resets the block of {@code target} to a single row of {@code len} columns and fills
     * it with the parsed values of {@code cells[start .. start+len-1]}. Null or empty
     * cells go through the shared CSV empty-field check and are stored as {@code fillValue}.
     */
    private static void fillBlock(BlockRow target, String[] cells, int start, int len, boolean fill, double fillValue) throws IOException
    {
        target.data.reset(1, len);
        for(int k=0; k<len; k++)
        {
            final String cell = cells[start+k];
            if(cell == null || cell.isEmpty())
            {
                // same check for full and partial blocks
                IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(null, fill, true);
                target.data.appendValue(0, k, fillValue);
            }
            else
            {
                target.data.appendValue(0, k, UtilFunctions.parseToDouble(cell));
            }
        }
    }

    /**
     * Processes one text line: resolves the row offset on the first record, skips the
     * header line where applicable, and emits the line's blocks for every reblock
     * instruction.
     *
     * @param key      key of the input record; the key of the first record is looked up
     *                 in the offset map, and a key of 0 identifies the header line
     * @param value    the text line
     * @param out      collector receiving the blocks
     * @param reporter unused
     * @throws IOException if no row offset is registered for the first key, or if
     *         emitting a block fails
     */
    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<TaggedFirstSecondIndexes, BlockRow> out, Reporter reporter)
        throws IOException
    {
        if(first) {
            // explicit lookup instead of auto-unboxing, which would fail with a bare
            // NullPointerException if the offset is missing
            final Long offset = offsetMap.get(key.get());
            if(offset == null)
                throw new IOException("No row offset found for input key " + key.get() + ".");
            rowOffset = offset;
            first=false;
        }

        // skip the header line, which only exists at position 0 of the header file
        if(key.get()==0 && headerFile && ignoreFirstLine)
            return;

        String[] cells = IOUtilFunctions.split( value.toString(), _delim );

        for(int i=0; i<representativeMatrixes.size(); i++)
            for(CSVReblockInstruction ins: csv_reblock_instructions.get(i))
            {
                idxRow = processRow(idxRow, cells, rowOffset, num, ins.output, ins.brlen, ins.bclen, ins.fill, ins.fillValue, out);
            }

        num++;
    }

    /**
     * Initializes the mapper: determines whether the current input file is the header
     * file, loads the offsets recorded for this file from the row-id sequence file,
     * reads delimiter and header flag from the first reblock instruction, and allocates
     * the reusable output buffer.
     *
     * @param job the job configuration
     * @throws RuntimeException wrapping any {@link IOException} raised while accessing
     *         the file system or the row-id file
     */
    @Override
    @SuppressWarnings("deprecation")
    public void configure(JobConf job)
    {
        super.configure(job);

        //load the offset mapping
        byte matrixIndex=representativeMatrixes.get(0);
        try
        {
            FileSystem fs = FileSystem.get(job);
            Path thisPath=new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE)).makeQualified(fs);
            String filename=thisPath.toString();

            // this mapper reads the header file if its input equals the file registered for the matrix
            Path headerPath=new Path(job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex]).makeQualified(fs);
            if(headerPath.toString().equals(filename))
                headerFile=true;

            ByteWritable key=new ByteWritable();
            OffsetCount value=new OffsetCount();
            Path p=new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, p, job);
                // keep only the entries of this matrix and this input file
                while (reader.next(key, value)) {
                    if(key.get()==matrixIndex && filename.equals(value.filename))
                        offsetMap.put(value.fileOffset, value.count);
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }

        CSVReblockInstruction ins=csv_reblock_instructions.get(0).get(0);
        _delim = ins.delim;
        ignoreFirstLine=ins.hasHeader;

        idxRow = new IndexedBlockRow();

        //get the maximum number of columns per block over all instructions
        int maxBclen=0;
        for(ArrayList<CSVReblockInstruction> insv: csv_reblock_instructions)
            for(CSVReblockInstruction in: insv)
            {
                if(maxBclen<in.bclen)
                    maxBclen=in.bclen;
            }

        //always dense since common csv usecase
        idxRow.getRow().data.reset(1, maxBclen, false);
    }

    /**
     * Intentionally empty: all output is produced directly in
     * {@link #map(LongWritable, Text, OutputCollector, Reporter)}.
     */
    @Override
    protected void specialOperationsForActualMap(int index,
            OutputCollector<Writable, Writable> out, Reporter reporter)
        throws IOException
    {
        //do nothing
    }
}