// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.common;
import java.util.ArrayList;
import org.apache.hadoop.io.Text;
/**
 * Extracts a fixed, pre-selected set of delimiter-separated fields from
 * {@link Text} records.
 *
 * <p>An instance is configured once with a delimiter and the zero-based
 * indices of the fields of interest.  Each call to {@link #loadRecord(Text)}
 * scans one record and overwrites the previously extracted values, which can
 * then be read with {@link #getField(int)} and {@link #getFieldPos(int)}.</p>
 */
public class CutText
{
    /**
     * Thrown when a record cannot be scanned: either a requested field is
     * missing or a field's bytes cannot be decoded.  The offending record is
     * appended to the exception message.
     */
    public static class FormatException extends Exception {
        private static final long serialVersionUID = 1L;

        /**
         * @param msg    description of the problem.
         * @param record the record that could not be scanned; its string form
         *               is appended to the message.
         */
        public FormatException(String msg, Text record)
        {
            super(msg + " Record: " + record.toString());
        }
    }

    private final String delim;
    // Length of the delimiter in bytes once UTF-8 encoded.  Record positions
    // are byte offsets, so this (not the char count) is the amount to skip.
    private final int delimByteLength;
    private final int[] columns;
    private final String[] extractedFields;
    private final int[] extractedFieldPositions;

    /**
     * @param delimiter delimiter string that separates fields within the records to be scanned.
     *                  Must not be empty.
     * @param cols      the sorted, zero-based indices of the columns to be extracted from the
     *                  records.  At least one index is required and duplicates are not allowed.
     * @throws IllegalArgumentException if the delimiter is empty, if no columns are given, or
     *                                  if the columns are not strictly increasing.
     */
    public CutText(String delimiter, int... cols)
    {
        if (delimiter.length() == 0)
            throw new IllegalArgumentException("empty string is an invalid delimiter");

        // private copy, so later changes to the caller's array cannot affect us
        final int[] requested = cols.clone();
        if (requested.length == 0)
            throw new IllegalArgumentException("no columns specified");

        // ensure the columns are in sorted order
        for (int i = 1; i < requested.length; ++i)
        {
            if (requested[i - 1] >= requested[i])
                throw new IllegalArgumentException("specified columns must be in sorted order and must not contain duplicates");
        }

        delim = delimiter;
        delimByteLength = delimiter.getBytes(java.nio.charset.Charset.forName("UTF-8")).length;
        columns = requested;

        // initialize so that trying to access the fields before
        // calling loadRecord results in obviously bad stuff
        extractedFields = new String[requested.length]; // elements start out null
        extractedFieldPositions = new int[requested.length];
        for (int i = 0; i < extractedFieldPositions.length; ++i)
            extractedFieldPositions[i] = -1;
    }

    /**
     * Scans <code>record</code> and extracts the requested fields, replacing
     * the values extracted from any previously loaded record.
     *
     * <p>Scanning stops as soon as all requested fields have been found or the
     * end of the record is reached.  A field that would begin at or after the
     * end of the record (an empty record, or an empty last field following a
     * trailing delimiter) is not extracted and is therefore reported as
     * missing.</p>
     *
     * <p>If this method throws, the extracted values may be a mix of fields
     * from this record and from earlier ones and should not be used.</p>
     *
     * @param record the record to scan.
     * @throws FormatException if a requested field is not present in the record or if a
     *                         requested field cannot be decoded.
     */
    public void loadRecord(Text record) throws FormatException
    {
        final int recordLength = record.getLength();
        int pos = 0; // the byte position within the record
        int fieldno = 0; // the field index within the record
        int colno = 0; // the index within the list of requested fields (columns)

        try
        {
            while (pos < recordLength && colno < columns.length) // iterate over each field
            {
                int endpos = record.find(delim, pos); // the field's end position
                if (endpos < 0)
                    endpos = recordLength; // last field: it runs to the end of the record

                if (columns[colno] == fieldno) // if we're at a requested field
                {
                    extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                    extractedFieldPositions[colno] = pos;
                    colno += 1; // advance column
                }

                // The next field starts right after the delimiter.  Skip the delimiter's
                // full encoded length so that multi-byte delimiters are not mistaken for
                // the beginning of the next field.
                pos = endpos + delimByteLength;
                fieldno += 1;
            }
        }
        catch (java.nio.charset.CharacterCodingException e) {
            FormatException wrapped = new FormatException("character coding exception. Message: " + e.getMessage(), record);
            wrapped.initCause(e); // keep the original stack trace available to callers
            throw wrapped;
        }

        if (colno < columns.length)
        {
            // report the index of the field within the record, not its rank among the requested columns
            throw new FormatException("Missing field(s) in record. Field " + columns[colno] + " (zero-based) not found.", record);
        }
    }

    /**
     * @param i index within the list of requested columns (not the field's index in the record).
     * @return the text of the i-th requested field from the last loaded record, or
     *         <code>null</code> if no value has been extracted for it yet.
     */
    public String getField(int i)
    {
        return extractedFields[i];
    }

    /**
     * @return the number of columns this object was configured to extract.
     */
    public int getNumFields()
    {
        return extractedFields.length;
    }

    /**
     * @param i index within the list of requested columns (not the field's index in the record).
     * @return the byte offset within the last loaded record at which the i-th requested field
     *         starts, or -1 if no value has been extracted for it yet.
     */
    public int getFieldPos(int i)
    {
        return extractedFieldPositions[i];
    }
}