package eu.stratosphere.sopremo.pact; import static eu.stratosphere.sopremo.pact.IOConstants.ENCODING; import static eu.stratosphere.sopremo.pact.IOConstants.FIELD_DELIMITER; import java.io.IOException; import java.io.Reader; import java.net.URI; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.ArrayList; import java.util.Deque; import java.util.LinkedList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import eu.stratosphere.nephele.configuration.Configuration; import eu.stratosphere.nephele.fs.FSDataInputStream; import eu.stratosphere.nephele.fs.FileInputSplit; import eu.stratosphere.nephele.fs.FileStatus; import eu.stratosphere.nephele.fs.FileSystem; import eu.stratosphere.nephele.fs.LineReader; import eu.stratosphere.nephele.fs.Path; import eu.stratosphere.pact.common.io.FileInputFormat; import eu.stratosphere.pact.common.io.statistics.BaseStatistics; import eu.stratosphere.pact.common.type.PactRecord; import eu.stratosphere.sopremo.EvaluationContext; import eu.stratosphere.sopremo.serialization.Schema; import eu.stratosphere.sopremo.type.IObjectNode; import eu.stratosphere.sopremo.type.ObjectNode; import eu.stratosphere.sopremo.type.TextNode; /** * InputFormat that interpretes the input data as a csv representation. */ public class CsvInputFormat extends FileInputFormat { /** * @author Arvid Heise */ public static class CountingReader extends Reader { private long start = 0, relativePos = 0, limit = 0; private boolean reachedLimit = false; private ByteBuffer streamBuffer = ByteBuffer.allocate(100); private CharBuffer charBuffer = CharBuffer.allocate(100); private FSDataInputStream stream; protected CharsetDecoder decoder; public CountingReader(FSDataInputStream stream, Charset charset, long start, long limit) { this.stream = stream; this.decoder = charset.newDecoder(); this.start = start; this.limit = limit; // mark as empty this.charBuffer.limit(0); } /** * Returns the pos. * * @return the pos */ public long getRelativePos() { return this.relativePos; } public boolean reachedLimit() { return this.reachedLimit; } public void seek(long absolutePos) throws IOException { this.relativePos = absolutePos - this.start; this.stream.seek(absolutePos); // mark as empty this.charBuffer.limit(0); } /* * (non-Javadoc) * @see java.io.Reader#read(char[], int, int) */ @Override public int read(char[] cbuf, int off, int len) throws IOException { int toRead = len - off; while (toRead > 0) { fillCharBufferIfEmpty(); int currentReadCount = Math.min(toRead, this.charBuffer.length()); this.charBuffer.get(cbuf, off, currentReadCount); toRead -= currentReadCount; } return len - toRead; } private void fillCharBufferIfEmpty() throws IOException { if (this.charBuffer.remaining() == 0) { final int maxLen = this.streamBuffer.capacity(); this.streamBuffer.clear(); if (this.reachedLimit) { final int read = this.stream.read(this.streamBuffer.array(), 0, maxLen); this.streamBuffer.limit(read); } else { final int read = this.stream.read(this.streamBuffer.array(), 0, (int) Math.min(maxLen, this.limit - this.relativePos)); this.relativePos += read; this.streamBuffer.limit(read); this.reachedLimit = this.limit <= this.relativePos; } this.charBuffer.clear(); this.decoder.decode(this.streamBuffer, this.charBuffer, false); this.charBuffer.flip(); } } /* * (non-Javadoc) * @see java.io.Reader#read() */ @Override public int read() throws IOException { fillCharBufferIfEmpty(); if (this.charBuffer.remaining() == 0) return -1; return this.charBuffer.get(); } /* * (non-Javadoc) * @see java.io.Reader#close() */ @Override public void close() throws IOException { this.stream.close(); } } /** * The default number of sample lines to consider when calculating the line width. */ private static final int DEFAULT_NUM_SAMPLES = 10; /** * The configuration key to set the number of samples to take for the statistics. */ public static final String NUM_STATISTICS_SAMPLES = "csv-format.numSamples"; public static final String COLUMN_NAMES = "csv-format.columns"; public static final String USE_QUOTATION = "csv-format.quotation"; /** * The log. */ private static final Log LOG = LogFactory.getLog(CsvInputFormat.class); private static final char DEFAULT_DELIMITER = ','; private char fieldDelimiter = ','; private Quotation quotation = Quotation.AUTO; private boolean usesQuotation = true; private String[] keyNames; private Schema targetSchema; private Charset encoding; private int numLineSamples; public enum Quotation { ON, OFF, AUTO; } private enum State { TOP_LEVEL, QUOTED, ESCAPED; } private Deque<State> state = new LinkedList<State>(); private boolean endReached; private CountingReader reader; private EvaluationContext context; @Override public void configure(final Configuration parameters) { super.configure(parameters); this.context = SopremoUtil.deserialize(parameters, SopremoUtil.CONTEXT, EvaluationContext.class); this.targetSchema = this.context.getOutputSchema(0); final Boolean useQuotation = SopremoUtil.deserialize(parameters, USE_QUOTATION, Boolean.class); this.quotation = useQuotation == null ? Quotation.AUTO : useQuotation ? Quotation.ON : Quotation.OFF; this.keyNames = SopremoUtil.deserialize(parameters, COLUMN_NAMES, String[].class); // this.targetSchema = SopremoUtil.deserialize(parameters, SCHEMA, Schema.class); this.encoding = Charset.forName(parameters.getString(ENCODING, "utf-8")); final Character delimiter = SopremoUtil.deserialize(parameters, FIELD_DELIMITER, Character.class); this.fieldDelimiter = delimiter != null ? delimiter : DEFAULT_DELIMITER; this.numLineSamples = parameters.getInteger(NUM_STATISTICS_SAMPLES, DEFAULT_NUM_SAMPLES); } /* * (non-Javadoc) * @see eu.stratosphere.pact.common.io.DelimitedInputFormat#open(eu.stratosphere.nephele.fs.FileInputSplit) */ @Override public void open(FileInputSplit split) throws IOException { super.open(split); this.setState(State.TOP_LEVEL); this.endReached = false; this.reader = new CountingReader(this.stream, this.encoding, this.splitStart, this.splitLength); this.usesQuotation = this.quotation == Quotation.ON; if (this.quotation == Quotation.AUTO) { // very simple heuristic for (int index = 0, ch; !this.usesQuotation && index < 1000 && (ch = this.reader.read()) != -1; index++) this.usesQuotation = ch == '"'; this.reader.seek(this.splitStart); } if (this.keyNames == null) { if (split.getSplitNumber() > 0) this.reader.seek(0); this.keyNames = this.extractKeyNames(); } // skip to beginning of the first record if (split.getSplitNumber() > 0) { if (this.usesQuotation) { // TODO: how to detect if where are inside a quotation? this.reader.seek(this.splitStart - 1); int ch; while ((ch = this.reader.read()) != -1 && ch != '\n') ; this.endReached = ch == -1; } else { this.reader.seek(this.splitStart - 1); int ch; while ((ch = this.reader.read()) != -1 && ch != '\n') ; this.endReached = ch == -1; } } } /** * Reads the key names from the first line of the first split. */ private String[] extractKeyNames() throws IOException { List<String> keyNames = new ArrayList<String>(); int lastCharacter; do { lastCharacter = this.fillBuilderWithNextField(); keyNames.add(this.builder.toString()); this.builder.setLength(0); } while (lastCharacter != -1 && lastCharacter != '\n'); return keyNames.toArray(new String[keyNames.size()]); } /* * (non-Javadoc) * @see eu.stratosphere.pact.common.generic.io.InputFormat#reachedEnd() */ @Override public boolean reachedEnd() throws IOException { return this.endReached; } private final IObjectNode objectNode = new ObjectNode(); private final StringBuilder builder = new StringBuilder(); private int fillBuilderWithNextField() throws IOException { int character = 0; readLoop: while ((character = this.reader.read()) != -1) { final char ch = (char) character; switch (this.getCurrentState()) { case ESCAPED: this.builder.append(ch); this.revertToPreviousState(); break; case QUOTED: switch (ch) { case '"': this.revertToPreviousState(); break; case '\\': this.setState(State.ESCAPED); break; default: this.builder.append(ch); } break; case TOP_LEVEL: if (ch == this.fieldDelimiter) { this.builder.toString(); break readLoop; } else if (ch == '\n') { final int lastCharPos = this.builder.length() - 1; if (this.builder.charAt(lastCharPos) == '\r') this.builder.setLength(lastCharPos); break readLoop; } else if (this.usesQuotation && ch == '"') this.setState(State.QUOTED); else this.builder.append(ch); } } return character; } /* * (non-Javadoc) * @see eu.stratosphere.pact.common.generic.io.InputFormat#nextRecord(java.lang.Object) */ @Override public boolean nextRecord(PactRecord record) throws IOException { int lastCharacter, fieldIndex = 0; do { lastCharacter = this.fillBuilderWithNextField(); // ignore empty line if (lastCharacter == 0 && fieldIndex == 0 && this.builder.length() == 0) break; this.addToObject(fieldIndex++, this.builder.toString()); this.builder.setLength(0); } while (lastCharacter != -1 && lastCharacter != '\n'); if (this.objectNode.size() == 0) return false; this.targetSchema.jsonToRecord(this.objectNode, record, this.context); this.endReached = lastCharacter == -1 || this.reader.reachedLimit(); return true; } /** * @param escaped */ private void setState(State newState) { this.state.push(newState); } /** * @param fieldIndex * @param string */ private void addToObject(int fieldIndex, String string) { this.objectNode.put(this.keyNames[fieldIndex], TextNode.valueOf(string)); } /* * (non-Javadoc) * @see eu.stratosphere.pact.common.io.FileInputFormat#close() */ @Override public void close() throws IOException { this.revertToPreviousState(); this.reader.close(); super.close(); } /** * @return */ private State getCurrentState() { return this.state.peek(); } private State revertToPreviousState() { return this.state.pop(); } /* * (non-Javadoc) * @see eu.stratosphere.pact.common.generic.io.InputFormat#getStatistics(eu.stratosphere.pact.common.io.statistics. * BaseStatistics) */ @Override public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { // check the cache FileBaseStatistics stats = null; if (cachedStatistics != null && cachedStatistics instanceof FileBaseStatistics) stats = (FileBaseStatistics) cachedStatistics; else stats = new FileBaseStatistics(-1, BaseStatistics.UNKNOWN, BaseStatistics.UNKNOWN); try { final Path file = this.filePath; final URI uri = file.toUri(); // get the filesystem final FileSystem fs = FileSystem.get(uri); List<FileStatus> files = null; // get the file info and check whether the cached statistics are still // valid. { FileStatus status = fs.getFileStatus(file); if (status.isDir()) { FileStatus[] fss = fs.listStatus(file); files = new ArrayList<FileStatus>(fss.length); boolean unmodified = true; for (FileStatus s : fss) if (!s.isDir()) { files.add(s); if (s.getModificationTime() > stats.getLastModificationTime()) { stats.setLastModificationTime(s.getModificationTime()); unmodified = false; } } if (unmodified) return stats; } else { // check if the statistics are up to date long modTime = status.getModificationTime(); if (stats.getLastModificationTime() == modTime) return stats; stats.setLastModificationTime(modTime); files = new ArrayList<FileStatus>(1); files.add(status); } } long fileSize = 0; // calculate the whole length for (FileStatus s : files) fileSize += s.getLen(); // sanity check if (fileSize <= 0) { fileSize = BaseStatistics.UNKNOWN; return stats; } stats.setTotalInputSize(fileSize); // make the samples small for very small files int numSamples = Math.min(this.numLineSamples, (int) (fileSize / 1024)); if (numSamples < 2) numSamples = 2; long offset = 0; long bytes = 0; // one byte for the line-break long stepSize = fileSize / numSamples; int fileNum = 0; int samplesTaken = 0; // take the samples for (int sampleNum = 0; sampleNum < numSamples && fileNum < files.size(); sampleNum++) { FileStatus currentFile = files.get(fileNum); FSDataInputStream inStream = null; try { inStream = fs.open(currentFile.getPath()); LineReader lineReader = new LineReader(inStream, offset, currentFile.getLen() - offset, 1024); byte[] line = lineReader.readLine(); lineReader.close(); if (line != null && line.length > 0) { samplesTaken++; bytes += line.length + 1; // one for the linebreak } } finally { // make a best effort to close if (inStream != null) try { inStream.close(); } catch (Throwable t) { } } offset += stepSize; // skip to the next file, if necessary while (fileNum < files.size() && offset >= (currentFile = files.get(fileNum)).getLen()) { offset -= currentFile.getLen(); fileNum++; } } stats.setAverageRecordWidth(bytes / (float) samplesTaken); } catch (IOException ioex) { if (LOG.isWarnEnabled()) LOG.warn("Could not determine complete statistics for file '" + this.filePath + "' due to an io error: " + ioex.getMessage()); } catch (Throwable t) { if (LOG.isErrorEnabled()) LOG.error("Unexpected problen while getting the file statistics for file '" + this.filePath + "': " + t.getMessage(), t); } return stats; } }