/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.alibaba.jstorm.hdfs.spout; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.Collections; import java.util.List; import java.util.Map; // Todo: Track file offsets instead of line number public class TextFileReader extends AbstractFileReader { public static final String[] defaultFields = {"line"}; public static final String CHARSET = "hdfsspout.reader.charset"; public static final String BUFFER_SIZE = "hdfsspout.reader.buffer.bytes"; private static final int DEFAULT_BUFF_SIZE = 4096; private BufferedReader reader; private final Logger LOG = LoggerFactory.getLogger(TextFileReader.class); private TextFileReader.Offset offset; public TextFileReader(FileSystem fs, Path file, Map conf) throws IOException { this(fs, file, conf, new TextFileReader.Offset(0,0) ); } public TextFileReader(FileSystem fs, Path file, Map conf, String startOffset) throws IOException { this(fs, file, conf, new TextFileReader.Offset(startOffset) ); } private TextFileReader(FileSystem fs, Path file, Map conf, TextFileReader.Offset startOffset) throws IOException { super(fs, file); offset = startOffset; FSDataInputStream in = fs.open(file); String charSet = (conf==null || !conf.containsKey(CHARSET) ) ? "UTF-8" : conf.get(CHARSET).toString(); int buffSz = (conf==null || !conf.containsKey(BUFFER_SIZE) ) ? DEFAULT_BUFF_SIZE : Integer.parseInt( conf.get(BUFFER_SIZE).toString() ); reader = new BufferedReader(new InputStreamReader(in, charSet), buffSz); if(offset.charOffset >0) { reader.skip(offset.charOffset); } } public Offset getFileOffset() { return offset.clone(); } public List<Object> next() throws IOException, ParseException { String line = readLineAndTrackOffset(reader); if(line!=null) { return Collections.singletonList((Object) line); } return null; } private String readLineAndTrackOffset(BufferedReader reader) throws IOException { StringBuffer sb = new StringBuffer(1000); long before = offset.charOffset; int ch; while( (ch = reader.read()) != -1 ) { ++offset.charOffset; if (ch == '\n') { ++offset.lineNumber; return sb.toString(); } else if( ch != '\r') { sb.append((char)ch); } } if(before==offset.charOffset) { // reached EOF, didnt read anything return null; } return sb.toString(); } @Override public void close() { try { reader.close(); } catch (IOException e) { LOG.warn("Ignoring error when closing file " + getFilePath(), e); } } public static class Offset implements FileOffset { long charOffset; long lineNumber; public Offset(long byteOffset, long lineNumber) { this.charOffset = byteOffset; this.lineNumber = lineNumber; } public Offset(String offset) { if(offset==null) { throw new IllegalArgumentException("offset cannot be null"); } try { if(offset.equalsIgnoreCase("0")) { this.charOffset = 0; this.lineNumber = 0; } else { String[] parts = offset.split(":"); this.charOffset = Long.parseLong(parts[0].split("=")[1]); this.lineNumber = Long.parseLong(parts[1].split("=")[1]); } } catch (Exception e) { throw new IllegalArgumentException("'" + offset + "' cannot be interpreted. It is not in expected format for TextFileReader." + " Format e.g. {char=123:line=5}"); } } @Override public String toString() { return '{' + "char=" + charOffset + ":line=" + lineNumber + ":}"; } @Override public boolean isNextOffset(FileOffset rhs) { if(rhs instanceof Offset) { Offset other = ((Offset) rhs); return other.charOffset > charOffset && other.lineNumber == lineNumber+1; } return false; } @Override public int compareTo(FileOffset o) { Offset rhs = ((Offset)o); if(lineNumber < rhs.lineNumber) { return -1; } if(lineNumber == rhs.lineNumber) { return 0; } return 1; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Offset)) { return false; } Offset that = (Offset) o; if (charOffset != that.charOffset) return false; return lineNumber == that.lineNumber; } @Override public int hashCode() { int result = (int) (charOffset ^ (charOffset >>> 32)); result = 31 * result + (int) (lineNumber ^ (lineNumber >>> 32)); return result; } @Override public Offset clone() { return new Offset(charOffset, lineNumber); } } //class Offset }