UnicodeOffsetLengthReader.java example

Explorer
rascal-master
- src
  - org
    - rascalmpl
- test
  - org
    - rascalmpl
package org.rascalmpl.unicode;

import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;

/**
 * Wraps a reader so that it first skips a number (offset) of unicode code points and then
 * exposes at most a number (length) of unicode code points before it reports end-of-stream.
 * When length is -1, the reader reads until the end of the underlying stream.
 *
 * <p>Counting unicode characters is interesting because of the UTF-16 encoding Java has in
 * memory for unicode: one code point may occupy two {@code char}s (a surrogate pair). This
 * class counts every {@code char} that is not a high surrogate, so a surrogate pair is counted
 * once, at its low surrogate, and is never split by the offset or the length limit.
 */
public class UnicodeOffsetLengthReader extends FilterReader {
	/** Number of code points handed to the caller so far (high surrogates are not counted). */
	private int charsRead;
	/** Number of code points that still have to be skipped before the first real read. */
	private int offset;
	/** Maximum number of code points to expose, or -1 for "no limit". */
	private final int length;

	/**
	 * @param in     the reader to wrap
	 * @param offset number of code points to skip before the first character is returned
	 * @param len    maximum number of code points to return, or -1 to read until end-of-stream
	 */
	public UnicodeOffsetLengthReader(Reader in, int offset, int len) {
		super(in);
		this.offset = offset;
		this.length = len;
	}

	/**
	 * Consumes the code points that still have to be skipped from the underlying reader.
	 * Does nothing once the offset has been consumed, so it is cheap to call before every read.
	 */
	private void skipOffset() throws IOException {
		if (offset <= 0) {
			return;
		}

		char[] buf = new char[Math.min(offset, 8096)];

		while (offset > 0) {
			int res = in.read(buf, 0, Math.min(offset, buf.length));

			if (res == -1) {
				// underlying stream is shorter than the offset; nothing left to skip
				offset = 0;
				return;
			}

			// A high surrogate is only the first half of a code point, so it does not count
			// towards the offset; the low surrogate that follows it does.
			for (int i = 0; i < res; i++) {
				if (!Character.isHighSurrogate(buf[i])) {
					offset--;
				}
			}
		}
	}

	/** Tells whether a length limit is set and the allowed number of code points was delivered. */
	private boolean limitReached() {
		return length != -1 && charsRead >= length;
	}

	/**
	 * Reads a single {@code char}, honouring the offset and the length limit.
	 *
	 * @return the char read, or -1 at end-of-stream or when the length limit has been reached
	 */
	@Override
	public int read() throws IOException {
		skipOffset();

		if (limitReached()) {
			return -1;
		}

		int res = super.read();

		if (res != -1 && !Character.isHighSurrogate((char) res)) {
			charsRead++;
		}

		return res;
	}

	/**
	 * Reads up to {@code len} chars into {@code cbuf} starting at index {@code off}, honouring
	 * the offset and the length limit. The underlying reader may be asked for more chars than
	 * are finally reported; only the first returned-count chars at {@code cbuf[off..]} are
	 * part of the result.
	 *
	 * @return the number of chars that belong to the result, or -1 at end-of-stream or when
	 *         the length limit has been reached
	 */
	@Override
	public int read(char[] cbuf, int off, int len) throws IOException {
		skipOffset();

		if (limitReached()) {
			// we are at the end already
			return -1;
		}

		// just get what we can, we will cut the result below
		int res = super.read(cbuf, off, len);

		if (res <= 0) {
			// -1: end of the underlying stream; 0: unlikely corner case (e.g. len == 0)
			return res;
		}

		// The chars just read live at cbuf[off .. off + res - 1]; count code points there and
		// cut the result right after the char that completes the last allowed code point.
		for (int i = 0; i < res; i++) {
			if (!Character.isHighSurrogate(cbuf[off + i])) {
				charsRead++;

				if (limitReached()) {
					return i + 1;
				}
			}
		}

		return res;
	}
}