/*
* Copyright 2014 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
/**
* This class is only for the readUntilEndStream method, to prevent a
* final CR LF or LF (but not a final CR!) from being written to the output,
* unless the beginning of the stream is assumed to be ASCII.
* Only the 3-param write() method is implemented. This solves
* PDFBOX-2079 and PDFBOX-2120 and avoids making readUntilEndStream()
* even more complex than it already is.
*
* @author Tilman Hausherr
*/
class EndstreamOutputStream extends BufferedOutputStream
{
//TODO: replace this class with a PullBackOutputStream class if there ever is one
private boolean hasCR = false;
private boolean hasLF = false;
private int pos = 0;
private boolean mustFilter = true;
EndstreamOutputStream(OutputStream out)
{
super(out);
}
/**
* Write CR and/or LF that were kept, then writes len bytes from the
* specified byte array starting at offset off to this output stream,
* except trailing CR, CR LF, or LF. No filtering will be done for the
* entire stream if the beginning is assumed to be ASCII.
* @param b byte array.
* @param off offset.
* @param len length of segment to write.
* @throws IOException
*/
@Override
public void write(byte[] b, int off, int len) throws IOException
{
if (pos == 0 && len > 10)
{
// PDFBOX-2120 Don't filter if ASCII, i.e. keep a final CR LF or LF
mustFilter = false;
for (int i = 0; i < 10; ++i)
{
// Heuristic approach, taken from PDFStreamParser, PDFBOX-1164
if ((b[i] < 0x09) || ((b[i] > 0x0a) && (b[i] < 0x20) && (b[i] != 0x0d)))
{
// control character or > 0x7f -> we have binary data
mustFilter = true;
break;
}
}
}
if (mustFilter)
{
// first write what we kept last time
if (hasCR)
{
// previous buffer ended with CR
hasCR = false;
if (!hasLF && len == 1 && b[off] == '\n')
{
// actual buffer contains only LF so it will be the last one
// => we're done
// reset hasCR done too to avoid CR getting written in the flush
return;
}
super.write('\r');
}
if (hasLF)
{
super.write('\n');
hasLF = false;
}
// don't write CR, LF, or CR LF if at the end of the buffer
if (len > 0)
{
if (b[off + len - 1] == '\r')
{
hasCR = true;
--len;
}
else if (b[off + len - 1] == '\n')
{
hasLF = true;
--len;
if (len > 0 && b[off + len - 1] == '\r')
{
hasCR = true;
--len;
}
}
}
}
super.write(b, off, len);
pos += len;
}
/**
* write out a single CR if one was kept. Don't write kept CR LF or LF,
* and then call the base method to flush.
*
* @throws IOException
*/
@Override
public void flush() throws IOException
{
// if there is only a CR and no LF, write it
if (hasCR && !hasLF)
{
super.write('\r');
++pos;
}
hasCR = false;
hasLF = false;
super.flush();
}
}