package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.SequenceInputStream; import java.util.Enumeration; /* * This file created on Jan 21, 2005 by Martin Haye */ /** * There's a very nasty bug in the Apache Crimson XML parser. If a ']' * character appears at the very end of its 8193-byte buffer and is * preceded by a '>' then it crashes. This stream works around it by * inserting spaces just before ']' if preceded by a '>'. */ public class CrimsonBugWorkaround extends SequenceInputStream { InputStream in; /** Construct a stream that filters the given one */ public CrimsonBugWorkaround(InputStream in) { super(new BlockEnum(in)); } // constructor /** Presents the input stream as a series of blocks of data */ private static class BlockEnum implements Enumeration { static final int BLOCK_SIZE = 32 * 1024; InputStream in; byte[] inBuf = new byte[BLOCK_SIZE]; int inBufLen; byte[] outBuf = new byte[BLOCK_SIZE * 2]; int outBufLen; boolean eof = false; byte prev = 'a'; BlockEnum(InputStream in) { this.in = in; } /** Tells whether there are more blocks to read */ public boolean hasMoreElements() { return !eof; } /** Gets an InputStream for the next block of data */ public Object nextElement() { try { inBufLen = 0; while (!eof && inBufLen < BLOCK_SIZE) { int nRead = in.read(inBuf, inBufLen, BLOCK_SIZE - inBufLen); if (nRead < 0) { eof = true; break; } inBufLen += nRead; assert inBufLen <= BLOCK_SIZE; } fixBuf(); return new ByteArrayInputStream(outBuf, 0, outBufLen); } catch (IOException e) { throw new RuntimeException(e); } } // nextElement() /** * Scan through the input buffer, looking for the suspicious pair of * characters and sticking a space between them. The result is in * the output buffer. */ private void fixBuf() { outBufLen = 0; for (int src = 0; src < inBufLen; src++) { byte cur = inBuf[src]; if (cur == ']' && prev == '>') outBuf[outBufLen++] = ' '; outBuf[outBufLen++] = cur; prev = cur; } // for src } // fixBuf() } // class BlockEnum } // class CrimsonBugWorkaround