/* $Id$ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.connectorcommon.fuzzyml; import org.apache.manifoldcf.core.interfaces.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.io.*; /** This class represents the parse state of the BOM (byte order mark) parser. * The byte order mark parser looks for a byte order mark at the start of a byte sequence, * and based on whether it finds it or not, and what it finds, selects a preliminary character encoding. * Once a preliminary character encoding is determined, an EncodingAccepter is notified, * and further bytes are sent to a provided ByteReceiver. */ public class BOMEncodingDetector extends SingleByteReceiver implements EncodingDetector { protected String encoding = null; protected final ByteReceiver overflowByteReceiver; protected ByteBuffer replayBuffer = new ByteBuffer(); protected final static int BOM_NOTHINGYET = 0; protected final static int BOM_SEEN_EF = 1; protected final static int BOM_SEEN_FF = 2; protected final static int BOM_SEEN_FE = 3; protected final static int BOM_SEEN_ZERO = 4; protected final static int BOM_SEEN_EFBB = 5; protected final static int BOM_SEEN_FFFE = 6; protected final static int BOM_SEEN_0000 = 7; protected final static int BOM_SEEN_FFFE00 = 8; protected final static int BOM_SEEN_0000FE = 9; protected int currentState = BOM_NOTHINGYET; /** Constructor. *@param overflowByteReceiver Pass in the receiver of all overflow bytes. * If no receiver is passed in, the detector will stop as soon as the * BOM is either seen, or not seen. */ public BOMEncodingDetector(ByteReceiver overflowByteReceiver) { super(8); this.overflowByteReceiver = overflowByteReceiver; } /** Set initial encoding. */ @Override public void setEncoding(String encoding) { this.encoding = encoding; } /** Retrieve final encoding determination. */ @Override public String getEncoding() { return encoding; } /** Receive a byte. */ @Override public boolean dealWithByte(byte b) throws ManifoldCFException { replayBuffer.appendByte(b); int theByte = 0xff & (int)b; switch (currentState) { case BOM_NOTHINGYET: if (theByte == 0xef) currentState = BOM_SEEN_EF; else if (theByte == 0xff) currentState = BOM_SEEN_FF; else if (theByte == 0xfe) currentState = BOM_SEEN_FE; else if (theByte == 0x00) currentState = BOM_SEEN_ZERO; else return replay(); break; case BOM_SEEN_EF: if (theByte == 0xbb) currentState = BOM_SEEN_EFBB; else return replay(); break; case BOM_SEEN_FF: if (theByte == 0xfe) { // Either UTF-16LE or UTF-32LE mark(); currentState = BOM_SEEN_FFFE; } else return replay(); break; case BOM_SEEN_FE: if (theByte == 0xff) { // UTF-16BE detected mark(); return establishEncoding("UTF-16BE"); } else return replay(); case BOM_SEEN_ZERO: if (theByte == 0x00) currentState = BOM_SEEN_0000; else return replay(); break; case BOM_SEEN_EFBB: if (theByte == 0xbf) { // Encoding detected as utf-8 mark(); return establishEncoding(StandardCharsets.UTF_8.name()); } else return replay(); case BOM_SEEN_FFFE: if (theByte == 0x00) { currentState = BOM_SEEN_FFFE00; } else { // Encoding detected as UTF-16LE. Do NOT re-mark, we need this // character for later. return establishEncoding(StandardCharsets.UTF_16LE.name()); } break; case BOM_SEEN_0000: if (theByte == 0xfe) currentState = BOM_SEEN_0000FE; else return replay(); break; case BOM_SEEN_FFFE00: if (theByte == 0x00) { mark(); return establishEncoding("UTF-32LE"); } else { // Leave mark alone. return establishEncoding(StandardCharsets.UTF_16LE.name()); } case BOM_SEEN_0000FE: if (theByte == 0xff) { mark(); return establishEncoding("UTF-32BE"); } else return replay(); default: throw new ManifoldCFException("Unknown state: "+currentState); } return false; } /** Establish the provided encoding, and send the rest to the child, if any. */ protected boolean establishEncoding(String encoding) throws ManifoldCFException { setEncoding(encoding); return true; } /** Set a "mark". */ protected void mark() { replayBuffer.clear(); } /** Establish NO encoding, and replay from the current saved point to the child, if any. */ protected boolean replay() throws ManifoldCFException { return true; } /** Send stream from current point onward with the current encoding. */ protected boolean playFromCurrentPoint() throws ManifoldCFException { mark(); return true; } /** Deal with the remainder of the input. * This is called only when dealWithByte() returns true. *@param buffer is the buffer of characters that should come first. *@param offset is the offset within the buffer of the first character. *@param len is the number of characters in the buffer. *@param inputStream is the stream that should come after the characters in the buffer. *@return true to abort, false if the end of the stream has been reached. */ @Override protected boolean dealWithRemainder(byte[] buffer, int offset, int len, InputStream inputStream) throws IOException, ManifoldCFException { if (overflowByteReceiver == null) return super.dealWithRemainder(buffer,offset,len,inputStream); // Create a wrapped input stream with all the missing bytes while (len > 0) { replayBuffer.appendByte(buffer[offset++]); len--; } return overflowByteReceiver.dealWithBytes(new PrefixedInputStream(replayBuffer,inputStream)); } }