/*
 * #!
 * Ontopia Engine
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.utils.ctm;

import java.io.IOException;
import java.io.PushbackInputStream;

import net.ontopia.topicmaps.impl.utils.EncodingSnifferIF;
import net.ontopia.topicmaps.xml.InvalidTopicMapException;

/**
 * INTERNAL: An encoding sniffer for CTM.
 *
 * <p>The sniffer looks for a UTF-8 byte order mark and for a leading
 * <code>%encoding "name"</code> declaration within the first few bytes
 * of the stream.
 */
public class CTMEncodingSniffer implements EncodingSnifferIF {

  /** Encoding assumed when the document declares none. */
  private static final String DEFAULT_ENCODING = "utf-8";

  /** Keyword that introduces an encoding declaration. */
  private static final String DECLARATION = "%encoding";

  /** Length in bytes of the UTF-8 byte order mark. */
  private static final int BOM_LENGTH = 3;

  /** Number of bytes inspected when looking for the declaration. */
  private static final int LOOKAHEAD = 50;

  /**
   * Guesses the character encoding of a CTM document.
   *
   * <p>A UTF-8 BOM, if present, is consumed from the stream. All other
   * bytes that were inspected are pushed back, so the stream must have
   * a pushback buffer of at least 50 bytes.
   *
   * @param stream the stream positioned at the start of the document
   * @return the declared encoding, <code>"utf-8"</code> if there is no
   *         declaration, or <code>null</code> if a declaration is
   *         present but no quoted encoding name follows it
   * @throws IOException if reading from or unreading to the stream fails
   * @throws InvalidTopicMapException if a UTF-8 BOM is present but the
   *         declared encoding is something other than UTF-8
   */
  public String guessEncoding(PushbackInputStream stream) throws IOException {
    // Look to see if there's a UTF-8 BOM (Byte Order Mark) at the
    // start of the stream.
    byte[] bomBuffer = new byte[BOM_LENGTH];
    int bomRead = readFully(stream, bomBuffer);

    // Casts are necessary because Java bytes are signed.
    boolean foundBom = bomRead == BOM_LENGTH
        && bomBuffer[0] == (byte) 0xEF
        && bomBuffer[1] == (byte) 0xBB
        && bomBuffer[2] == (byte) 0xBF;

    // Anything that is not a BOM belongs to the document itself.
    if (!foundBom && bomRead > 0) {
      stream.unread(bomBuffer, 0, bomRead);
    }

    String encoding = DEFAULT_ENCODING; // this is what the spec says to assume here

    // Now look for an encoding declaration.
    byte[] buf = new byte[LOOKAHEAD];
    int read = readFully(stream, buf);
    if (read > 0) {
      // Decode with a fixed single-byte charset: the declaration is pure
      // ASCII and the result must not depend on the platform default.
      String start = new String(buf, 0, read, "ISO-8859-1");
      stream.unread(buf, 0, read);

      // Get the encoding (if any) declared in the document.
      if (start.startsWith(DECLARATION)) {
        encoding = getEncoding(start);
      }
    }

    // If a BOM is found then the encoding must be utf-8. Encoding names
    // are case-insensitive, so "UTF-8" must not be reported as a conflict.
    if (foundBom && encoding != null
        && !encoding.equalsIgnoreCase(DEFAULT_ENCODING)) {
      throw new InvalidTopicMapException("Contradicting encoding information."
          + " The BOM indicates that the encoding should be utf-8,"
          + " but the encoding is declared to be: " + encoding + ".");
    }
    return encoding;
  }

  /**
   * Reads until the buffer is full or the end of the stream is reached.
   * A single <code>read</code> call may legally return fewer bytes than
   * requested, which would otherwise make the sniffer miss the BOM or
   * see only part of the declaration.
   *
   * @return the number of bytes actually read (0 at end of stream)
   */
  private static int readFully(PushbackInputStream stream, byte[] buffer)
      throws IOException {
    int total = 0;
    while (total < buffer.length) {
      int count = stream.read(buffer, total, buffer.length - total);
      if (count == -1) {
        break;
      }
      total += count;
    }
    return total;
  }

  /**
   * Extracts the quoted encoding name from text that starts with
   * <code>%encoding</code>.
   *
   * @param buf the start of the document, beginning with the keyword
   * @return the text between the first pair of double quotes (up to the
   *         end of <code>buf</code> if the closing quote is missing), or
   *         <code>null</code> if no opening quote follows the keyword
   */
  private String getEncoding(String buf) {
    // Skip the '%encoding' keyword itself, then scan past the whitespace.
    int ix = DECLARATION.length();
    while (ix < buf.length() && isWhitespace(buf.charAt(ix))) {
      ix++;
    }

    // FIXME: triple-quoted strings should be accepted here
    // The bounds check guards against a declaration whose whitespace
    // runs to the end of the inspected window.
    if (ix >= buf.length() || buf.charAt(ix) != '"') {
      return null;
    }

    ix++;
    int start = ix;
    while (ix < buf.length() && buf.charAt(ix) != '"') {
      ix++;
    }
    return buf.substring(start, ix);
  }

  /** Tells whether the character is a space, tab, line feed or carriage return. */
  private static boolean isWhitespace(char ch) {
    return ch == ' ' || ch == '\u0009' || ch == '\n' || ch == '\r';
  }
}