/**
 * Copyright (C) 2014-2017 Philip Helger (www.helger.com)
 * philip[at]helger[dot]com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.helger.css.supplementary.parser;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;

import javax.annotation.Nonnull;
import javax.annotation.WillClose;
import javax.annotation.WillNotClose;

import com.helger.commons.ValueEnforcer;
import com.helger.commons.charset.CharsetManager;
import com.helger.commons.io.file.FileHelper;
import com.helger.commons.io.stream.StreamHelper;

/**
 * Work-in-progress CSS tokenizer following the CSS Syntax Module Level 3
 * tokenization algorithm. It detects the stylesheet encoding from a leading
 * <code>@charset "..."</code> declaration and then passes tokens to a
 * {@link Consumer}. Currently only whitespace, comments and a lone solidus are
 * turned into tokens; every other token start is reported on
 * <code>System.err</code> and skipped.
 */
public class CSSTokenizer
{
  /** Byte prefix that introduces a charset declaration. */
  private static final String CHARSET = "@charset \"";
  /** Maximum number of leading bytes inspected for a charset declaration. */
  private static final int MAX_HEADER_BYTES = 1024;
  /**
   * The header is only inspected if more than this many bytes are present (the
   * prefix, at least one name character and the closing quote).
   */
  private static final int MIN_HEADER_BYTES = 11;

  private final Charset m_aFallbackEncoding;
  private boolean m_bStrictMode = false;
  private boolean m_bDebugMode = false;

  /**
   * Create a tokenizer that falls back to UTF-8 if no charset declaration is
   * found.
   */
  public CSSTokenizer ()
  {
    this (StandardCharsets.UTF_8);
  }

  /**
   * Create a tokenizer with a custom fallback encoding.
   *
   * @param aFallbackEncoding
   *        The charset to use if the input has no charset declaration. May not
   *        be <code>null</code>.
   */
  public CSSTokenizer (@Nonnull final Charset aFallbackEncoding)
  {
    m_aFallbackEncoding = ValueEnforcer.notNull (aFallbackEncoding, "FallbackEncoding");
  }

  /**
   * Enable or disable strict mode. In strict mode the charset declaration
   * prefix is compared case sensitively, otherwise case insensitively.
   *
   * @param bStrictMode
   *        <code>true</code> to enable strict mode.
   * @return this for chaining
   */
  @Nonnull
  public CSSTokenizer setStrictMode (final boolean bStrictMode)
  {
    m_bStrictMode = bStrictMode;
    return this;
  }

  /**
   * Enable or disable debug mode. In debug mode every token start is traced on
   * <code>System.out</code>.
   *
   * @param bDebugMode
   *        <code>true</code> to enable debug output.
   * @return this for chaining
   */
  @Nonnull
  public CSSTokenizer setDebugMode (final boolean bDebugMode)
  {
    m_bDebugMode = bDebugMode;
    return this;
  }

  /**
   * Inspect the first bytes of the stream for a charset declaration. All bytes
   * that were read are pushed back afterwards so tokenizing starts at the very
   * beginning of the input.
   *
   * @param aIS
   *        The input stream to inspect. It is not closed.
   * @return The declared charset, or the fallback encoding if no declaration
   *         is present. Never <code>null</code>.
   * @throws IOException
   *         On read error
   * @throws CSSTokenizeException
   *         If the declaration is not terminated within the inspected bytes or
   *         names an unsupported charset.
   */
  @Nonnull
  private Charset _determineCharset (@Nonnull @WillNotClose final CSSInputStream aIS) throws IOException,
                                                                                      CSSTokenizeException
  {
    // Determine charset
    // https://www.w3.org/TR/css-syntax-3/#input-byte-stream
    final int nMaxHeader = Math.min (MAX_HEADER_BYTES, aIS.available ());
    if (nMaxHeader <= MIN_HEADER_BYTES)
      return m_aFallbackEncoding;

    final byte [] aBuffer = new byte [nMaxHeader];
    // read() may deliver fewer bytes than requested - only the bytes really
    // read may be pushed back and inspected
    final int nRead = aIS.read (aBuffer);
    if (nRead <= 0)
      return m_aFallbackEncoding;

    if (nRead == nMaxHeader)
      aIS.unread (aBuffer);
    else
    {
      final byte [] aReadBytes = new byte [nRead];
      System.arraycopy (aBuffer, 0, aReadBytes, 0, nRead);
      aIS.unread (aReadBytes);
    }

    if (nRead <= MIN_HEADER_BYTES)
      return m_aFallbackEncoding;

    final int nPrefixLen = CHARSET.length ();
    final String sPrefix = new String (aBuffer, 0, nPrefixLen, StandardCharsets.US_ASCII);
    final boolean bHasDeclaration = m_bStrictMode ? CHARSET.equals (sPrefix) : CHARSET.equalsIgnoreCase (sPrefix);
    if (!bHasDeclaration)
      return m_aFallbackEncoding;

    // Search the closing quote of the charset name
    int nEnd = nPrefixLen;
    while (nEnd < nRead && aBuffer[nEnd] != '"')
      nEnd++;
    if (nEnd == nRead)
      throw new CSSTokenizeException ("Unexpected end of @charset declaration");

    String sCharset = new String (aBuffer, nPrefixLen, nEnd - nPrefixLen, StandardCharsets.US_ASCII);
    // The declaration itself was readable as ASCII bytes, so a declared UTF-16
    // variant cannot be correct - the specification maps it to UTF-8
    if ("utf-16be".equalsIgnoreCase (sCharset) || "utf-16le".equalsIgnoreCase (sCharset))
      sCharset = "utf-8";

    final Charset aCharset = CharsetManager.getCharsetFromName (sCharset);
    if (aCharset == null)
      throw new CSSTokenizeException ("Unsupported charset '" + sCharset + "' provided!");
    return aCharset;
  }

  /**
   * Print the position and the code point that starts the current token to
   * <code>System.out</code>.
   *
   * @param aReader
   *        The reader providing the token start position.
   * @param aCP
   *        The code point that starts the token.
   */
  private static void _traceTokenStart (@Nonnull final CSSCodepointReader aReader, @Nonnull final CSSCodepoint aCP)
  {
    final ECSSTokenStartType eTokenStartType = aCP.getTokenStartType ();
    final String sWhat;
    if (eTokenStartType == ECSSTokenStartType.EOF)
      sWhat = "EOF";
    else
    {
      final int nValue = aCP.getValue ();
      final String sValue = nValue >= 0x20 && nValue <= 0x7f ? Character.toString ((char) nValue)
                                                             : "0x" + Integer.toHexString (nValue);
      sWhat = "read CP " + sValue + " as " + eTokenStartType;
    }
    System.out.println ("[" +
                        aReader.getTokenStartLineNumber () +
                        ":" +
                        aReader.getTokenStartColumnNumber () +
                        "] - " +
                        sWhat);
  }

  /**
   * Consume all consecutive whitespace code points and create a whitespace
   * token. The first non-whitespace code point is pushed back.
   *
   * @param aReader
   *        The reader positioned after the first whitespace code point.
   * @return The created whitespace token.
   * @throws IOException
   *         On read error
   * @throws CSSTokenizeException
   *         On tokenizer error
   */
  private static CSSToken _consumeWhitespace (@Nonnull final CSSCodepointReader aReader) throws IOException,
                                                                                         CSSTokenizeException
  {
    CSSCodepoint aCP;
    do
    {
      aCP = aReader.read ();
    } while (aCP.getTokenStartType () == ECSSTokenStartType.WHITESPACE);
    aReader.unread (aCP);
    return aReader.createToken (ECSSTokenType.WHITESPACE);
  }

  /**
   * Handle a token that starts with a solidus: either a comment (if directly
   * followed by an asterisk) or a plain delimiter. A comment ends at the first
   * asterisk-solidus sequence or at the end of the input, whichever comes
   * first.
   *
   * @param aReader
   *        The reader positioned after the solidus.
   * @return The created comment or delimiter token.
   * @throws IOException
   *         On read error
   * @throws CSSTokenizeException
   *         On tokenizer error
   */
  private static CSSToken _consumeSolidus (@Nonnull final CSSCodepointReader aReader) throws IOException,
                                                                                      CSSTokenizeException
  {
    // Maybe a comment?
    if (aReader.peek ().getTokenStartType () != ECSSTokenStartType.ASTERISK)
      return aReader.createToken (ECSSTokenType.DELIM);

    // It's a comment - skip the opening asterisk
    aReader.read ();
    while (true)
    {
      final CSSCodepoint aCP = aReader.read ();
      final ECSSTokenStartType eType = aCP.getTokenStartType ();
      if (eType == ECSSTokenStartType.EOF)
      {
        // Unterminated comment: stop instead of looping forever and hand the
        // EOF back (same way the whitespace handling pushes back its
        // terminating code point) so the main loop can emit the EOF token
        aReader.unread (aCP);
        break;
      }
      if (eType == ECSSTokenStartType.ASTERISK &&
          aReader.peek ().getTokenStartType () == ECSSTokenStartType.SOLIDUS)
      {
        // Skip the closing solidus
        aReader.read ();
        break;
      }
    }
    return aReader.createToken (ECSSTokenType.COMMENT);
  }

  /**
   * Tokenize the passed input stream and pass every created token to the
   * provided consumer. The last token passed is always the EOF token. The
   * input stream is wrapped in a {@link CSSInputStream} that is closed when
   * tokenizing ends.
   *
   * @param aIS
   *        The input stream to read from. May not be <code>null</code>.
   * @param aConsumer
   *        The consumer receiving the tokens. May not be <code>null</code>.
   * @throws IOException
   *         On read error
   * @throws CSSTokenizeException
   *         On an invalid charset declaration or tokenizer error
   */
  public void tokenize (@Nonnull @WillClose final InputStream aIS,
                        @Nonnull final Consumer <CSSToken> aConsumer) throws IOException, CSSTokenizeException
  {
    ValueEnforcer.notNull (aIS, "InputStream");
    ValueEnforcer.notNull (aConsumer, "Consumer");

    try (final CSSInputStream aCSSIS = new CSSInputStream (aIS))
    {
      final Charset aCharset = _determineCharset (aCSSIS);
      try (final CSSCodepointReader aReader = new CSSCodepointReader (aCSSIS, aCharset))
      {
        while (true)
        {
          // https://www.w3.org/TR/css-syntax-3/#consume-a-token0
          final CSSCodepoint aCP = aReader.startToken ();
          final ECSSTokenStartType eTokenStartType = aCP.getTokenStartType ();
          if (m_bDebugMode)
            _traceTokenStart (aReader, aCP);

          if (eTokenStartType == ECSSTokenStartType.EOF)
          {
            // EOF
            aConsumer.accept (aReader.createToken (ECSSTokenType.EOF));
            break;
          }

          CSSToken aToken = null;
          switch (eTokenStartType)
          {
            case WHITESPACE:
              aToken = _consumeWhitespace (aReader);
              break;
            case SOLIDUS:
              aToken = _consumeSolidus (aReader);
              break;
            default:
              // Not yet implemented token start types are reported and
              // skipped without creating a token
              System.err.println ("Unsupported token start type " + eTokenStartType);
          }

          if (aToken != null)
            aConsumer.accept (aToken);
        }
      }
    }
  }

  /**
   * Manual test driver: tokenizes a fixed test stylesheet and prints every
   * token to <code>System.out</code>.
   *
   * @param args
   *        Ignored
   * @throws IOException
   *         On read error
   * @throws CSSTokenizeException
   *         On tokenizer error
   */
  public static void main (final String [] args) throws IOException, CSSTokenizeException
  {
    final File f = new File ("src/test/resources/testfiles/css30/good/pure-min.css");
    try (InputStream aIS = StreamHelper.getBuffered (FileHelper.getInputStream (f)))
    {
      new CSSTokenizer ().setDebugMode (false).tokenize (aIS, t -> System.out.println (t));
    }
  }
}