/* $Id$ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.connectorcommon.fuzzyml;
import org.apache.manifoldcf.core.interfaces.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
/** Main entry point for the fuzzy XML/HTML parser.
* The caller supplies an input stream (which the caller remains responsible for
* closing) together with a CharacterReceiver that performs the actual parsing.
* This class itself only wires up the byte-to-character pipeline and, when
* requested, works out which character set to decode with.
*/
public class Parser
{
  /** Constructor.
  * No character-detection configuration is accepted at present; a constructor
  * taking such configuration may be added in the future.
  */
  public Parser()
  {
  }

  /** Parse an input stream, detecting the character set first.
  * Detection consults the BOM (byte order mark) and then the xml encoding tag.
  * The encoding passed in by the caller acts as the default when neither of
  * those yields a better answer; if there is still no answer after the BOM pass,
  * UTF-8 is assumed.
  * The stream is wrapped so it can be replayed, and is read three times: once for
  * the BOM, once for the xml encoding tag, and once for the real parse.
  *@param startingCharset is the starting character set. Pass null if this is unknown.
  *@param inputStream is the input stream. It is the caller's responsibility to close the stream when the parse is done.
  *@param characterReceiver is the character receiver that will actually do the parsing.
  */
  public void parseWithCharsetDetection(String startingCharset, InputStream inputStream, CharacterReceiver characterReceiver)
    throws IOException, ManifoldCFException
  {
    // Everything below reads through a replayable wrapper so the stream can be rewound between passes.
    final ReplayableInputStream replayable = new ReplayableInputStream(inputStream);

    // Pass 1: BOM detection only. Nothing is downstream because the charset is not known yet.
    final BOMEncodingDetector bomOnlyPass = new BOMEncodingDetector(null);
    bomOnlyPass.setEncoding(startingCharset);
    runToCompletion(bomOnlyPass, replayable);

    // Take whatever pass 1 concluded, defaulting to UTF-8 when it concluded nothing.
    String bomCharset = bomOnlyPass.getEncoding();
    if (bomCharset == null)
    {
      bomCharset = StandardCharsets.UTF_8.name();
    }

    // Rewind; more replays are still to come.
    replayable.restart(false);

    // Pass 2: BOMEncodingDetector -> DecodingByteReceiver -> XMLEncodingDetector,
    // so the xml encoding tag can be read using the charset found so far.
    final XMLEncodingDetector xmlTagDetector = new XMLEncodingDetector();
    xmlTagDetector.setEncoding(bomCharset);
    final BOMEncodingDetector xmlTagPass =
      new BOMEncodingDetector(new DecodingByteReceiver(1024, bomCharset, xmlTagDetector));
    runToCompletion(xmlTagPass, replayable);

    // This is the final charset determination.
    final String resolvedCharset = xmlTagDetector.getEncoding();

    // Rewind for the last time.
    replayable.restart(true);

    // Pass 3: the complete chain, ending in the caller's receiver, does the actual parse.
    final BOMEncodingDetector parsePass =
      new BOMEncodingDetector(new DecodingByteReceiver(65536, resolvedCharset, characterReceiver));
    runToCompletion(parsePass, replayable);
  }

  /** Parse an input stream without character set detection.
  * The bytes are decoded with the supplied character set and handed straight to the receiver.
  *@param startingCharset is the starting character set. If null is passed, the code will presume utf-8.
  *@param inputStream is the input stream. It is the caller's responsibility to close the stream when the parse is done.
  *@param characterReceiver is the character receiver that will actually do the parsing.
  */
  public void parseWithoutCharsetDetection(String startingCharset, InputStream inputStream, CharacterReceiver characterReceiver)
    throws IOException, ManifoldCFException
  {
    final String charsetName = (startingCharset != null) ? startingCharset : StandardCharsets.UTF_8.name();
    final ByteReceiver decoder = new DecodingByteReceiver(65536, charsetName, characterReceiver);
    // Process to completion: finishUp() is only invoked when dealWithBytes() reports false.
    if (!decoder.dealWithBytes(inputStream))
    {
      decoder.finishUp();
    }
  }

  /** Feed the whole stream to a detector chain, calling finishUp() only when
  * dealWithBytes() returns false.
  *@param detector is the head of the chain to run.
  *@param stream is the (replayable) stream to read from.
  */
  private static void runToCompletion(BOMEncodingDetector detector, ReplayableInputStream stream)
    throws IOException, ManifoldCFException
  {
    if (!detector.dealWithBytes(stream))
    {
      detector.finishUp();
    }
  }
}