/* $Id$ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.connectorcommon.fuzzyml; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.core.system.Logging; import java.util.*; import java.io.*; /** Class to keep track of XML hierarchy in the face of possibly corrupt * XML and with case-insensitive tags, etc. * Basically, this class accepts what is supposedly XML but allows for various * kinds of handwritten corruption. Specific kinds of errors allowed include: * * - Bad character encoding * - Tag case match problems; all attributes are (optionally) bashed to lower case * - Other parsing recoveries to be added as they arise * * The functionality of this class is also somewhat lessened vs. standard * SAX-type parsers. No namespace interpretation is done, for instance; tag qnames * are split into namespace name and local name, and that's all folks. But if you need * more power, you can write a class extension that will do that readily. */ public class XMLFuzzyParseState extends TagParseState { protected final boolean lowerCaseAttributes; protected final boolean lowerCaseTags; protected final boolean lowerCaseQAttributes; protected final boolean lowerCaseQTags; protected final boolean lowerCaseBTags; protected final boolean lowerCaseEscapeTags; /** Constructor. */ public XMLFuzzyParseState(boolean lowerCaseAttributes, boolean lowerCaseTags, boolean lowerCaseQAttributes, boolean lowerCaseQTags, boolean lowerCaseBTags, boolean lowerCaseEscapeTags) { this.lowerCaseAttributes = lowerCaseAttributes; this.lowerCaseTags = lowerCaseTags; this.lowerCaseQAttributes = lowerCaseQAttributes; this.lowerCaseQTags = lowerCaseQTags; this.lowerCaseBTags = lowerCaseBTags; this.lowerCaseEscapeTags = lowerCaseEscapeTags; } /** This method gets called for every tag. Override this method to intercept tag begins. *@return true to halt further processing. */ @Override protected final boolean noteTag(String tagName, List<AttrNameValue> attributes) throws ManifoldCFException { Map<String,String> attrMap = new HashMap<String,String>(attributes.size()); for (AttrNameValue nv : attributes) { String name = nv.getName(); if (lowerCaseAttributes) name = nv.getName().toLowerCase(Locale.ROOT); attrMap.put(name, nv.getValue()); } if (lowerCaseTags) tagName = tagName.toLowerCase(Locale.ROOT); int index = tagName.indexOf(":"); String nameSpace; String localName; if (index == -1) { localName = tagName; nameSpace = null; } else { localName = tagName.substring(index+1); nameSpace = tagName.substring(0,index); } return noteTagEx(tagName, nameSpace, localName, attrMap); } /** Map version of the noteTag method. *@return true to halt further processing. */ protected boolean noteTagEx(String tagName, String nameSpace, String localName, Map<String,String> attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw tag '"+tagName+"'"); return false; } /** This method gets called for every end tag. Override this method to intercept tag ends. *@return true to halt further processing. */ @Override protected final boolean noteEndTag(String tagName) throws ManifoldCFException { if (lowerCaseTags) tagName = tagName.toLowerCase(Locale.ROOT); int index = tagName.indexOf(":"); String nameSpace; String localName; if (index == -1) { localName = tagName; nameSpace = null; } else { localName = tagName.substring(index+1); nameSpace = tagName.substring(0,index); } return noteEndTagEx(tagName, nameSpace, localName); } /** Note end tag. */ protected boolean noteEndTagEx(String tagName, String nameSpace, String localName) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw end tag '"+tagName+"'"); return false; } /** This method is called for every <? ... ?> construct, or 'qtag'. * This is not useful for HTML. *@return true to halt further processing. */ @Override protected final boolean noteQTag(String tagName, List<AttrNameValue> attributes) throws ManifoldCFException { Map<String,String> attrMap = new HashMap<String,String>(attributes.size()); for (AttrNameValue nv : attributes) { String name = nv.getName(); if (lowerCaseQAttributes) name = nv.getName().toLowerCase(Locale.ROOT); attrMap.put(name, nv.getValue()); } if (lowerCaseQTags) tagName = tagName.toLowerCase(Locale.ROOT); return noteQTagEx(tagName, attrMap); } /** Map version of the noteQTag method. *@return true to halt further processing. */ protected boolean noteQTagEx(String tagName, Map<String,String> attributes) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw QTag '"+tagName+"'"); return false; } /** This method is called for every <! <token> ... > construct, or 'btag'. * Override it to intercept these. *@return true to halt further processing. */ @Override protected final boolean noteBTag(String tagName) throws ManifoldCFException { if (lowerCaseBTags) tagName = tagName.toLowerCase(Locale.ROOT); return noteBTagEx(tagName); } /** New version of the noteBTag method. *@return true to halt further processing. */ protected boolean noteBTagEx(String tagName) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw BTag '"+tagName+"'"); return false; } /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]> *@param token may be empty!!! *@return true to halt further processing. */ @Override protected final boolean noteEscaped(String token) throws ManifoldCFException { if (lowerCaseEscapeTags && token != null) token = token.toLowerCase(Locale.ROOT); return noteEscapedEx(token); } /** New version of the noteEscapedTag method. *@return true to halt further processing. */ protected boolean noteEscapedEx(String token) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw Escaped '"+token+"'"); return false; } /** This method gets called for every token inside a btag. *@return true to halt further processing. */ @Override protected final boolean noteBTagToken(String token) throws ManifoldCFException { if (lowerCaseBTags) token = token.toLowerCase(Locale.ROOT); return noteBTagTokenEx(token); } /** New version of the noteBTagToken method. *@return true to halt further processing. */ protected boolean noteBTagTokenEx(String token) throws ManifoldCFException { if (Logging.misc.isDebugEnabled()) Logging.misc.debug(" Saw BTag token '"+token+"'"); return false; } }