/******************************************************************************* * Copyright (c) 2001, 2010 IBM Corporation and others. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * IBM Corporation - initial API and implementation * Jens Lukowski/Innoopract - initial renaming/restructuring * *******************************************************************************/ package org.eclipse.wst.xml.core.internal.parser; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import org.eclipse.jface.text.BadLocationException; import org.eclipse.jface.text.IDocument; import org.eclipse.wst.sse.core.internal.document.DocumentReader; import org.eclipse.wst.sse.core.internal.ltk.parser.BlockMarker; import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTagParser; import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTokenizer; import org.eclipse.wst.sse.core.internal.ltk.parser.RegionParser; import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionHandler; import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParser; import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParserExtension; import org.eclipse.wst.sse.core.internal.provisional.text.IStructuredDocumentRegion; import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegion; import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionContainer; import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionList; import org.eclipse.wst.sse.core.internal.text.CharSequenceReader; import org.eclipse.wst.sse.core.internal.text.IRegionComparible; import org.eclipse.wst.sse.core.internal.util.Debug; import org.eclipse.wst.xml.core.internal.Logger; import org.eclipse.wst.xml.core.internal.regions.DOMRegionContext; /** * Takes input from the HTMLTokenizer and creates a tag list */ public class XMLSourceParser implements RegionParser, BlockTagParser, StructuredDocumentRegionParser, IRegionComparible, StructuredDocumentRegionParserExtension { // made public to aid access from inner classes in hierarchy. // TODO: in future, figure out how to solve without exposing data. public CharSequence fCharSequenceSource = null; private IDocument fDocumentInput; protected int fOffset = 0; // DMW: 2/12/03. Removed some state data, since not really needed, // and since it added a lot to overhead (since so many regions are // created. // protected IStructuredDocumentRegion fCurrentNode = null; // protected IStructuredDocumentRegion fNodes = null; // protected List fRegions = null; // protected Object fInput = null; protected String fStringInput = null; protected List fStructuredDocumentRegionHandlers; protected BlockTokenizer fTokenizer = null; protected long startTime; protected long stopTime; /** * HTMLSourceParser constructor comment. */ public XMLSourceParser() { super(); fStructuredDocumentRegionHandlers = new ArrayList(); } /** * This is a simple utility to count nodes. Used only for debug * statements. */ protected int _countNodes(IStructuredDocumentRegion nodes) { int result = 0; IStructuredDocumentRegion countNode = nodes; while (countNode != null) { result++; countNode = countNode.getNext(); } return result; } public void addBlockMarker(BlockMarker marker) { getTokenizer().addBlockMarker(marker); } public synchronized void addStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) { if (fStructuredDocumentRegionHandlers == null) fStructuredDocumentRegionHandlers = new ArrayList(); synchronized (fStructuredDocumentRegionHandlers) { fStructuredDocumentRegionHandlers.add(handler); } } public void beginBlockScan(String newTagName) { getTokenizer().beginBlockTagScan(newTagName); } /** * @return IStructuredDocumentRegion */ protected IStructuredDocumentRegion createStructuredDocumentRegion(String type) { IStructuredDocumentRegion newNode = null; if (type == DOMRegionContext.BLOCK_TEXT) newNode = XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML_BLOCK); else newNode = XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML); return newNode; } protected void fireNodeParsed(IStructuredDocumentRegion fCurrentNode) { /* * Never let an Exceptions from foreign code interfere with completion * of parsing. To get an exception here is definitely a program error * somewhere, but we can't afford to interrupt the flow of control. or * backwards typing can result! * * Protect the user's data above everything. */ Object[] handlers = null; synchronized (fStructuredDocumentRegionHandlers) { if (fStructuredDocumentRegionHandlers == null) return; handlers = fStructuredDocumentRegionHandlers.toArray(); } if (fCurrentNode != null && handlers != null) { for (int i = 0; i < handlers.length; i++) { try { ((StructuredDocumentRegionHandler) handlers[i]).nodeParsed(fCurrentNode); } catch (Exception e) { Logger.log(Logger.ERROR, "Error occurred while firing Node Parsed event", e); //$NON-NLS-1$ } } } } public BlockMarker getBlockMarker(String tagName) { List markers = getTokenizer().getBlockMarkers(); for (int i = 0; i < markers.size(); i++) { BlockMarker marker = (BlockMarker) markers.get(i); if (marker.isCaseSensitive()) { if (marker.getTagName().equals(tagName)) return marker; } else { if (marker.getTagName().equalsIgnoreCase(tagName)) return marker; } } return null; } public List getBlockMarkers() { return getTokenizer().getBlockMarkers(); } /** * @return IStructuredDocumentRegion */ public IStructuredDocumentRegion getDocumentRegions() { IStructuredDocumentRegion headnode = null; if (headnode == null) { if (Debug.perfTest) { startTime = System.currentTimeMillis(); } headnode = parseNodes(); if (Debug.perfTest) { stopTime = System.currentTimeMillis(); System.out.println(" -- creating nodes of IStructuredDocument -- "); //$NON-NLS-1$ System.out.println(" Time parse and init all regions: " + (stopTime - startTime) + " (msecs)"); //$NON-NLS-2$//$NON-NLS-1$ // System.out.println(" for " + fRegions.size() + " // Regions");//$NON-NLS-2$//$NON-NLS-1$ System.out.println(" and " + _countNodes(headnode) + " Nodes"); //$NON-NLS-2$//$NON-NLS-1$ } } return headnode; } protected ITextRegion getNextRegion() { ITextRegion region = null; try { region = getTokenizer().getNextToken(); // DMW: 2/12/03 Removed state // if (region != null) { // fRegions.add(region); // } return region; } catch (StackOverflowError e) { Logger.logException(getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset(), e); //$NON-NLS-1$ throw e; } catch (Exception e) { Logger.logException(getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset() + " (" + e.getLocalizedMessage() + ")", e); //$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$ } return null; } /** * Return the full list of known regions. Typically getNodes should be * used instead of this method. */ public List getRegions() { IStructuredDocumentRegion headNode = null; if (!getTokenizer().isEOF()) { headNode = getDocumentRegions(); // throw new IllegalStateException("parsing has not finished"); } // for memory recovery, we assume if someone // requests all regions, we can reset our big // memory consuming objects // but the new "getRegions" method is then more expensive. // I don't think its used much, though. List localRegionsList = getRegions(headNode); primReset(); return localRegionsList; } /** * Method getRegions. * * @param headNode * @return List */ protected List getRegions(IStructuredDocumentRegion headNode) { List allRegions = new ArrayList(); IStructuredDocumentRegion currentNode = headNode; while (currentNode != null) { ITextRegionList nodeRegions = currentNode.getRegions(); for (int i = 0; i < nodeRegions.size(); i++) { allRegions.add(nodeRegions.get(i)); } currentNode = currentNode.getNext(); } return allRegions; } /** * @deprecated - use the add/remove methods instead * @return java.util.List */ public List getStructuredDocumentRegionHandlers() { if (fStructuredDocumentRegionHandlers == null) { fStructuredDocumentRegionHandlers = new ArrayList(0); } return fStructuredDocumentRegionHandlers; } /** * Returns text from the current input. Text is only valid before * getNodes() has been called and only when a raw String or DocumentReader * is given as the input. */ public String getText(int offset, int length) { String text = null; if (fCharSequenceSource != null) { int start = fOffset + offset; int end = start + length; text = fCharSequenceSource.subSequence(start, end).toString(); } else if (fDocumentInput != null) { try { text = fDocumentInput.get(offset, length); } catch (BadLocationException e) { text = ""; //$NON-NLS-1$ } } else { if (fStringInput == null || fStringInput.length() == 0 || offset + length > fStringInput.length() || offset < 0) { text = ""; //$NON-NLS-1$ } else { // offset is entirely valid during parsing as the parse // numbers haven't been adjusted. text = fStringInput.substring(offset, offset + length); } } return text; } protected BlockTokenizer getTokenizer() { if (fTokenizer == null) { fTokenizer = new XMLTokenizer(); } return fTokenizer; } public RegionParser newInstance() { XMLSourceParser newInstance = new XMLSourceParser(); newInstance.setTokenizer(getTokenizer().newInstance()); return newInstance; } protected IStructuredDocumentRegion parseNodes() { // regions are initially reported as complete offsets within the // scanned input // they are adjusted here to be indexes from the currentNode's start // offset IStructuredDocumentRegion headNode = null; IStructuredDocumentRegion lastNode = null; ITextRegion region = null; IStructuredDocumentRegion currentNode = null; String type = null; while ((region = getNextRegion()) != null) { type = region.getType(); // these types (might) demand a IStructuredDocumentRegion for each // of them if (type == DOMRegionContext.BLOCK_TEXT) { if (currentNode != null && currentNode.getLastRegion().getType() == DOMRegionContext.BLOCK_TEXT) { // multiple block texts indicated embedded containers; no // new IStructuredDocumentRegion currentNode.addRegion(region); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); region.adjustStart(-currentNode.getStart()); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); } else { // not continuing a IStructuredDocumentRegion if (currentNode != null) { // ensure that any existing node is at least // terminated if (!currentNode.isEnded()) { currentNode.setLength(region.getStart() - currentNode.getStart()); // fCurrentNode.setTextLength(region.getStart() - // fCurrentNode.getStart()); } lastNode = currentNode; } fireNodeParsed(currentNode); currentNode = createStructuredDocumentRegion(type); if (lastNode != null) { lastNode.setNext(currentNode); } currentNode.setPrevious(lastNode); currentNode.setStart(region.getStart()); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); currentNode.setEnded(true); region.adjustStart(-currentNode.getStart()); currentNode.addRegion(region); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); } } // the following contexts OPEN new StructuredDocumentRegions else if ((currentNode != null && currentNode.isEnded()) || (type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE) || (type == DOMRegionContext.XML_PI_OPEN) || (type == DOMRegionContext.XML_TAG_OPEN) || (type == DOMRegionContext.XML_END_TAG_OPEN) || (type == DOMRegionContext.XML_COMMENT_OPEN) || (type == DOMRegionContext.XML_CDATA_OPEN) || (type == DOMRegionContext.XML_DECLARATION_OPEN)) { if (currentNode != null) { // ensure that any existing node is at least terminated if (!currentNode.isEnded()) { currentNode.setLength(region.getStart() - currentNode.getStart()); // fCurrentNode.setTextLength(region.getStart() - // fCurrentNode.getStart()); } lastNode = currentNode; } fireNodeParsed(currentNode); currentNode = createStructuredDocumentRegion(type); if (lastNode != null) { lastNode.setNext(currentNode); } currentNode.setPrevious(lastNode); currentNode.setStart(region.getStart()); currentNode.addRegion(region); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); region.adjustStart(-currentNode.getStart()); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); } // the following contexts neither open nor close // StructuredDocumentRegions; just add to them else if ((type == DOMRegionContext.XML_TAG_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_EQUALS) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_VALUE) || (type == DOMRegionContext.XML_COMMENT_TEXT) || (type == DOMRegionContext.XML_PI_CONTENT) || (type == DOMRegionContext.XML_DOCTYPE_INTERNAL_SUBSET)) { currentNode.addRegion(region); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); region.adjustStart(-currentNode.getStart()); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); } // the following contexts close off StructuredDocumentRegions // cleanly else if ((type == DOMRegionContext.XML_PI_CLOSE) || (type == DOMRegionContext.XML_TAG_CLOSE) || (type == DOMRegionContext.XML_EMPTY_TAG_CLOSE) || (type == DOMRegionContext.XML_COMMENT_CLOSE) || (type == DOMRegionContext.XML_DECLARATION_CLOSE) || (type == DOMRegionContext.XML_CDATA_CLOSE)) { currentNode.setEnded(true); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); currentNode.addRegion(region); region.adjustStart(-currentNode.getStart()); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); } // this is extremely rare, but valid else if (type == DOMRegionContext.WHITE_SPACE) { ITextRegion lastRegion = currentNode.getLastRegion(); // pack the embedded container with this region if (lastRegion instanceof ITextRegionContainer) { ITextRegionContainer container = (ITextRegionContainer) lastRegion; container.getRegions().add(region); // containers must have parent set ... // setting for EACH subregion is redundent, but not sure // where else to do, so will do here for now. container.setParent(currentNode); // DW 4/16/2003 regions no longer have parents // region.setParent(container); region.adjustStart(container.getLength() - region.getStart()); } currentNode.getLastRegion().adjustLength(region.getLength()); currentNode.adjustLength(region.getLength()); } else if (type == DOMRegionContext.UNDEFINED && currentNode != null) { // skip on a very-first region situation as the default // behavior is good enough // combine with previous if also undefined if (currentNode.getLastRegion() != null && currentNode.getLastRegion().getType() == DOMRegionContext.UNDEFINED) { currentNode.getLastRegion().adjustLength(region.getLength()); currentNode.adjustLength(region.getLength()); } // previous wasn't undefined else { currentNode.addRegion(region); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); region.adjustStart(-currentNode.getStart()); } } else { // if an unknown type is the first region in the document, // ensure that a node exists if (currentNode == null) { currentNode = createStructuredDocumentRegion(type); currentNode.setStart(region.getStart()); } currentNode.addRegion(region); currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart()); region.adjustStart(-currentNode.getStart()); // DW 4/16/2003 regions no longer have parents // region.setParent(currentNode); if (Debug.debugTokenizer) System.out.println(getClass().getName() + " found region of not specifically handled type " + region.getType() + " @ " + region.getStart() + "[" + region.getLength() + "]"); //$NON-NLS-4$//$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$ //$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$ } // these regions also get their own node, so close them cleanly // NOTE: these regions have new StructuredDocumentRegions created // for them above; it may // be more readable if that is handled here as well, but the // current layout // ensures that they open StructuredDocumentRegions the same way if ((type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE)) { currentNode.setEnded(true); } if (headNode == null && currentNode != null) { headNode = currentNode; } } if (currentNode != null) { fireNodeParsed(currentNode); currentNode.setPrevious(lastNode); } // fStringInput = null; primReset(); return headNode; } protected void primReset() { // fNodes = null; // fRegions = null; // fInput = null; fStringInput = null; fCharSequenceSource = null; fDocumentInput = null; fOffset = 0; // fCurrentNode = null; // DMW: also reset tokenizer so it doesn't hold on // to large arrays getTokenizer().reset(new char[0]); } /* * (non-Javadoc) * * @see org.eclipse.wst.sse.core.internal.text.IRegionComparible#regionMatches(int, * int, java.lang.String) */ public boolean regionMatches(int offset, int length, String stringToCompare) { // by definition if (stringToCompare == null) return false; int ajustedOffset = fOffset + offset; boolean result = false; if (fCharSequenceSource != null && fCharSequenceSource instanceof IRegionComparible) { result = ((IRegionComparible) fCharSequenceSource).regionMatches(ajustedOffset, length, stringToCompare); } else { // old fashioned ways String test = null; if (fCharSequenceSource != null) { test = fCharSequenceSource.subSequence(ajustedOffset, ajustedOffset + length).toString(); } else if (fStringInput != null) { test = fStringInput.substring(ajustedOffset, ajustedOffset + length); } result = stringToCompare.equals(test); } return result; } public boolean regionMatchesIgnoreCase(int offset, int length, String stringToCompare) { // by definition if (stringToCompare == null) return false; int ajustedOffset = fOffset + offset; boolean result = false; if (fCharSequenceSource != null && fCharSequenceSource instanceof IRegionComparible) { result = ((IRegionComparible) fCharSequenceSource).regionMatchesIgnoreCase(ajustedOffset, length, stringToCompare); } else { // old fashioned ways String test = null; if (fCharSequenceSource != null) { test = fCharSequenceSource.subSequence(ajustedOffset, ajustedOffset + length).toString(); } else if (fStringInput != null) { test = fStringInput.substring(ajustedOffset, ajustedOffset + length); } result = stringToCompare.equalsIgnoreCase(test); } return result; } public void removeBlockMarker(BlockMarker marker) { getTokenizer().removeBlockMarker(marker); } public void removeBlockMarker(String tagName) { getTokenizer().removeBlockMarker(tagName); } public void removeStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) { if (fStructuredDocumentRegionHandlers == null) return; synchronized (fStructuredDocumentRegionHandlers) { fStructuredDocumentRegionHandlers.remove(handler); } } /** * Resets the input. */ public void reset(java.io.FileInputStream instream) { primReset(); // fInput = instream; getTokenizer().reset(instream); } /** * Resets the input. */ public void reset(java.io.Reader reader) { reset(reader, 0); } /** * Resets the input. */ public void reset(java.io.Reader reader, int position) { primReset(); fOffset = position; getTokenizer().reset(reader, position); if (reader instanceof DocumentReader) { IDocument doc = ((DocumentReader) reader).getDocument(); if (doc instanceof CharSequence) { fCharSequenceSource = (CharSequence) doc; } else { // old fashioned IDocument fDocumentInput = ((DocumentReader) reader).getDocument(); } } else if (reader instanceof CharSequenceReader) { fCharSequenceSource = ((CharSequenceReader) reader).getOriginalSource(); } } /** * Resets the input. Use this version to allow text to be retrieved * <em>during</em> parsing, such as by the * StructuredDocumentRegionHandler. */ public void reset(String sourceString) { reset(new StringReader(sourceString)); fStringInput = sourceString; } /** * Resets the input. Use this version to allow text to be retrieved * <em>during</em> parsing, such as by the * StructuredDocumentRegionHandler. */ public void reset(String sourceString, int position) { StringReader reader = new StringReader(sourceString); reset(reader, position); fStringInput = sourceString; } public void resetHandlers() { Object[] handlers = null; synchronized (fStructuredDocumentRegionHandlers) { if (fStructuredDocumentRegionHandlers == null) return; handlers = fStructuredDocumentRegionHandlers.toArray(); } for (int i = 0; i < handlers.length; i++) { try { ((StructuredDocumentRegionHandler) handlers[i]).resetNodes(); } catch (Exception e) { Logger.log(Logger.ERROR, "Error occurred while resetting handlers", e); //$NON-NLS-1$ } } } /** * * @param List */ public void setStructuredDocumentRegionHandlers(List newStructuredDocumentRegionHandlers) { fStructuredDocumentRegionHandlers = newStructuredDocumentRegionHandlers; } protected void setTokenizer(BlockTokenizer newTokenizer) { // DMW: changed from private to protected, so subclass could use in // creation of 'newInstance'. fTokenizer = newTokenizer; } }