package info.bliki.wiki.filter; import info.bliki.htmlcleaner.TagToken; import info.bliki.wiki.model.Configuration; import info.bliki.wiki.model.ITableOfContent; import info.bliki.wiki.model.IWikiModel; import info.bliki.wiki.tags.TableOfContentTag; import info.bliki.wiki.tags.util.NodeAttribute; import info.bliki.wiki.tags.util.WikiTagNode; import java.util.ArrayList; import java.util.List; import java.util.Map; import net.fortuna.ical4j.data.ParserException; public class WikipediaScanner { public final static String TAG_NAME = "$TAG_NAME"; /** * Return value when the source is exhausted. Has a value of <code>-1</code>. */ public static final int EOF = -1; protected int fScannerPosition; protected IWikiModel fWikiModel = null; /** * The <code>String</code> of the given raw wiki text */ protected final String fStringSource; /** * The corresponding <code>char[]</code> array for the string source */ protected final char[] fSource; public WikipediaScanner(String src) { this(src, 0); } public WikipediaScanner(String src, int position) { fSource = src.toCharArray(); fStringSource = src; fScannerPosition = position; } public void setModel(IWikiModel wikiModel) { fWikiModel = wikiModel; } public int getPosition() { return fScannerPosition; } public void setPosition(int newPos) { fScannerPosition = newPos; } public WPCodeBlock codeBlock() { WPCodeBlock code = null; try { if (fScannerPosition < 0) { // simulate newline fScannerPosition = 0; } if (fSource[fScannerPosition++] != '{') { return null; } if (fSource[fScannerPosition++] != 'c' || fSource[fScannerPosition++] != 'o' || fSource[fScannerPosition++] != 'd' || fSource[fScannerPosition++] != 'e') { return null; } if (fSource[fScannerPosition++] != '}') { return null; } int contentsStart = fScannerPosition; char ch = ' '; while (true) { ch = fSource[fScannerPosition++]; switch (ch) { case '{': int cposition = findNestedEndSingle(fSource, '{', '}', fScannerPosition); if (cposition >= 0) { String block = new 
String(fSource).substring(fScannerPosition, cposition - 1); if ("code".equals(block)) { int contentsEnd = fScannerPosition; code = new WPCodeBlock(); String contents = new String(fSource).substring(contentsStart, contentsEnd - 1); code.setContents(contents); /** remove end {code} tag */ fScannerPosition = fScannerPosition + "{code}".length(); return code; } continue; } break; } } } catch (IndexOutOfBoundsException e) { fScannerPosition = fSource.length; } return null; } public WPBlockQuote blockQuote() { WPBlockQuote code = null; try { if (fScannerPosition < 0) { // simulate newline fScannerPosition = 0; } if (fSource[fScannerPosition++] != '{') { return null; } if (fSource[fScannerPosition++] != 'q' || fSource[fScannerPosition++] != 'u' || fSource[fScannerPosition++] != 'o' || fSource[fScannerPosition++] != 't' || fSource[fScannerPosition++] != 'e') { return null; } if (fSource[fScannerPosition++] != '}') { return null; } int contentsStart = fScannerPosition; char ch = ' '; while (true) { ch = fSource[fScannerPosition++]; switch (ch) { case '{': int cposition = findNestedEndSingle(fSource, '{', '}', fScannerPosition); if (cposition >= 0) { String block = new String(fSource).substring(fScannerPosition, cposition - 1); if ("quote".equals(block)) { int contentsEnd = fScannerPosition; code = new WPBlockQuote(); String contents = new String(fSource).substring(contentsStart, contentsEnd - 1); code.setContents(contents); /** remove end {quote} tag */ fScannerPosition = fScannerPosition + "{quote}".length(); return code; } continue; } break; } } } catch (IndexOutOfBoundsException e) { fScannerPosition = fSource.length; } return null; } /** * Scan a wikipedia table. 
*
   * See: <a href="http://meta.wikimedia.org/wiki/Help:Table">Help - Table</a>
   * <p>
   * The table must start with <code>{|</code> at the current scanner
   * position. If the input ends before the closing <code>|}</code>, the rows
   * and cells collected so far are still returned.
   *
   * @param tableOfContentTag
   *          not read by this method
   * @return <code>null</code> if no wiki table was found
   */
  public WPTable wpTable(ITableOfContent tableOfContentTag) {
    WPTable table = null;
    WPCell cell = null;
    ArrayList<WPCell> cells = new ArrayList<WPCell>();
    WPRow row = new WPRow(cells);
    try {
      if (fScannerPosition < 0) {
        // simulate newline
        fScannerPosition = 0;
      }
      if (fSource[fScannerPosition++] != '{') {
        return null;
      }
      if (fSource[fScannerPosition++] != '|') {
        return null;
      }
      ArrayList<WPRow> rows = new ArrayList<WPRow>();
      table = new WPTable(rows);
      int startPos = fScannerPosition;
      // read parameters until end of line
      nextNewline();
      table.setParams(fStringSource.substring(startPos, fScannerPosition));
      char ch = ' ';
      // The loop has no bounds check: reading past the end of fSource throws
      // IndexOutOfBoundsException, which is handled below as "end of input".
      while (true) {
        ch = fSource[fScannerPosition++];
        switch (ch) {
        case '[':
          // skip a bracketed group so that '|' and '!' inside it are ignored
          int position = findNestedEndSingle(fSource, '[', ']', fScannerPosition);
          if (position >= 0) {
            fScannerPosition = position;
            continue;
          }
          break;
        case '{':
          // skip a braced group so that '|' and '!' inside it are ignored
          int cposition = findNestedEndSingle(fSource, '{', '}', fScannerPosition);
          if (cposition >= 0) {
            fScannerPosition = cposition;
            continue;
          }
          break;
        case '\n':
          ch = fSource[fScannerPosition++];
          // ignore whitespace at the beginning of the line
          while (ch == ' ' || ch == '\t') {
            ch = fSource[fScannerPosition++];
          }
          switch (ch) {
          case '|': // "\n |"
            if (cell != null) {
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
              cell = null;
            }
            ch = fSource[fScannerPosition++];
            switch (ch) {
            case '-': // new row - "\n|-"
              addTableRow(table, row);
              cells = new ArrayList<WPCell>();
              row = new WPRow(cells);
              startPos = fScannerPosition;
              // cell is always null here, so only the position is advanced
              nextNewlineCell(cell);
              row.setParams(fStringSource.substring(startPos, fScannerPosition));
              break;
            case '+': // new row - "\n|+"
              addTableRow(table, row);
              cells = new ArrayList<WPCell>();
              row = new WPRow(cells);
              row.setType(WPCell.CAPTION);
              cell = new WPCell(fScannerPosition);
              cell.setType(WPCell.CAPTION);
              cells.add(cell);
              nextNewlineCell(cell);
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
              cell = null;
              // the caption gets a row of its own; continue with a fresh row
              addTableRow(table, row);
              cells = new ArrayList<WPCell>();
              row = new WPRow(cells);
              break;
            case '}': // end of table - "\n|}"
              addTableRow(table, row);
              return table;
            default:
              fScannerPosition--;
              cell = new WPCell(fScannerPosition);
              cells.add(cell);
            }
            break;
          case '!': // "\n !"
            if (cell != null) {
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
              cell = null;
            }
            ch = fSource[fScannerPosition++];
            cell = new WPCell(fScannerPosition - 1);
            cell.setType(WPCell.TH);
            cells.add(cell);
            break;
          case '{': // "\n {"
            if (fSource[fScannerPosition] == '|') {
              // start of nested table?
              // indexEndOfTable() returns -1 when no end is found; the next
              // array read then throws and is handled as end of input.
              fScannerPosition = indexEndOfTable();
              break;
            }
            break;
          default:
            fScannerPosition--;
          }
          break;
        case '|':
          ch = fSource[fScannerPosition++];
          if (ch == '|') {
            // "||" starts a new cell on the same line
            if (cell != null) {
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
              cell = null;
            }
            cell = new WPCell(fScannerPosition);
            cells.add(cell);
          } else {
            // single '|' inside a cell
            fScannerPosition--;
            if (cell != null) {
              cell.setAttributesStartPos(fScannerPosition - 1);
            }
          }
          break;
        case '!':
          ch = fSource[fScannerPosition++];
          if (ch == '!') {
            // "!!" starts a new header cell on the same line
            if (cell != null) {
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
              cell = null;
            }
            cell = new WPCell(fScannerPosition);
            cell.setType(WPCell.TH);
            cells.add(cell);
          } else {
            fScannerPosition--;
          }
          break;
        default:
          if (cell == null) {
            cell = new WPCell(fScannerPosition - 1);
            cell.setType(WPCell.UNDEFINED);
            cells.add(cell);
          }
        }
      }
    } catch (IndexOutOfBoundsException e) {
      // end of input reached before "|}": close the open cell and row
      fScannerPosition = fSource.length;
      if (cell != null) {
        cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
        cell = null;
      }
      if (table != null && row != null && row.size() > 0) {
        addTableRow(table, row);
      }
    }
    if (table != null) {
      return table;
    }
    return null;
  }

  /**
   * Scan a Trac simple wiki table. The text at the current scanner position
   * must start with <code>||</code>; scanning stops at the first line that
   * does not start with <code>||</code> or at the end of the input.
   *
   * @param tableOfContentTag
   *          not read by this method
   * @return the scanned table or <code>null</code> if the text at the scanner
   *         position does not start with <code>||</code>
   */
  public WPTable tracTable(TableOfContentTag tableOfContentTag) {
    WPTable table = null;
    WPCell cell = null;
    ArrayList<WPCell> cells = new ArrayList<WPCell>();
    WPRow row = new WPRow(cells);
    try {
      if (fScannerPosition < 0) {
        // simulate newline
        fScannerPosition = 0;
      }
      if (fSource[fScannerPosition++] != '|') {
        return null;
      }
      if (fSource[fScannerPosition++] != '|') {
        return null;
      }
      ArrayList<WPRow> rows = new ArrayList<WPRow>();
      table = new WPTable(rows);
      // rewind to the leading "||" so the main loop opens the first cell
      fScannerPosition -= 2;
      char ch = ' ';
      // Reading past the end of fSource throws IndexOutOfBoundsException,
      // which is handled below as "end of input".
      while (true) {
        ch = fSource[fScannerPosition++];
        switch (ch) {
        case '\n':
          addTableRow(table, row);
          // a cell still open at the end of the line is not added to the row
          cell = null;
          cells = new ArrayList<WPCell>();
          row = new WPRow(cells);
          if (fSource[fScannerPosition] != '|' || fSource[fScannerPosition + 1] != '|') {
            return table;
          }
          continue;
        case '|':
          ch = fSource[fScannerPosition++];
          if (ch == '|') {
            if (cell != null) {
              cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
              cells.add(cell);
            }
            cell = new WPCell(fScannerPosition);
          } else {
            fScannerPosition--;
          }
          break;
        }
      }
    } catch (IndexOutOfBoundsException e) {
      // end of input: close the open cell and row
      fScannerPosition = fSource.length;
      if (cell != null) {
        cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
        cells.add(cell);
      }
      if (table != null && row != null && row.size() > 0) {
        addTableRow(table, row);
      }
    }
    if (table != null) {
      return table;
    }
    return null;
  }

  /**
   * Add <code>row</code> to <code>table</code> if the row has parameters set
   * or contains at least one cell.
   */
  private void addTableRow(WPTable table, WPRow row) {
    if (row.getParams() != null) {
      table.add(row);
    } else {
      if (row.size() > 0) {
        table.add(row);
      }
    }
  }

  /**
   * Scan a wiki list starting at the current scanner position. A negative
   * scanner position is treated as if a newline had just been read.
   *
   * @return the scanned list. Scanning stops at the first line that does not
   *         start with one of the <code>WPList</code> marker characters, or
   *         at the end of the input. <code>null</code> is returned only if
   *         the very first read fails.
   */
  public WPList wpList() {
    WPList list = null;
    WPListElement listElement = null;
    int startPosition;
    try {
      char ch;
      char lastCh = ' ';
      char[] sequence = null;
      int count = 0;
      if (fScannerPosition < 0) {
        // simulate newline
        fScannerPosition = 0;
        ch = '\n';
      } else {
        ch = fSource[fScannerPosition++];
      }
      list = new WPList();
      // Reading past the end of fSource throws IndexOutOfBoundsException,
      // which is handled below as "end of input".
      while (true) {
        if (ch == WPList.DL_DD_CHAR && lastCh == WPList.DL_DT_CHAR && sequence != null) {
          // a DD marker inside a line whose marker sequence ended with DT:
          // close the current element and open one with the same sequence
          // whose last marker is replaced by DD
          startPosition = fScannerPosition;
          if (listElement != null) {
            listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
            list.add(listElement);
            listElement = null;
          }
          char[] ddSequence = new char[sequence.length];
          System.arraycopy(sequence, 0, ddSequence, 0, sequence.length);
          ddSequence[sequence.length - 1] = WPList.DL_DD_CHAR;
          sequence = ddSequence;
          int startPos;
          // skip whitespace in front of the element text
          while (true) {
            ch = fSource[fScannerPosition++];
            if (!Character.isWhitespace(ch)) {
              startPos = fScannerPosition - 1;
              listElement = new WPListElement(count, sequence, startPos);
              break;
            }
            if (ch == '\n') {
              fScannerPosition--; // to detect next row
              startPos = fScannerPosition;
              listElement = new WPListElement(count, sequence, startPos);
              listElement.createTagStack(fSource, fWikiModel, startPos);
              list.add(listElement);
              listElement = null;
              break;
            }
          }
          lastCh = ' ';
        }
        if (ch == '\n' || fScannerPosition == 0) {
          // start of a line: close the previous element, read the markers
          startPosition = fScannerPosition;
          if (listElement != null) {
            listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
            list.add(listElement);
            listElement = null;
          }
          ch = fSource[fScannerPosition++];
          switch (ch) {
          case WPList.DL_DD_CHAR:
          case WPList.DL_DT_CHAR:
          case WPList.OL_CHAR:
          case WPList.UL_CHAR:
            count = 1;
            lastCh = ch;
            while (fSource[fScannerPosition] == WPList.UL_CHAR
                || fSource[fScannerPosition] == WPList.OL_CHAR
                || fSource[fScannerPosition] == WPList.DL_DD_CHAR
                || fSource[fScannerPosition] == WPList.DL_DT_CHAR) {
              count++;
              lastCh = fSource[fScannerPosition++];
            }
            // sequence holds the marker characters of this line
            sequence = new char[count];
            System.arraycopy(fSource, fScannerPosition - count, sequence, 0, count);
            int startPos;
            // skip whitespace in front of the element text
            while (true) {
              ch = fSource[fScannerPosition++];
              if (!Character.isWhitespace(ch)) {
                startPos = fScannerPosition - 1;
                listElement = new WPListElement(count, sequence, startPos);
                break;
              }
              if (ch == '\n') {
                fScannerPosition--; // to detect next row
                startPos = fScannerPosition;
                listElement = new WPListElement(count, sequence, startPos);
                listElement.createTagStack(fSource, fWikiModel, startPos);
                list.add(listElement);
                listElement = null;
                break;
              }
            }
            break;
          default:
            // the line does not start with a list marker: the list ends here
            fScannerPosition = startPosition;
            return list;
          }
        }
        if (ch == '<') {
          int temp = readSpecialWikiTags(fScannerPosition);
          if (temp >= 0) {
            fScannerPosition = temp;
            ch = fSource[fScannerPosition++];
            continue;
          }
        }
        ch = fSource[fScannerPosition++];
      }
    } catch (IndexOutOfBoundsException e) {
      // one behind the end, so that "fScannerPosition - 1" below is the
      // end of the source
      fScannerPosition = fSource.length + 1;
    }
    if (list != null) {
      if (listElement != null) {
        listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
        list.add(listElement);
        listElement = null;
      }
      return list;
    }
    return null;
  }

  /**
   * Advance the scanner to the next newline character. There is no bounds
   * check: if no newline follows, the array read past the end of the source
   * throws an <code>IndexOutOfBoundsException</code>.
   *
   * @return the position of the newline; the scanner is left on the newline
   */
  public int nextNewline() {
    while (true) {
      if (fSource[fScannerPosition++] == '\n') {
        return --fScannerPosition;
      }
    }
  }

  /**
   * Advance the scanner to the next newline character, skipping
   * <code>[...]</code> and <code>{...}</code> groups. For every
   * <code>|</code> seen outside such a group the attribute start position of
   * <code>cell</code> is updated. There is no bounds check: if no newline
   * follows, the array read past the end of the source throws an
   * <code>IndexOutOfBoundsException</code>.
   *
   * @param cell
   *          the cell to update, may be <code>null</code>
   * @return the position of the newline; the scanner is left on the newline
   */
  public int nextNewlineCell(WPCell cell) {
    char ch;
    while (true) {
      ch = fSource[fScannerPosition++];
      if (ch == '\n') {
        return --fScannerPosition;
      }
      if (ch == '|') {
        if (cell != null) {
          cell.setAttributesStartPos(fScannerPosition - 1);
        }
      } else if (ch == '[') {
        int position = findNestedEndSingle(fSource, '[', ']', fScannerPosition);
        if (position >= 0) {
          fScannerPosition = position;
        }
      } else if (ch == '{') {
        int cposition = findNestedEndSingle(fSource, '{', '}', fScannerPosition);
        if (cposition >= 0) {
          fScannerPosition = cposition;
        }
      }
    }
  }

  /**
   * Get the offset position behind the next closing HTML comment tag (-->).
   * The scanner position is advanced while searching and is not restored.
   *
   * @return the offset position behind the next closing HTML comment tag or
   *         <code>-1</code> if no tag could be found.
   */
  public int indexEndOfComment() {
    char ch;
    // "length - 2" keeps the two look-ahead reads inside the array
    while (fScannerPosition < fSource.length - 2) {
      ch = fSource[fScannerPosition++];
      if (ch == '-' && fSource[fScannerPosition] == '-' && fSource[fScannerPosition + 1] == '>') {
        return fScannerPosition + 2;
      }
    }
    return -1;
  }

  /**
   * Get the offset position behind the next </nowiki> tag. The scanner
   * position is advanced while searching and is not restored.
   *
   * @return the offset position behind the </nowiki> tag or
   *         <code>-1</code> if no tag could be found.
   */
  public int indexEndOfNowiki() {
    char ch;
    // "length - 8" keeps the eight look-ahead reads inside the array
    while (fScannerPosition < fSource.length - 8) {
      ch = fSource[fScannerPosition++];
      if (ch == '<' && fSource[fScannerPosition] == '/' && fSource[fScannerPosition + 1] == 'n'
          && fSource[fScannerPosition + 2] == 'o' && fSource[fScannerPosition + 3] == 'w'
          && fSource[fScannerPosition + 4] == 'i' && fSource[fScannerPosition + 5] == 'k'
          && fSource[fScannerPosition + 6] == 'i' && fSource[fScannerPosition + 7] == '>') {
        return fScannerPosition + 8;
      }
    }
    return -1;
  }

  /**
   * Get the offset position behind the corresponding wiki table closing tag
   * (i.e. <code>|}</code>). The scanner detects HTML comment tags,
   * <nowiki> tags and nested wiki table tags (i.e.
   * <code>{|... {|... ...|} ...|}</code>).
   * <p>
   * The scanner position is modified while searching; when the end of an
   * HTML comment or of a &lt;nowiki&gt; section cannot be found it is left at
   * <code>-1</code>.
   *
   * @return the offset position behind the corresponding wiki table closing tag
   *         or <code>-1</code> if no corresponding tag could be found.
   */
  public int indexEndOfTable() {
    // check nowiki and html comments
    int nestedWikiTableCounter = 1;
    char ch;
    try {
      while (fScannerPosition < fSource.length) {
        ch = fSource[fScannerPosition++];
        if (ch == '<' && fSource[fScannerPosition] == '!' && fSource[fScannerPosition + 1] == '-'
            && fSource[fScannerPosition + 2] == '-') {
          // start of HTML comment
          fScannerPosition += 3;
          fScannerPosition = indexEndOfComment();
          if (fScannerPosition == (-1)) {
            return -1;
          }
        } else if (ch == '<' && fSource[fScannerPosition] == 'n' && fSource[fScannerPosition + 1] == 'o'
            && fSource[fScannerPosition + 2] == 'w' && fSource[fScannerPosition + 3] == 'i'
            && fSource[fScannerPosition + 4] == 'k' && fSource[fScannerPosition + 5] == 'i'
            && fSource[fScannerPosition + 6] == '>') {
          // start of <nowiki>
          fScannerPosition += 7;
          fScannerPosition = indexEndOfNowiki();
          if (fScannerPosition == (-1)) {
            return -1;
          }
        } else if (ch == '\n' && fSource[fScannerPosition] == '{' && fSource[fScannerPosition + 1] == '|') {
          // assume nested table
          nestedWikiTableCounter++;
        } else if (ch == '\n') {
          int oldPosition = fScannerPosition;
          ch = fSource[fScannerPosition++];
          // ignore SPACES and TABs at the beginning of the line
          while (ch == ' ' || ch == '\t') {
            ch = fSource[fScannerPosition++];
          }
          if (ch == '|' && fSource[fScannerPosition] == '}') {
            if (--nestedWikiTableCounter == 0) {
              return fScannerPosition + 1;
            }
          }
          // only a look-ahead: continue scanning right behind the newline
          fScannerPosition = oldPosition;
        }
      }
    } catch (IndexOutOfBoundsException e) {
      // a look-ahead read ran past the end of the source: no closing tag
    }
    return -1;
  }

  /**
   * <p>
   * Check if a String starts with a specified prefix (optionally case
   * insensitive).
   * </p>
   *
   * @see java.lang.String#startsWith(String)
   * @param str
   *          the String to check, may be null
   * @param toffset
   *          the starting offset of the subregion the String to check
   * @param prefix
   *          the prefix to find, may be null
   * @param ignoreCase
   *          indicates whether the compare should ignore case (case
   *          insensitive) or not.
* @return <code>true</code> if the String starts with the prefix or both * <code>null</code> */ public static boolean startsWith(String str, int toffset, String prefix, boolean ignoreCase) { if (str == null || prefix == null) { return (str == null && prefix == null); } if (prefix.length() > str.length() - toffset) { return false; } return str.regionMatches(ignoreCase, toffset, prefix, 0, prefix.length()); } public void scanWhiteSpace() { while (Character.isWhitespace(fSource[fScannerPosition++])) { } --fScannerPosition; } /** * Replace the wiki template parameters in the given template string * * @param template * @param templateParameters * @return <code>null</code> if no replacement could be found */ public StringBuilder replaceTemplateParameters(String template, Map<String, String> templateParameters) { StringBuilder buffer = null; int bufferStart = 0; try { int level = fWikiModel.incrementRecursionLevel(); if (level > Configuration.PARSER_RECURSION_LIMIT) { return null; // no further processing } char ch; int parameterStart = -1; StringBuilder recursiveResult; boolean isDefaultValue; while (true) { ch = fSource[fScannerPosition++]; if (ch == '{' && fSource[fScannerPosition] == '{' && fSource[fScannerPosition + 1] == '{' && fSource[fScannerPosition + 2] != '{') { fScannerPosition += 2; parameterStart = fScannerPosition; int temp[] = findNestedParamEnd(fSource, parameterStart); if (temp[0] >= 0) { fScannerPosition = temp[0]; List<String> list = splitByPipe(fSource, parameterStart, fScannerPosition - 3, null); if (list.size() > 0) { String parameterString = list.get(0).trim(); String value = null; isDefaultValue = false; if (templateParameters != null) { value = templateParameters.get(parameterString); } if (value == null && list.size() > 1) { // default value is available for the template value = list.get(1); isDefaultValue = true; } if (value != null) { if (value.length() <= Configuration.TEMPLATE_VALUE_LIMIT) { if (buffer == null) { buffer = new 
StringBuilder(template.length() + 128); } if (bufferStart < fScannerPosition) { buffer.append(fSource, bufferStart, parameterStart - bufferStart - 3); } WikipediaScanner scanner = new WikipediaScanner(value); scanner.setModel(fWikiModel); if (isDefaultValue) { recursiveResult = scanner.replaceTemplateParameters( value, templateParameters); } else { recursiveResult = scanner.replaceTemplateParameters(value, null); } if (recursiveResult != null) { buffer.append(recursiveResult); } else { buffer.append(value); } bufferStart = fScannerPosition; } } } fScannerPosition = temp[0]; parameterStart = -1; } } if (buffer != null && buffer.length() > Configuration.TEMPLATE_BUFFER_LIMIT) { // Controls the scanner, when infinite recursion occurs the // buffer grows out of control. return buffer; } } } catch (IndexOutOfBoundsException e) { // ignore } finally { fWikiModel.decrementRecursionLevel(); } if (buffer != null && bufferStart < fScannerPosition) { buffer.append(fSource, bufferStart, fScannerPosition - bufferStart - 1); } return buffer; } /** * Split the given src string by pipe symbol (i.e. "|") * * @param sourceString * @param resultList * the list which contains the splitted strings * @return */ public static List<String> splitByPipe(String sourceString, List<String> resultList) { // TODO optimize this to avoid new char[] generation inside toCharArray() ? return splitByPipe( sourceString.toCharArray(), 0, sourceString.length(), resultList); } /** * Split the given <code>srcArray</code> character array by pipe symbol (i.e. 
* "|") * * @param srcArray * @param currOffset * @param endOffset * @param resultList * the list which contains the splitted strings * @return */ public static List<String> splitByPipe(char[] srcArray, int currOffset, int endOffset, List<String> resultList) { if (resultList == null) { resultList = new ArrayList<String>(); } char ch; int[] temp = new int[] { -1, -1 }; int lastOffset = currOffset; try { while (currOffset < endOffset) { ch = srcArray[currOffset++]; if (ch == '[' && srcArray[currOffset] == '[') { currOffset++; temp[0] = findNestedEnd(srcArray, '[', ']', currOffset); if (temp[0] >= 0) { currOffset = temp[0]; } } else if (ch == '{' && srcArray[currOffset] == '{') { currOffset++; if (srcArray[currOffset] == '{' && srcArray[currOffset + 1] != '{') { currOffset++; temp = findNestedParamEnd(srcArray, currOffset); if (temp[0] >= 0) { currOffset = temp[0]; } } else { temp[0] = findNestedTemplateEnd(srcArray, currOffset); if (temp[0] >= 0) { currOffset = temp[0]; } } } else if (ch == '|') { resultList.add(new String(srcArray, lastOffset, currOffset - lastOffset - 1)); lastOffset = currOffset; } } if (currOffset > lastOffset) { resultList .add(new String(srcArray, lastOffset, currOffset - lastOffset)); } else if (currOffset == lastOffset) { resultList.add(""); } } catch (IndexOutOfBoundsException e) { if (currOffset > lastOffset) { resultList .add(new String(srcArray, lastOffset, currOffset - lastOffset)); } else if (currOffset == lastOffset) { resultList.add(""); } } return resultList; } /** * Read until the end of a nested block i.e. 
something like * <code>[[...[[ ]]...]]</code> * * @param sourceArray * @param startCh * @param endChar * @param startPosition * @return the position of the nested end charcters or <code>-1</code> if not * found */ public static int findNestedEnd(final char[] sourceArray, final char startCh, final char endChar, int startPosition) { char ch; int level = 1; int position = startPosition; final int sourceArrayLength = sourceArray.length - 1; try { while (position < sourceArrayLength) { ch = sourceArray[position++]; if (ch == startCh && sourceArray[position] == startCh) { position++; level++; } else if (ch == endChar && sourceArray[position] == endChar) { position++; if (--level == 0) { return position; } } } return -1; } catch (IndexOutOfBoundsException e) { return -1; } } /** * Read until the end of a nested block i.e. something like * <code>{{{...{...{{ }}...}...}}}</code> * * @param sourceArray * @param startCh * @param endChar * @param startPosition * @return the position of the nested end charcters or <code>-1</code> if not * found */ public static int findNestedEndSingle(final char[] sourceArray, final char startCh, final char endChar, int startPosition) { char ch; int level = 1; int position = startPosition; final int sourceArrayLength = sourceArray.length; try { while (position < sourceArrayLength) { ch = sourceArray[position++]; if (ch == startCh) { level++; } else if (ch == endChar) { if (--level == 0) { return position; } } } return -1; } catch (IndexOutOfBoundsException e) { return -1; } } public static int findNestedTemplateEnd(final char[] sourceArray, int startPosition) { char ch; int countSingleOpenBraces = 0; int position = startPosition; try { while (position < sourceArray.length) { ch = sourceArray[position++]; if (ch == '{') { countSingleOpenBraces++; } else if (ch == '}') { if (countSingleOpenBraces > 0) { countSingleOpenBraces--; } else { if (sourceArray[position] == '}') { // template ending position++; return position; } } } } return -1; } catch 
(IndexOutOfBoundsException e) { return -1; } } /** * Find the end of a template parameter declaration or the end of a template * declaration. * * @param sourceArray * @param startPosition * @return an array of two integers. If <code>array[0] > 0</code> the scanner * has found the end position of a template parameter declaration. If * <code>array[1] > 0</code> the scanner has found the end position of * a template declaration. */ public static int[] findNestedParamEnd(final char[] sourceArray, int startPosition) { char ch; final int sourceArrayLength = sourceArray.length; int countSingleOpenBraces = 0; int parameterPosition = startPosition; try { while (parameterPosition < sourceArrayLength) { ch = sourceArray[parameterPosition++]; if (ch == '{') { if ((sourceArrayLength > parameterPosition) && sourceArray[parameterPosition] == '{') { parameterPosition++; if ((sourceArrayLength > parameterPosition) && sourceArray[parameterPosition] == '{' && sourceArray[parameterPosition + 1] != '{') { // template parameter beginning parameterPosition++; int[] temp = findNestedParamEnd(sourceArray, parameterPosition); if (temp[0] >= 0) { parameterPosition = temp[0]; } else { if (temp[1] >= 0) { parameterPosition = temp[1]; } else { return new int[] { -1, -1 }; } } } else { // template beginning int temp = findNestedTemplateEnd(sourceArray, parameterPosition); if (temp < 0) { return new int[] { -1, -1 }; } parameterPosition = temp; } } else { countSingleOpenBraces++; } } else if (ch == '}') { if (countSingleOpenBraces > 0) { countSingleOpenBraces--; } else { if ((sourceArrayLength > parameterPosition) && sourceArray[parameterPosition] == '}') { if (sourceArray[parameterPosition + 1] == '}') { // template parameter ending return new int[] { parameterPosition + 2, -1 }; } else { return new int[] { -1, parameterPosition + 1 }; } } } } } return new int[] { -1, -1 }; } catch (IndexOutOfBoundsException e) { return new int[] { -1, -1 }; } } /** * Parse a tag. 
Parse the name and attributes from a start tag. * <p> * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2"> * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a> * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2 * <p> * <cite> 3.2.2 Attributes * <p> * Elements may have associated properties, called attributes, which may have * values (by default, or set by authors or scripts). Attribute/value pairs * appear before the final ">" of an element's start tag. Any number of * (legal) attribute value pairs, separated by spaces, may appear in an * element's start tag. They may appear in any order. * <p> * In this example, the id attribute is set for an H1 element: <code> * <H1 id="section1"> * </code> This is an identified heading thanks to the id attribute <code> * </H1> * </code> By default, SGML requires that all attribute values be delimited * using either double quotation marks (ASCII decimal 34) or single quotation * marks (ASCII decimal 39). Single quote marks can be included within the * attribute value when the value is delimited by double quote marks, and vice * versa. Authors may also use numeric character references to represent * double quotes (&#34;) and single quotes (&#39;). For doublequotes * authors can also use the character entity reference &quot;. * <p> * In certain cases, authors may specify the value of an attribute without any * quotation marks. The attribute value may only contain letters (a-z and * A-Z), digits (0-9), hyphens (ASCII decimal 45), periods (ASCII decimal 46), * underscores (ASCII decimal 95), and colons (ASCII decimal 58). We recommend * using quotation marks even when it is possible to eliminate them. * <p> * Attribute names are always case-insensitive. * <p> * Attribute values are generally case-insensitive. The definition of each * attribute in the reference manual indicates whether its value is * case-insensitive. 
* <p>
   * All the attributes defined by this specification are listed in the
   * attribute index.
   * <p>
   * </cite>
   * <p>
   * This method uses a state machine with the following states:
   * <ol>
   * <li>state 0 - outside of any attribute</li>
   * <li>state 1 - within attribute name</li>
   * <li>state 2 - equals hit</li>
   * <li>state 3 - within naked attribute value.</li>
   * <li>state 4 - within single quoted attribute value</li>
   * <li>state 5 - within double quoted attribute value</li>
   * <li>state 6 - whitespaces after attribute name could lead to state 2 (=)
   * or state 0</li>
   * </ol>
   * <p>
   * The starting point for the various components is stored in an array of
   * integers that match the initiation point for the states one-for-one, i.e.
   * bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 began,
   * etc. Attributes are stored in a list having one slot for each whitespace
   * or attribute/value pair. The first slot is for attribute name (kind of
   * like a standalone attribute).
   *
   * @param start
   *          The position at which to start scanning.
   * @return The parsed tag.
   * @exception ParserException
   *              If a problem occurs reading from the source.
*/ protected WikiTagNode parseTag(int start) { boolean done; char ch; int state; int[] bookmarks; done = false; ArrayList<NodeAttribute> attributes = new ArrayList<NodeAttribute>(); state = 0; fScannerPosition = start; bookmarks = new int[8]; bookmarks[0] = fScannerPosition; try { while (!done) { bookmarks[state + 1] = fScannerPosition; ch = fSource[fScannerPosition++]; switch (state) { case 0: // outside of any attribute if ((EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle bookmarks[state + 1] = --fScannerPosition; } whitespace(attributes, bookmarks); done = true; } else if (!Character.isWhitespace(ch)) { whitespace(attributes, bookmarks); state = 1; } break; case 1: // within attribute name if ((EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle bookmarks[state + 1] = --fScannerPosition; } standalone(attributes, bookmarks); done = true; } else if (Character.isWhitespace(ch)) { // whitespaces might be followed by next attribute or an // equal sign // see Bug #891058 Bug in lexer. bookmarks[6] = bookmarks[2]; // setting the // bookmark[0] // is done in state 6 if // applicable state = 6; } else if ('=' == ch) { state = 2; } break; case 2: // equals hit if ((EOF == ch) || ('>' == ch)) { empty(attributes, bookmarks); done = true; } else if ('\'' == ch) { state = 4; bookmarks[4] = bookmarks[3]; } else if ('"' == ch) { state = 5; bookmarks[5] = bookmarks[3]; } else if (Character.isWhitespace(ch)) { // collect white spaces after "=" into the assignment // string; // do nothing // see Bug #891058 Bug in lexer. 
} else { state = 3; } break; case 3: // within naked attribute value if ((EOF == ch) || ('>' == ch)) { naked(attributes, bookmarks); done = true; } else if (Character.isWhitespace(ch)) { naked(attributes, bookmarks); bookmarks[0] = bookmarks[4]; state = 0; } else if (ch == '/' && fSource[fScannerPosition] == '>') { naked(attributes, bookmarks); bookmarks[0] = bookmarks[4]; fScannerPosition--; state = 0; } break; case 4: // within single quoted attribute value if (EOF == ch) { single_quote(attributes, bookmarks); done = true; // complain? } else if ('\'' == ch) { single_quote(attributes, bookmarks); bookmarks[0] = bookmarks[5] + 1; state = 0; } break; case 5: // within double quoted attribute value if (EOF == ch) { double_quote(attributes, bookmarks); done = true; // complain? } else if ('"' == ch) { double_quote(attributes, bookmarks); bookmarks[0] = bookmarks[6] + 1; state = 0; } break; case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name if (EOF == ch) { // same as last else clause standalone(attributes, bookmarks); bookmarks[0] = bookmarks[6]; // mPage.ungetCharacter(mCursor); --fScannerPosition; state = 0; } else if (Character.isWhitespace(ch)) { // proceed } else if ('=' == ch) {// yepp. the white spaces belonged to the // equal. bookmarks[2] = bookmarks[6]; bookmarks[3] = bookmarks[7]; state = 2; } else { // white spaces were not ended by equal // meaning the attribute was a stand alone attribute // now: create the stand alone attribute and rewind // the cursor to the end of the white spaces // and restart scanning as whitespace attribute. 
standalone(attributes, bookmarks);
            bookmarks[0] = bookmarks[6];
            --fScannerPosition;
            state = 0;
          }
          break;
        default:
          throw new IllegalStateException("how did we get in state " + state);
        }
      }
      if (fSource[fScannerPosition - 1] != '>') {
        fScannerPosition = start;
        return null;
      }
      return (makeTag(start, fScannerPosition, attributes));
    } catch (IndexOutOfBoundsException e) {
      if (state == 3) { // within naked attribute value
        naked(attributes, bookmarks);
      }
    }
    fScannerPosition = start;
    return null;
  }

  /**
   * Scan the characters of the source between <code>start</code> (inclusive)
   * and <code>end</code> (exclusive) and collect the attributes found there.
   * <p>
   * The scan is a small state machine. The states are:
   * <ul>
   * <li>0 - outside of any attribute</li>
   * <li>1 - within an attribute name</li>
   * <li>2 - an equal sign was read</li>
   * <li>3 - within a naked (unquoted) attribute value</li>
   * <li>4 - within a single quoted attribute value</li>
   * <li>5 - within a double quoted attribute value</li>
   * <li>6 - white space was read after an attribute name; the next character
   * decides whether the name stands alone or is followed by an equal sign</li>
   * </ul>
   * Before every character is read, <code>bookmarks[state + 1]</code> is set to
   * the position of that character, so each entry remembers where the
   * corresponding state was last active. The helper methods
   * (<code>standalone</code>, <code>empty</code>, <code>naked</code>,
   * <code>single_quote</code>, <code>double_quote</code>) read these entries to
   * build the attribute.
   * <p>
   * As a side effect the scanner position is set to <code>start</code> and then
   * advanced while scanning.
   * <p>
   * NOTE(review): <code>ch</code> is a <code>char</code> and <code>EOF</code>
   * is <code>-1</code>, so the <code>EOF == ch</code> comparisons below cannot
   * match; the loop ends through its <code>fScannerPosition &lt; end</code>
   * bound instead.
   *
   * @param start
   *          the position at which scanning starts
   * @param end
   *          the position at which scanning stops (exclusive)
   * @return the list of attributes found, or <code>null</code> if an
   *         <code>IndexOutOfBoundsException</code> occurred while scanning
   */
  protected List<NodeAttribute> parseAttributes(int start, int end) {
    boolean done;
    char ch;
    int state;
    int[] bookmarks;

    done = false;
    ArrayList<NodeAttribute> attributes = new ArrayList<NodeAttribute>();
    state = 0;
    fScannerPosition = start;
    bookmarks = new int[8];
    bookmarks[0] = fScannerPosition;
    try {
      while (!done && fScannerPosition < end) {
        // remember where the current state saw its latest character
        bookmarks[state + 1] = fScannerPosition;
        ch = fSource[fScannerPosition++];
        switch (state) {
        case 0: // outside of any attribute
          if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
            if ('<' == ch) {
              // don't consume the opening angle
              bookmarks[state + 1] = --fScannerPosition;
            }
            whitespace(attributes, bookmarks);
            done = true;
          } else if (!Character.isWhitespace(ch)) {
            // first character of an attribute name
            whitespace(attributes, bookmarks);
            state = 1;
          }
          break;
        case 1: // within attribute name
          if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
            if ('<' == ch) {
              // don't consume the opening angle
              bookmarks[state + 1] = --fScannerPosition;
            }
            standalone(attributes, bookmarks);
            done = true;
          } else if (Character.isWhitespace(ch)) {
            // white space might be followed by the next attribute or by an
            // equal sign
            // see Bug #891058 Bug in lexer.
            bookmarks[6] = bookmarks[2]; // setting bookmarks[0] is done in
            // state 6 if applicable
            state = 6;
          } else if ('=' == ch) {
            state = 2;
          }
          break;
        case 2: // equals hit
          if ((EOF == ch) || ('>' == ch)) {
            empty(attributes, bookmarks);
            done = true;
          } else if ('\'' == ch) {
            state = 4;
            // remember the position of the opening single quote
            bookmarks[4] = bookmarks[3];
          } else if ('"' == ch) {
            state = 5;
            // remember the position of the opening double quote
            bookmarks[5] = bookmarks[3];
          } else if (Character.isWhitespace(ch)) {
            // collect white space after "=" into the assignment string;
            // do nothing
            // see Bug #891058 Bug in lexer.
          } else {
            state = 3;
          }
          break;
        case 3: // within naked attribute value
          if ((EOF == ch) || ('>' == ch)) {
            naked(attributes, bookmarks);
            done = true;
          } else if (Character.isWhitespace(ch)) {
            naked(attributes, bookmarks);
            bookmarks[0] = bookmarks[4];
            state = 0;
          }
          break;
        case 4: // within single quoted attribute value
          if (EOF == ch) {
            single_quote(attributes, bookmarks);
            done = true; // complain?
          } else if ('\'' == ch) {
            single_quote(attributes, bookmarks);
            bookmarks[0] = bookmarks[5] + 1;
            state = 0;
          }
          break;
        case 5: // within double quoted attribute value
          if (EOF == ch) {
            double_quote(attributes, bookmarks);
            done = true; // complain?
            // disabled handling of an escaped double quote:
            // } else if ('\\' == ch && fSource[fScannerPosition] == '"') {
            // fScannerPosition++;
          } else if ('"' == ch) {
            double_quote(attributes, bookmarks);
            bookmarks[0] = bookmarks[6] + 1;
            state = 0;
          }
          break;
        // patch for lexer state correction by
        // Gernot Fricke
        // See Bug # 891058 Bug in lexer.
        case 6: // undecided for state 0 or 2
          // we have read white space after an attribute name
          if (EOF == ch) {
            // same as last else clause
            standalone(attributes, bookmarks);
            bookmarks[0] = bookmarks[6];
            // mPage.ungetCharacter(mCursor);
            --fScannerPosition;
            state = 0;
          } else if (Character.isWhitespace(ch)) {
            // proceed
          } else if ('=' == ch) {
            // yes, the white space belonged to the equal sign:
            // restore the end of the name and record where the '=' was read
            bookmarks[2] = bookmarks[6];
            bookmarks[3] = bookmarks[7];
            state = 2;
          } else {
            // the white space was not ended by an equal sign, meaning the
            // attribute was a stand alone attribute.
            // now: create the stand alone attribute, rewind the cursor to
            // the end of the white space and restart scanning in state 0.
            standalone(attributes, bookmarks);
            bookmarks[0] = bookmarks[6];
            --fScannerPosition;
            state = 0;
          }
          break;
        default:
          throw new IllegalStateException("how did we get in state " + state);
        }
      }
      // The range ended while a value was still open.
      // NOTE(review): naked() reads bookmarks[3] and bookmarks[4], but for
      // the quoted states 4 and 5 the line below updates bookmarks[5] or
      // bookmarks[6] instead - confirm that an unterminated quoted value is
      // really meant to be reported through naked().
      if (state == 3 || state == 4 || state == 5) { // within naked attribute value
        bookmarks[state + 1] = fScannerPosition;
        naked(attributes, bookmarks);
      }
      return attributes;
    } catch (IndexOutOfBoundsException e) {
      // deliberately ignored: falls through to return null below
    }
    return null;
  }

  /**
   * Create a tag node for the source range between <code>start</code> and
   * <code>end</code>.
   *
   * @param start
   *          The starting point of the node.
   * @param end
   *          The ending point of the node.
   * @param attributes
   *          The attributes parsed from the tag.
   * @return The new tag node, or <code>null</code> if the range is shorter
   *         than two characters.
   */
  protected WikiTagNode makeTag(int start, int end, ArrayList<NodeAttribute> attributes) {
    int length;

    length = end - start;
    if (0 != length) { // return tag based on second character, '/', '%',
      // Letter (ch), '!'
      if (2 > length) {
        // this is an error
        return null; // (makeString(start, end));
      }
      return new WikiTagNode(start, end, attributes);
    }
    // empty range: nothing to create
    return null;
  }

  /**
   * Generate a whitespace 'attribute'. The implementation is currently
   * disabled: the body contains only commented-out code, so calling this
   * method has no effect.
   *
   * @param attributes
   *          The list so far.
   * @param bookmarks
   *          The array of positions.
   */
  private void whitespace(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
    // if (bookmarks[1] > bookmarks[0])
    // attributes.addElement(new PageAttribute(fSource,-1, -1, bookmarks[0],
    // bookmarks[1], (char) 0));
  }

  /**
   * Generate a standalone attribute -- font.
   *
   * @param attributes
   *          The list so far.
   * @param bookmarks
   *          The array of positions.
*/ private void standalone(ArrayList<NodeAttribute> attributes, int[] bookmarks) { attributes.add(new NodeAttribute( fSource, bookmarks[1], bookmarks[2], -1, -1, (char) 0)); } /** * Generate an empty attribute -- color=. * * @param attributes * The list so far. * @param bookmarks * The array of positions. */ private void empty(ArrayList<NodeAttribute> attributes, int[] bookmarks) { attributes.add(new NodeAttribute( fSource, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char) 0)); } /** * Generate an unquoted attribute -- size=1. * * @param attributes * The list so far. * @param bookmarks * The array of positions. */ private void naked(ArrayList<NodeAttribute> attributes, int[] bookmarks) { attributes.add(new NodeAttribute( fSource, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char) 0)); } /** * Generate an single quoted attribute -- width='100%'. * * @param attributes * The list so far. * @param bookmarks * The array of positions. */ private void single_quote(ArrayList<NodeAttribute> attributes, int[] bookmarks) { attributes.add(new NodeAttribute( fSource, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\'')); } /** * Generate an double quoted attribute -- CONTENT="Test Development". * * @param attributes * The list so far. * @param bookmarks * The array of positions. 
*/ private void double_quote(ArrayList<NodeAttribute> attributes, int[] bookmarks) { attributes.add(new NodeAttribute( fSource, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"')); } protected int readSpecialWikiTags(int start) { int startPosition = fScannerPosition; try { if (fSource[start] != '/') { // starting tag WikiTagNode tagNode = parseTag(start); if (tagNode != null && !tagNode.isEmptyXmlTag()) { String tagName = tagNode.getTagName(); if (tagName.equals("nowiki")) { return readUntilIgnoreCase(fScannerPosition, "</", "nowiki>"); } else if (tagName.equals("source")) { return readUntilIgnoreCase(fScannerPosition, "</", "source>"); } else if (tagName.equals("math")) { return readUntilIgnoreCase(fScannerPosition, "</", "math>"); } else if (tagName.equals("span")) { return readUntilIgnoreCase(fScannerPosition, "</", "span>"); // <div> could be nested ? // } else if (tagName.equals("div")) { // return readUntilIgnoreCase(fScannerPosition, "</", "div>"); } } } } catch (IndexOutOfBoundsException e) { // do nothing } fScannerPosition = startPosition; return -1; } /** * Read the characters until the concatenated <i>start</i> and <i>end</i> * substring is found. The end substring is matched ignoring case * considerations. * * @param startString * the start string which should be searched in exact case mode * @param endString * the end string which should be searched in ignore case mode * @return */ protected final int readUntilIgnoreCase(int start, String startString, String endString) { int index = Util.indexOfIgnoreCase(fStringSource, startString, endString, start); if (index != (-1)) { return index + startString.length() + endString.length(); } return -1; } /** * Read the characters until no more letters are found or the given * <code>testChar</code> is found. If <code>testChar</code> was found, return * the offset position. 
*
   * @param testChar
   *          the character to look for
   * @param fromIndex
   *          the offset in the source at which scanning starts
   * @return the offset of <code>testChar</code>, or <code>-1</code> if a
   *         different non-letter character or the end of the source is
   *         reached first
   */
  protected int indexOfUntilNoLetter(char testChar, int fromIndex) {
    for (int position = fromIndex; position < fSource.length; position++) {
      final char current = fSource[position];
      // the target is checked first, so it is found even if it is a letter
      if (current == testChar) {
        return position;
      }
      if (!Character.isLetter(current)) {
        return -1;
      }
    }
    return -1;
  }

  /**
   * Pop nodes from the wiki model's token stack to close the given tag.
   * <p>
   * Nodes are popped one by one until either
   * <ul>
   * <li>the node on top has the same name as <code>node</code>; that node is
   * popped as well and the method stops, or</li>
   * <li><code>node.getParents()</code> is not <code>null</code> and contains
   * the top node's name enclosed in <code>|</code> characters; that node is
   * left on the stack and the method stops, or</li>
   * <li>the stack is empty.</li>
   * </ul>
   *
   * @param node
   *          the tag whose name and allowed parents control the reduction
   */
  protected void reduceStackUntilToken(TagToken node) {
    final String allowedParents = node.getParents();
    while (fWikiModel.stackSize() > 0) {
      final TagToken top = fWikiModel.peekNode();
      if (node.getName().equals(top.getName())) {
        // found the matching node: remove it and finish
        fWikiModel.popNode();
        return;
      }
      if (allowedParents != null && allowedParents.indexOf("|" + top.getName() + "|") >= 0) {
        // the top node is an allowed parent: keep it and finish
        return;
      }
      fWikiModel.popNode();
    }
  }
}