package info.bliki.wiki.filter;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.ITableOfContent;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.TableOfContentTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.WikiTagNode;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import net.fortuna.ical4j.data.ParserException;
public class WikipediaScanner {
/**
* Constant holding the literal string <code>$TAG_NAME</code>.
*/
public final static String TAG_NAME = "$TAG_NAME";
/**
* Return value when the source is exhausted. Has a value of <code>-1</code>.
*/
public static final int EOF = -1;
/**
* Current read offset into {@link #fSource}; the scan methods advance it as
* they consume characters.
*/
protected int fScannerPosition;
/**
* The wiki model that is handed on to the table cell, list element and
* template helpers; <code>null</code> until {@link #setModel(IWikiModel)} is
* called.
*/
protected IWikiModel fWikiModel = null;
/**
* The <code>String</code> of the given raw wiki text
*/
protected final String fStringSource;
/**
* The corresponding <code>char[]</code> array for the string source
*/
protected final char[] fSource;
/**
 * Create a scanner over the given raw wiki text which starts scanning at
 * offset <code>0</code>.
 *
 * @param src
 *          the raw wiki text
 */
public WikipediaScanner(String src) {
    this(src, 0);
}

/**
 * Create a scanner over the given raw wiki text which starts scanning at the
 * given offset.
 *
 * @param src
 *          the raw wiki text
 * @param position
 *          the initial scanner position
 */
public WikipediaScanner(String src, int position) {
    this.fStringSource = src;
    this.fSource = src.toCharArray();
    this.fScannerPosition = position;
}
/**
 * Set the wiki model which is passed on to the helpers created while
 * scanning.
 *
 * @param wikiModel
 *          the wiki model to use
 */
public void setModel(IWikiModel wikiModel) {
    this.fWikiModel = wikiModel;
}
/**
 * Get the current scanner position.
 *
 * @return the current offset into the source
 */
public int getPosition() {
    return this.fScannerPosition;
}
/**
 * Move the scanner to a new position. The value is stored as given, without
 * any range check.
 *
 * @param newPos
 *          the new offset into the source
 */
public void setPosition(int newPos) {
    this.fScannerPosition = newPos;
}
/**
 * Scan a <code>{code}...{code}</code> block which starts at the current
 * scanner position.
 * <p>
 * The scanner position is advanced while the opening tag is compared, even if
 * the comparison fails. On success the position is advanced by
 * <code>"{code}".length()</code> counted from the character that follows the
 * opening brace of the closing tag. If the end of the source is reached
 * without finding a closing tag, the position is set to the source length.
 *
 * @return the scanned code block, or <code>null</code> if the source does not
 *         start with <code>{code}</code> at the current position or no closing
 *         <code>{code}</code> tag exists
 */
public WPCodeBlock codeBlock() {
    try {
        if (fScannerPosition < 0) {
            // simulate newline
            fScannerPosition = 0;
        }
        if (fSource[fScannerPosition++] != '{') {
            return null;
        }
        if (fSource[fScannerPosition++] != 'c'
                || fSource[fScannerPosition++] != 'o'
                || fSource[fScannerPosition++] != 'd'
                || fSource[fScannerPosition++] != 'e') {
            return null;
        }
        if (fSource[fScannerPosition++] != '}') {
            return null;
        }
        final int contentsStart = fScannerPosition;
        while (true) {
            // running past the end of the array terminates the loop through the
            // IndexOutOfBoundsException handler below
            if (fSource[fScannerPosition++] != '{') {
                continue;
            }
            final int closingBrace =
                findNestedEndSingle(fSource, '{', '}', fScannerPosition);
            if (closingBrace < 0) {
                continue;
            }
            // copy only the text between the braces instead of converting the
            // whole source array into a new String for every '{'
            final String block =
                new String(fSource, fScannerPosition, closingBrace - 1 - fScannerPosition);
            if ("code".equals(block)) {
                // the contents end right before the '{' of the closing tag
                final int contentsEnd = fScannerPosition - 1;
                final WPCodeBlock code = new WPCodeBlock();
                code.setContents(new String(fSource, contentsStart, contentsEnd - contentsStart));
                // remove end {code} tag
                fScannerPosition = fScannerPosition + "{code}".length();
                return code;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        fScannerPosition = fSource.length;
    }
    return null;
}
/**
 * Scan a <code>{quote}...{quote}</code> block which starts at the current
 * scanner position.
 * <p>
 * The scanner position is advanced while the opening tag is compared, even if
 * the comparison fails. On success the position is advanced by
 * <code>"{quote}".length()</code> counted from the character that follows the
 * opening brace of the closing tag. If the end of the source is reached
 * without finding a closing tag, the position is set to the source length.
 *
 * @return the scanned block quote, or <code>null</code> if the source does
 *         not start with <code>{quote}</code> at the current position or no
 *         closing <code>{quote}</code> tag exists
 */
public WPBlockQuote blockQuote() {
    try {
        if (fScannerPosition < 0) {
            // simulate newline
            fScannerPosition = 0;
        }
        if (fSource[fScannerPosition++] != '{') {
            return null;
        }
        if (fSource[fScannerPosition++] != 'q'
                || fSource[fScannerPosition++] != 'u'
                || fSource[fScannerPosition++] != 'o'
                || fSource[fScannerPosition++] != 't'
                || fSource[fScannerPosition++] != 'e') {
            return null;
        }
        if (fSource[fScannerPosition++] != '}') {
            return null;
        }
        final int contentsStart = fScannerPosition;
        while (true) {
            // running past the end of the array terminates the loop through the
            // IndexOutOfBoundsException handler below
            if (fSource[fScannerPosition++] != '{') {
                continue;
            }
            final int closingBrace =
                findNestedEndSingle(fSource, '{', '}', fScannerPosition);
            if (closingBrace < 0) {
                continue;
            }
            // copy only the text between the braces instead of converting the
            // whole source array into a new String for every '{'
            final String block =
                new String(fSource, fScannerPosition, closingBrace - 1 - fScannerPosition);
            if ("quote".equals(block)) {
                // the contents end right before the '{' of the closing tag
                final int contentsEnd = fScannerPosition - 1;
                final WPBlockQuote quote = new WPBlockQuote();
                quote.setContents(new String(fSource, contentsStart, contentsEnd - contentsStart));
                // remove end {quote} tag
                fScannerPosition = fScannerPosition + "{quote}".length();
                return quote;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        fScannerPosition = fSource.length;
    }
    return null;
}
/**
* Scan a wikipedia table.
* <p>
* Scanning starts at the current scanner position, which has to point at the
* opening <code>{|</code> of the table. The rest of that line is stored as the
* table parameters. Afterwards the source is read character by character
* until the closing <code>|}</code> at the beginning of a line is found.
* <p>
* The end of the source is detected through the
* <code>IndexOutOfBoundsException</code> thrown by the array accesses: in that
* case the pending cell and row are finished and the table scanned so far is
* returned.
*
* See: <a href="http://meta.wikimedia.org/wiki/Help:Table">Help - Table</a>
*
* @param tableOfContentTag
*          not referenced by this method
* @return <code>null</code> if no wiki table was found
*/
public WPTable wpTable(ITableOfContent tableOfContentTag) {
WPTable table = null;
WPCell cell = null;
ArrayList<WPCell> cells = new ArrayList<WPCell>();
WPRow row = new WPRow(cells);
try {
if (fScannerPosition < 0) {
// simulate newline
fScannerPosition = 0;
}
// the table has to start with "{|"; note that the position stays advanced
// when one of these checks fails
if (fSource[fScannerPosition++] != '{') {
return null;
}
if (fSource[fScannerPosition++] != '|') {
return null;
}
ArrayList<WPRow> rows = new ArrayList<WPRow>();
table = new WPTable(rows);
int startPos = fScannerPosition;
// read parameters until end of line
nextNewline();
table.setParams(fStringSource.substring(startPos, fScannerPosition));
char ch = ' ';
while (true) {
ch = fSource[fScannerPosition++];
switch (ch) {
case '[':
// jump behind the matching ']' so that the characters in between are not
// interpreted as table syntax
int position =
findNestedEndSingle(fSource, '[', ']', fScannerPosition);
if (position >= 0) {
fScannerPosition = position;
continue;
}
break;
case '{':
// jump behind the matching '}' so that the characters in between are not
// interpreted as table syntax
int cposition =
findNestedEndSingle(fSource, '{', '}', fScannerPosition);
if (cposition >= 0) {
fScannerPosition = cposition;
continue;
}
break;
case '\n':
ch = fSource[fScannerPosition++];
// ignore whitespace at the beginning of the line
while (ch == ' ' || ch == '\t') {
ch = fSource[fScannerPosition++];
}
switch (ch) {
case '|': // "\n |"
// a new line starting with '|' ends the currently open cell
if (cell != null) {
cell.createTagStack(
table,
fSource,
fWikiModel,
fScannerPosition - 2);
cell = null;
}
ch = fSource[fScannerPosition++];
switch (ch) {
case '-': // new row - "\n|-"
addTableRow(table, row);
cells = new ArrayList<WPCell>();
row = new WPRow(cells);
startPos = fScannerPosition;
// cell is always null at this point, so this call only advances the
// scanner to the end of the line
nextNewlineCell(cell);
row.setParams(fStringSource.substring(
startPos,
fScannerPosition));
break;
case '+': // new row - "\n|+"
// the caption is stored as a row of its own which contains a single
// caption cell
addTableRow(table, row);
cells = new ArrayList<WPCell>();
row = new WPRow(cells);
row.setType(WPCell.CAPTION);
cell = new WPCell(fScannerPosition);
cell.setType(WPCell.CAPTION);
cells.add(cell);
nextNewlineCell(cell);
cell.createTagStack(
table,
fSource,
fWikiModel,
fScannerPosition);
cell = null;
addTableRow(table, row);
cells = new ArrayList<WPCell>();
row = new WPRow(cells);
break;
case '}': // end of table - "\n|}"
addTableRow(table, row);
return table;
default:
// "\n|" followed by any other character starts a new cell; the
// character is unread so that the main loop sees it again
fScannerPosition--;
cell = new WPCell(fScannerPosition);
cells.add(cell);
}
break;
case '!': // "\n !"
// a new line starting with '!' ends the currently open cell and starts
// a header cell
if (cell != null) {
cell.createTagStack(
table,
fSource,
fWikiModel,
fScannerPosition - 2);
cell = null;
}
ch = fSource[fScannerPosition++];
cell = new WPCell(fScannerPosition - 1);
cell.setType(WPCell.TH);
cells.add(cell);
break;
case '{': // "\n {"
if (fSource[fScannerPosition] == '|') {
// start of nested table?
// indexEndOfTable() returns -1 if the nested table is not closed; the
// next array access in the main loop then throws and the scan is
// finished in the catch block below
fScannerPosition = indexEndOfTable();
break;
}
break;
default:
fScannerPosition--;
}
break;
case '|':
ch = fSource[fScannerPosition++];
if (ch == '|') {
// "||" separates two cells on the same line
if (cell != null) {
cell.createTagStack(
table,
fSource,
fWikiModel,
fScannerPosition - 2);
cell = null;
}
cell = new WPCell(fScannerPosition);
cells.add(cell);
} else {
// a single '|' inside a cell: record its position in the cell
fScannerPosition--;
if (cell != null) {
cell.setAttributesStartPos(fScannerPosition - 1);
}
}
break;
case '!':
ch = fSource[fScannerPosition++];
if (ch == '!') {
// "!!" separates two header cells on the same line
if (cell != null) {
cell.createTagStack(
table,
fSource,
fWikiModel,
fScannerPosition - 2);
cell = null;
}
cell = new WPCell(fScannerPosition);
cell.setType(WPCell.TH);
cells.add(cell);
} else {
fScannerPosition--;
}
break;
default:
// content which appears while no cell is open gets a cell of type
// UNDEFINED
if (cell == null) {
cell = new WPCell(fScannerPosition - 1);
cell.setType(WPCell.UNDEFINED);
cells.add(cell);
}
}
}
} catch (IndexOutOfBoundsException e) {
// end of the source reached without a closing "|}": finish the pending
// cell and row and return what has been scanned so far
fScannerPosition = fSource.length;
if (cell != null) {
cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
cell = null;
}
if (table != null && row != null && row.size() > 0) {
addTableRow(table, row);
}
}
if (table != null) {
return table;
}
return null;
}
/**
 * Scan a Trac style simple wiki table, i.e. consecutive lines which start
 * with <code>||</code> and use <code>||</code> as the cell separator.
 * <p>
 * Scanning stops behind the first newline which is not followed by
 * <code>||</code>, or at the end of the source. In the latter case the
 * scanner position is set to the source length.
 *
 * @param tableOfContentTag
 *          not referenced by this method
 * @return the scanned table, or <code>null</code> if the source does not
 *         start with <code>||</code> at the current scanner position
 */
public WPTable tracTable(TableOfContentTag tableOfContentTag) {
    WPTable result = null;
    WPCell openCell = null;
    ArrayList<WPCell> rowCells = new ArrayList<WPCell>();
    WPRow currentRow = new WPRow(rowCells);
    try {
        if (fScannerPosition < 0) {
            // simulate newline
            fScannerPosition = 0;
        }
        if (fSource[fScannerPosition++] != '|'
                || fSource[fScannerPosition++] != '|') {
            return null;
        }
        ArrayList<WPRow> tableRows = new ArrayList<WPRow>();
        result = new WPTable(tableRows);
        // re-read the leading "||" in the loop, so that it opens the first cell
        fScannerPosition -= 2;
        for (;;) {
            final char current = fSource[fScannerPosition++];
            if (current == '\n') {
                // end of a table line: store the row and start a fresh one
                addTableRow(result, currentRow);
                openCell = null;
                rowCells = new ArrayList<WPCell>();
                currentRow = new WPRow(rowCells);
                final boolean nextLineIsRow =
                    fSource[fScannerPosition] == '|'
                        && fSource[fScannerPosition + 1] == '|';
                if (!nextLineIsRow) {
                    return result;
                }
            } else if (current == '|') {
                if (fSource[fScannerPosition++] != '|') {
                    // only a single pipe: unread the character behind it
                    fScannerPosition--;
                    continue;
                }
                // "||" closes the open cell (if any) and opens the next one
                if (openCell != null) {
                    openCell.createTagStack(
                        result,
                        fSource,
                        fWikiModel,
                        fScannerPosition - 2);
                    rowCells.add(openCell);
                }
                openCell = new WPCell(fScannerPosition);
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // end of the source: finish the pending cell and row
        fScannerPosition = fSource.length;
        if (openCell != null) {
            openCell.createTagStack(result, fSource, fWikiModel, fScannerPosition);
            rowCells.add(openCell);
        }
        if (result != null && currentRow != null && currentRow.size() > 0) {
            addTableRow(result, currentRow);
        }
    }
    return result;
}
/**
 * Append the row to the table unless it has neither parameters nor cells.
 *
 * @param table
 *          the table which receives the row
 * @param row
 *          the row to append
 */
private void addTableRow(WPTable table, WPRow row) {
    final boolean hasParams = row.getParams() != null;
    if (hasParams || row.size() > 0) {
        table.add(row);
    }
}
/**
* Scan a wiki list, i.e. consecutive lines which start with one or more of
* the list characters <code>WPList.UL_CHAR</code>,
* <code>WPList.OL_CHAR</code>, <code>WPList.DL_DD_CHAR</code> and
* <code>WPList.DL_DT_CHAR</code>.
* <p>
* A line start is only recognized behind a newline character, or at offset
* <code>0</code> when the scanner position was negative on entry (presumably
* callers position the scanner on the newline in front of the list - TODO
* confirm). Scanning ends at the first line which does not start with a list
* character; the scanner position is then reset to the beginning of that
* line. If the end of the source is reached, the scanner position is set to
* the source length plus one.
*
* @return the scanned list (which may be empty), or <code>null</code> if the
*         very first character could not be read
*/
public WPList wpList() {
WPList list = null;
WPListElement listElement = null;
int startPosition;
try {
char ch;
// the last list character of the current line's prefix
char lastCh = ' ';
// the list character prefix of the current line
char[] sequence = null;
// the length of that prefix, i.e. the nesting level
int count = 0;
if (fScannerPosition < 0) {
// simulate newline
fScannerPosition = 0;
ch = '\n';
} else {
ch = fSource[fScannerPosition++];
}
list = new WPList();
while (true) {
// a DL_DD_CHAR inside a line whose prefix ended with DL_DT_CHAR closes
// the current element and opens a new one on the same line; the last
// prefix character of the new element is replaced by DL_DD_CHAR
// (presumably the "; term : definition" syntax - TODO confirm)
if (ch == WPList.DL_DD_CHAR
&& lastCh == WPList.DL_DT_CHAR
&& sequence != null) {
// NOTE(review): this value is overwritten before it is read
startPosition = fScannerPosition;
if (listElement != null) {
listElement.createTagStack(
fSource,
fWikiModel,
fScannerPosition - 1);
list.add(listElement);
listElement = null;
}
char[] ddSequence = new char[sequence.length];
System.arraycopy(sequence, 0, ddSequence, 0, sequence.length);
ddSequence[sequence.length - 1] = WPList.DL_DD_CHAR;
sequence = ddSequence;
int startPos;
// skip the whitespace in front of the element content
while (true) {
ch = fSource[fScannerPosition++];
if (!Character.isWhitespace(ch)) {
startPos = fScannerPosition - 1;
listElement = new WPListElement(count, sequence, startPos);
break;
}
if (ch == '\n') {
// the element has no content on this line: add it as an empty element
fScannerPosition--; // to detect next row
startPos = fScannerPosition;
listElement = new WPListElement(count, sequence, startPos);
listElement.createTagStack(fSource, fWikiModel, startPos);
list.add(listElement);
listElement = null;
break;
}
}
lastCh = ' ';
}
if (ch == '\n' || fScannerPosition == 0) {
// beginning of a line: remember it, so that the scanner can be reset
// if the line turns out not to belong to the list
startPosition = fScannerPosition;
if (listElement != null) {
listElement.createTagStack(
fSource,
fWikiModel,
fScannerPosition - 1);
list.add(listElement);
listElement = null;
}
ch = fSource[fScannerPosition++];
switch (ch) {
case WPList.DL_DD_CHAR:
case WPList.DL_DT_CHAR:
case WPList.OL_CHAR:
case WPList.UL_CHAR:
// count the list characters at the beginning of the line
count = 1;
lastCh = ch;
while (fSource[fScannerPosition] == WPList.UL_CHAR
|| fSource[fScannerPosition] == WPList.OL_CHAR
|| fSource[fScannerPosition] == WPList.DL_DD_CHAR
|| fSource[fScannerPosition] == WPList.DL_DT_CHAR) {
count++;
lastCh = fSource[fScannerPosition++];
}
// copy the list character prefix of this line
sequence = new char[count];
System.arraycopy(
fSource,
fScannerPosition - count,
sequence,
0,
count);
int startPos;
// skip the whitespace in front of the element content
while (true) {
ch = fSource[fScannerPosition++];
if (!Character.isWhitespace(ch)) {
startPos = fScannerPosition - 1;
listElement = new WPListElement(count, sequence, startPos);
break;
}
if (ch == '\n') {
// the element has no content on this line: add it as an empty element
fScannerPosition--; // to detect next row
startPos = fScannerPosition;
listElement = new WPListElement(count, sequence, startPos);
listElement.createTagStack(fSource, fWikiModel, startPos);
list.add(listElement);
listElement = null;
break;
}
}
break;
default:
// the line does not start with a list character: the list ends here
fScannerPosition = startPosition;
return list;
}
}
if (ch == '<') {
// readSpecialWikiTags is defined outside the visible part of this file;
// a non-negative result is used as the position to continue at
int temp = readSpecialWikiTags(fScannerPosition);
if (temp >= 0) {
fScannerPosition = temp;
ch = fSource[fScannerPosition++];
continue;
}
}
ch = fSource[fScannerPosition++];
}
} catch (IndexOutOfBoundsException e) {
// end of the source reached; the "+ 1" makes the "fScannerPosition - 1"
// below evaluate to the source length
fScannerPosition = fSource.length + 1;
}
if (list != null) {
if (listElement != null) {
listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
list.add(listElement);
listElement = null;
}
return list;
}
return null;
}
/**
 * Advance the scanner to the next newline character.
 *
 * @return the position of the newline character, which is also the new
 *         scanner position
 * @throws IndexOutOfBoundsException
 *           if the source contains no further newline character
 */
public int nextNewline() {
    char current;
    do {
        current = fSource[fScannerPosition++];
    } while (current != '\n');
    // step back onto the newline character itself
    return --fScannerPosition;
}
/**
 * Advance the scanner to the next newline character, jumping over
 * <code>[...]</code> and <code>{...}</code> spans. The position of every
 * <code>|</code> found on the way is reported to the given cell.
 *
 * @param cell
 *          the cell which receives the positions of the pipe characters; may
 *          be <code>null</code>
 * @return the position of the newline character, which is also the new
 *         scanner position
 * @throws IndexOutOfBoundsException
 *           if the source contains no further newline character
 */
public int nextNewlineCell(WPCell cell) {
    for (;;) {
        final char current = fSource[fScannerPosition++];
        switch (current) {
        case '\n':
            return --fScannerPosition;
        case '|':
            if (cell != null) {
                cell.setAttributesStartPos(fScannerPosition - 1);
            }
            break;
        case '[':
        case '{': {
            final char closing = (current == '[') ? ']' : '}';
            final int spanEnd =
                findNestedEndSingle(fSource, current, closing, fScannerPosition);
            if (spanEnd >= 0) {
                fScannerPosition = spanEnd;
            }
            break;
        }
        default:
            break;
        }
    }
}
/**
 * Get the offset position behind the next closing HTML comment tag
 * (<code>--&gt;</code>). The scanner position is advanced while searching.
 *
 * @return the offset position behind the next closing HTML comment tag or
 *         <code>-1</code> if no tag could be found.
 */
public int indexEndOfComment() {
    final int limit = fSource.length - 2;
    while (fScannerPosition < limit) {
        final int candidate = fScannerPosition++;
        if (fSource[candidate] == '-'
                && fSource[candidate + 1] == '-'
                && fSource[candidate + 2] == '>') {
            return candidate + 3;
        }
    }
    return -1;
}
/**
 * Get the offset position behind the next <code>&lt;/nowiki&gt;</code> tag.
 * The scanner position is advanced while searching.
 *
 * @return the offset position behind the <code>&lt;/nowiki&gt;</code> tag or
 *         <code>-1</code> if no tag could be found.
 */
public int indexEndOfNowiki() {
    // the characters which have to follow the opening '<'
    final char[] tagTail = { '/', 'n', 'o', 'w', 'i', 'k', 'i', '>' };
    final int limit = fSource.length - tagTail.length;
    while (fScannerPosition < limit) {
        if (fSource[fScannerPosition++] != '<') {
            continue;
        }
        boolean matches = true;
        for (int i = 0; i < tagTail.length; i++) {
            if (fSource[fScannerPosition + i] != tagTail[i]) {
                matches = false;
                break;
            }
        }
        if (matches) {
            return fScannerPosition + tagTail.length;
        }
    }
    return -1;
}
/**
 * Get the offset position behind the corresponding wiki table closing tag
 * (i.e. <code>|}</code>). The scanner detects HTML comment tags,
 * <code>&lt;nowiki&gt;</code> tags and nested wiki table tags (i.e.
 * <code>{|... {|... ...|} ...|}</code>). The scanner position is modified
 * while searching.
 *
 * @return the offset position behind the corresponding wiki table closing tag
 *         or <code>-1</code> if no corresponding tag could be found.
 */
public int indexEndOfTable() {
    // the table whose end is searched counts as the first open table
    int openTables = 1;
    try {
        while (fScannerPosition < fSource.length) {
            final char current = fSource[fScannerPosition++];
            if (current == '<') {
                if (fSource[fScannerPosition] == '!'
                        && fSource[fScannerPosition + 1] == '-'
                        && fSource[fScannerPosition + 2] == '-') {
                    // start of HTML comment
                    fScannerPosition += 3;
                    fScannerPosition = indexEndOfComment();
                    if (fScannerPosition == -1) {
                        return -1;
                    }
                } else if (fSource[fScannerPosition] == 'n'
                        && fSource[fScannerPosition + 1] == 'o'
                        && fSource[fScannerPosition + 2] == 'w'
                        && fSource[fScannerPosition + 3] == 'i'
                        && fSource[fScannerPosition + 4] == 'k'
                        && fSource[fScannerPosition + 5] == 'i'
                        && fSource[fScannerPosition + 6] == '>') {
                    // start of <nowiki>
                    fScannerPosition += 7;
                    fScannerPosition = indexEndOfNowiki();
                    if (fScannerPosition == -1) {
                        return -1;
                    }
                }
            } else if (current == '\n') {
                if (fSource[fScannerPosition] == '{'
                        && fSource[fScannerPosition + 1] == '|') {
                    // assume nested table
                    openTables++;
                } else {
                    final int lineStart = fScannerPosition;
                    char first = fSource[fScannerPosition++];
                    // ignore SPACES and TABs at the beginning of the line
                    while (first == ' ' || first == '\t') {
                        first = fSource[fScannerPosition++];
                    }
                    if (first == '|'
                            && fSource[fScannerPosition] == '}'
                            && --openTables == 0) {
                        return fScannerPosition + 1;
                    }
                    // scan the line again as ordinary content
                    fScannerPosition = lineStart;
                }
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // a look-ahead ran past the end of the source: no closing tag exists
    }
    return -1;
}
/**
 * <p>
 * Test whether the region of a String which begins at a given offset starts
 * with a given prefix (optionally case insensitive).
 * </p>
 *
 * @see java.lang.String#startsWith(String)
 * @param str
 *          the String to check, may be null
 * @param toffset
 *          the offset in <code>str</code> at which the comparison starts
 * @param prefix
 *          the prefix to find, may be null
 * @param ignoreCase
 *          indicates whether the comparison should ignore case (case
 *          insensitive) or not.
 * @return <code>true</code> if the region starts with the prefix or if both
 *         Strings are <code>null</code>
 */
public static boolean startsWith(String str, int toffset, String prefix,
        boolean ignoreCase) {
    if (str == null) {
        return prefix == null;
    }
    if (prefix == null) {
        return false;
    }
    final int prefixLength = prefix.length();
    final int remaining = str.length() - toffset;
    return prefixLength <= remaining
        && str.regionMatches(ignoreCase, toffset, prefix, 0, prefixLength);
}
/**
 * Advance the scanner to the next character which is not a whitespace
 * character.
 *
 * @throws IndexOutOfBoundsException
 *           if only whitespace characters are left in the source
 */
public void scanWhiteSpace() {
    char current;
    do {
        current = fSource[fScannerPosition++];
    } while (Character.isWhitespace(current));
    // step back onto the first non-whitespace character
    fScannerPosition--;
}
/**
 * Replace the wiki template parameters (i.e. <code>{{{name}}}</code> or
 * <code>{{{name|default}}}</code>) in the source of this scanner.
 * <p>
 * Note that the scanned text is the source this scanner was created with; the
 * <code>template</code> argument is only used to size the result buffer. A
 * replaced value is scanned recursively for further parameters: a default
 * value with the same <code>templateParameters</code>, a value taken from the
 * map without any parameters.
 *
 * @param template
 *          the template text; only its length is used
 * @param templateParameters
 *          maps the parameter names to their values; may be <code>null</code>
 * @return <code>null</code> if no replacement could be found or the recursion
 *         limit of the parser is exceeded; otherwise the source with the
 *         parameters replaced. If the result grows beyond
 *         <code>Configuration.TEMPLATE_BUFFER_LIMIT</code> the buffer is
 *         returned as it is at that point.
 */
public StringBuilder replaceTemplateParameters(String template,
        Map<String, String> templateParameters) {
    StringBuilder buffer = null;
    // start of the source text which has not been copied to the buffer yet
    int bufferStart = 0;
    try {
        int level = fWikiModel.incrementRecursionLevel();
        if (level > Configuration.PARSER_RECURSION_LIMIT) {
            return null; // no further processing
        }
        char ch;
        int parameterStart = -1;
        StringBuilder recursiveResult;
        boolean isDefaultValue;
        while (true) {
            ch = fSource[fScannerPosition++];
            // exactly three opening braces start a template parameter
            if (ch == '{'
                    && fSource[fScannerPosition] == '{'
                    && fSource[fScannerPosition + 1] == '{'
                    && fSource[fScannerPosition + 2] != '{') {
                fScannerPosition += 2;
                parameterStart = fScannerPosition;
                int temp[] = findNestedParamEnd(fSource, parameterStart);
                if (temp[0] >= 0) {
                    fScannerPosition = temp[0];
                    // split "name|default" without the closing "}}}"
                    List<String> list =
                        splitByPipe(fSource, parameterStart, fScannerPosition - 3, null);
                    if (list.size() > 0) {
                        String parameterString = list.get(0).trim();
                        String value = null;
                        isDefaultValue = false;
                        if (templateParameters != null) {
                            value = templateParameters.get(parameterString);
                        }
                        if (value == null && list.size() > 1) {
                            // default value is available for the template
                            value = list.get(1);
                            isDefaultValue = true;
                        }
                        if (value != null) {
                            if (value.length() <= Configuration.TEMPLATE_VALUE_LIMIT) {
                                if (buffer == null) {
                                    buffer = new StringBuilder(template.length() + 128);
                                }
                                if (bufferStart < fScannerPosition) {
                                    // copy the text in front of the opening "{{{"
                                    buffer.append(fSource, bufferStart, parameterStart
                                        - bufferStart
                                        - 3);
                                }
                                WikipediaScanner scanner = new WikipediaScanner(value);
                                scanner.setModel(fWikiModel);
                                if (isDefaultValue) {
                                    recursiveResult =
                                        scanner.replaceTemplateParameters(
                                            value,
                                            templateParameters);
                                } else {
                                    recursiveResult =
                                        scanner.replaceTemplateParameters(value, null);
                                }
                                if (recursiveResult != null) {
                                    buffer.append(recursiveResult);
                                } else {
                                    buffer.append(value);
                                }
                                bufferStart = fScannerPosition;
                            }
                        }
                    }
                    fScannerPosition = temp[0];
                    parameterStart = -1;
                }
            }
            if (buffer != null
                    && buffer.length() > Configuration.TEMPLATE_BUFFER_LIMIT) {
                // Controls the scanner, when infinite recursion occurs the
                // buffer grows out of control.
                return buffer;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // expected: the loop above ends by reading past the end of the source
    } finally {
        fWikiModel.decrementRecursionLevel();
    }
    // Copy the rest of the source. The copy must not be derived from
    // fScannerPosition: the exception is thrown either by the incrementing
    // read or by one of the look-ahead reads, and in the latter case the
    // position is up to two characters short, which would drop trailing
    // braces.
    if (buffer != null && bufferStart < fSource.length) {
        buffer.append(fSource, bufferStart, fSource.length - bufferStart);
    }
    return buffer;
}
/**
 * Split the given source string by the pipe symbol (i.e. "|"). This is a
 * convenience variant which delegates to
 * {@link #splitByPipe(char[], int, int, List)} for the whole string.
 *
 * @param sourceString
 *          the string to split
 * @param resultList
 *          the list which receives the split strings; passed on unchanged to
 *          the delegate
 * @return the list returned by the delegate
 */
public static List<String> splitByPipe(String sourceString,
        List<String> resultList) {
    // TODO optimize this to avoid new char[] generation inside toCharArray() ?
    final char[] sourceChars = sourceString.toCharArray();
    return splitByPipe(sourceChars, 0, sourceChars.length, resultList);
}
/**
 * Split the given <code>srcArray</code> character array by pipe symbol (i.e.
 * "|"). Pipe symbols inside of <code>[[...]]</code>, <code>{{...}}</code> and
 * <code>{{{...}}}</code> blocks do not split the text.
 * <p>
 * The text behind the last pipe symbol is always added, as an empty string if
 * nothing follows it. If <code>endOffset</code> lies behind the end of the
 * array, the split stops at the end of the array.
 *
 * @param srcArray
 *          the characters to split
 * @param currOffset
 *          the offset of the first character to look at
 * @param endOffset
 *          the offset behind the last character to look at
 * @param resultList
 *          the list which receives the split strings; if <code>null</code> a
 *          new list is created
 * @return the list which contains the split strings
 * @throws IndexOutOfBoundsException
 *           if <code>currOffset</code> is negative
 */
public static List<String> splitByPipe(char[] srcArray, int currOffset,
        int endOffset, List<String> resultList) {
    if (resultList == null) {
        resultList = new ArrayList<String>();
    }
    int lastOffset = currOffset;
    try {
        while (currOffset < endOffset) {
            final char ch = srcArray[currOffset++];
            if (ch == '[' && srcArray[currOffset] == '[') {
                currOffset++;
                final int linkEnd = findNestedEnd(srcArray, '[', ']', currOffset);
                if (linkEnd >= 0) {
                    currOffset = linkEnd;
                }
            } else if (ch == '{' && srcArray[currOffset] == '{') {
                currOffset++;
                if (srcArray[currOffset] == '{' && srcArray[currOffset + 1] != '{') {
                    // "{{{" - template parameter
                    currOffset++;
                    final int[] paramEnd = findNestedParamEnd(srcArray, currOffset);
                    if (paramEnd[0] >= 0) {
                        currOffset = paramEnd[0];
                    }
                } else {
                    // "{{" - template
                    final int templateEnd = findNestedTemplateEnd(srcArray, currOffset);
                    if (templateEnd >= 0) {
                        currOffset = templateEnd;
                    }
                }
            } else if (ch == '|') {
                resultList.add(new String(srcArray, lastOffset, currOffset
                    - lastOffset
                    - 1));
                lastOffset = currOffset;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // A read ran past the end of the array. The incrementing read leaves
        // currOffset one behind the array length, so clamp it; otherwise
        // building the last string below would throw again.
        currOffset = Math.min(currOffset, srcArray.length);
    }
    // add the text behind the last pipe symbol
    if (currOffset > lastOffset) {
        resultList.add(new String(srcArray, lastOffset, currOffset - lastOffset));
    } else if (currOffset == lastOffset) {
        resultList.add("");
    }
    return resultList;
}
/**
 * Read until the end of a nested block which is delimited by doubled
 * characters, i.e. something like <code>[[...[[ ]]...]]</code>. The opening
 * delimiter of the outermost block is expected in front of
 * <code>startPosition</code>.
 *
 * @param sourceArray
 *          the characters to scan
 * @param startCh
 *          the character which opens a block when it occurs twice in a row
 * @param endChar
 *          the character which closes a block when it occurs twice in a row
 * @param startPosition
 *          the offset at which scanning starts
 * @return the position behind the nested end characters or <code>-1</code>
 *         if not found
 */
public static int findNestedEnd(final char[] sourceArray, final char startCh,
        final char endChar, int startPosition) {
    int depth = 1;
    // a delimiter needs two characters, so the last index cannot start one
    final int lastIndex = sourceArray.length - 1;
    try {
        int index = startPosition;
        while (index < lastIndex) {
            final char current = sourceArray[index++];
            if (current != sourceArray[index]) {
                // not a doubled character, so it cannot be a delimiter
                continue;
            }
            if (current == startCh) {
                index++;
                depth++;
            } else if (current == endChar) {
                index++;
                if (--depth == 0) {
                    return index;
                }
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // fall through to "not found"
    }
    return -1;
}
/**
 * Read until the end of a nested block which is delimited by single
 * characters, i.e. something like <code>{...{...{ }...}...}</code>. The
 * opening character of the outermost block is expected in front of
 * <code>startPosition</code>.
 *
 * @param sourceArray
 *          the characters to scan
 * @param startCh
 *          the character which opens a block
 * @param endChar
 *          the character which closes a block
 * @param startPosition
 *          the offset at which scanning starts
 * @return the position behind the nested end character or <code>-1</code> if
 *         not found
 */
public static int findNestedEndSingle(final char[] sourceArray,
        final char startCh, final char endChar, int startPosition) {
    try {
        int depth = 1;
        for (int index = startPosition; index < sourceArray.length; index++) {
            final char current = sourceArray[index];
            if (current == startCh) {
                depth++;
            } else if (current == endChar && --depth == 0) {
                return index + 1;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // fall through to "not found"
    }
    return -1;
}
/**
 * Read until the closing <code>}}</code> of a template declaration. Single
 * braces which are opened on the way have to be closed again before a
 * <code>}}</code> is accepted as the end.
 *
 * @param sourceArray
 *          the characters to scan
 * @param startPosition
 *          the offset at which scanning starts
 * @return the position behind the closing <code>}}</code> or <code>-1</code>
 *         if not found
 */
public static int findNestedTemplateEnd(final char[] sourceArray,
        int startPosition) {
    int openSingleBraces = 0;
    try {
        for (int index = startPosition; index < sourceArray.length; index++) {
            switch (sourceArray[index]) {
            case '{':
                openSingleBraces++;
                break;
            case '}':
                if (openSingleBraces > 0) {
                    openSingleBraces--;
                } else if (sourceArray[index + 1] == '}') {
                    // template ending
                    return index + 2;
                }
                break;
            default:
                break;
            }
        }
    } catch (IndexOutOfBoundsException e) {
        // the look-ahead ran past the end of the array: no "}}" found
    }
    return -1;
}
/**
 * Find the end of a template parameter declaration or the end of a template
 * declaration.
 *
 * @param sourceArray
 *          the characters to scan
 * @param startPosition
 *          the offset at which scanning starts
 * @return an array of two integers. If <code>array[0] &gt;= 0</code> the
 *         scanner has found the end position of a template parameter
 *         declaration (the position behind <code>}}}</code>). If
 *         <code>array[1] &gt;= 0</code> the scanner has found the end
 *         position of a template declaration (the position behind
 *         <code>}}</code>). Both values are <code>-1</code> if no end was
 *         found.
 */
public static int[] findNestedParamEnd(final char[] sourceArray,
        int startPosition) {
    char ch;
    final int sourceArrayLength = sourceArray.length;
    int countSingleOpenBraces = 0;
    int parameterPosition = startPosition;
    try {
        while (parameterPosition < sourceArrayLength) {
            ch = sourceArray[parameterPosition++];
            if (ch == '{') {
                if ((sourceArrayLength > parameterPosition)
                        && sourceArray[parameterPosition] == '{') {
                    parameterPosition++;
                    if ((sourceArrayLength > parameterPosition)
                            && sourceArray[parameterPosition] == '{'
                            && sourceArray[parameterPosition + 1] != '{') {
                        // template parameter beginning
                        parameterPosition++;
                        int[] temp = findNestedParamEnd(sourceArray, parameterPosition);
                        if (temp[0] >= 0) {
                            parameterPosition = temp[0];
                        } else {
                            if (temp[1] >= 0) {
                                parameterPosition = temp[1];
                            } else {
                                return new int[] { -1, -1 };
                            }
                        }
                    } else {
                        // template beginning
                        int temp = findNestedTemplateEnd(sourceArray, parameterPosition);
                        if (temp < 0) {
                            return new int[] { -1, -1 };
                        }
                        parameterPosition = temp;
                    }
                } else {
                    countSingleOpenBraces++;
                }
            } else if (ch == '}') {
                if (countSingleOpenBraces > 0) {
                    countSingleOpenBraces--;
                } else if ((sourceArrayLength > parameterPosition)
                        && sourceArray[parameterPosition] == '}') {
                    // The third brace is looked up with a bounds check: a "}}"
                    // which ends the array is a template ending and must not be
                    // turned into "not found" by an IndexOutOfBoundsException.
                    if ((sourceArrayLength > parameterPosition + 1)
                            && sourceArray[parameterPosition + 1] == '}') {
                        // template parameter ending
                        return new int[] { parameterPosition + 2, -1 };
                    }
                    // template ending
                    return new int[] { -1, parameterPosition + 1 };
                }
            }
        }
        return new int[] { -1, -1 };
    } catch (IndexOutOfBoundsException e) {
        // a look-ahead ran past the end of the array or the start was negative
        return new int[] { -1, -1 };
    }
}
/**
* Parse a tag. Parse the name and attributes from a start tag.
* <p>
* From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
* HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
* http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2
* <p>
* <cite> 3.2.2 Attributes
* <p>
* Elements may have associated properties, called attributes, which may have
* values (by default, or set by authors or scripts). Attribute/value pairs
* appear before the final "&gt;" of an element's start tag. Any number of
* (legal) attribute value pairs, separated by spaces, may appear in an
* element's start tag. They may appear in any order.
* <p>
* In this example, the id attribute is set for an H1 element: <code>
* &lt;H1 id="section1"&gt;
* </code> This is an identified heading thanks to the id attribute <code>
* &lt;/H1&gt;
* </code> By default, SGML requires that all attribute values be delimited
* using either double quotation marks (ASCII decimal 34) or single quotation
* marks (ASCII decimal 39). Single quote marks can be included within the
* attribute value when the value is delimited by double quote marks, and vice
* versa. Authors may also use numeric character references to represent
* double quotes (&amp;#34;) and single quotes (&amp;#39;). For double quotes
* authors can also use the character entity reference &amp;quot;.
* <p>
* In certain cases, authors may specify the value of an attribute without any
* quotation marks. The attribute value may only contain letters (a-z and
* A-Z), digits (0-9), hyphens (ASCII decimal 45), periods (ASCII decimal 46),
* underscores (ASCII decimal 95), and colons (ASCII decimal 58). We recommend
* using quotation marks even when it is possible to eliminate them.
* <p>
* Attribute names are always case-insensitive.
* <p>
* Attribute values are generally case-insensitive. The definition of each
* attribute in the reference manual indicates whether its value is
* case-insensitive.
* <p>
* All the attributes defined by this specification are listed in the
* attribute index.
* <p>
* </cite>
* <p>
* This method uses a state machine with the following states:
* <ol>
* <li>state 0 - outside of any attribute</li>
* <li>state 1 - within attribute name</li>
* <li>state 2 - equals hit</li>
* <li>state 3 - within naked attribute value.</li>
* <li>state 4 - within single quoted attribute value</li>
* <li>state 5 - within double quoted attribute value</li>
* <li>state 6 - whitespaces after attribute name could lead to state 2 (=)or
* state 0</li>
* </ol>
* <p>
* The starting point for the various components is stored in an array of
* integers that match the initiation point for the states one-for-one, i.e.
* bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 began,
* etc. Attributes are stored in a list having one slot for each whitespace
* or attribute/value pair. The first slot is for attribute name (kind of
* like a standalone attribute).
* <p>
* Note that the <code>EOF == ch</code> comparisons in this method can never
* be <code>true</code>, because <code>ch</code> is a <code>char</code> and
* <code>EOF</code> is <code>-1</code>. The end of the source shows up as an
* <code>IndexOutOfBoundsException</code> of the array access instead, in
* which case no tag is returned.
*
* @param start
*          The position at which to start scanning.
* @return The parsed tag, or <code>null</code> if the scanned text does not
*         end with a <code>&gt;</code> or the end of the source was reached.
*         In both cases the scanner position is reset to <code>start</code>.
*/
protected WikiTagNode parseTag(int start) {
boolean done;
char ch;
int state;
int[] bookmarks;
done = false;
ArrayList<NodeAttribute> attributes = new ArrayList<NodeAttribute>();
state = 0;
fScannerPosition = start;
bookmarks = new int[8];
bookmarks[0] = fScannerPosition;
try {
while (!done) {
// remember where the character examined in this round starts; the helper
// methods (whitespace, standalone, empty, naked, single_quote,
// double_quote) are defined outside the visible part of this file and
// presumably build the attributes from these bookmarks - TODO confirm
bookmarks[state + 1] = fScannerPosition;
ch = fSource[fScannerPosition++];
switch (state) {
case 0: // outside of any attribute
if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
if ('<' == ch) {
// don't consume the opening angle
bookmarks[state + 1] = --fScannerPosition;
}
whitespace(attributes, bookmarks);
done = true;
} else if (!Character.isWhitespace(ch)) {
whitespace(attributes, bookmarks);
state = 1;
}
break;
case 1: // within attribute name
if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
if ('<' == ch) {
// don't consume the opening angle
bookmarks[state + 1] = --fScannerPosition;
}
standalone(attributes, bookmarks);
done = true;
} else if (Character.isWhitespace(ch)) {
// whitespaces might be followed by next attribute or an
// equal sign
// see Bug #891058 Bug in lexer.
bookmarks[6] = bookmarks[2]; // setting the
// bookmark[0]
// is done in state 6 if
// applicable
state = 6;
} else if ('=' == ch) {
state = 2;
}
break;
case 2: // equals hit
if ((EOF == ch) || ('>' == ch)) {
empty(attributes, bookmarks);
done = true;
} else if ('\'' == ch) {
state = 4;
bookmarks[4] = bookmarks[3];
} else if ('"' == ch) {
state = 5;
bookmarks[5] = bookmarks[3];
} else if (Character.isWhitespace(ch)) {
// collect white spaces after "=" into the assignment
// string;
// do nothing
// see Bug #891058 Bug in lexer.
} else {
state = 3;
}
break;
case 3: // within naked attribute value
if ((EOF == ch) || ('>' == ch)) {
naked(attributes, bookmarks);
done = true;
} else if (Character.isWhitespace(ch)) {
naked(attributes, bookmarks);
bookmarks[0] = bookmarks[4];
state = 0;
} else if (ch == '/' && fSource[fScannerPosition] == '>') {
// "/>" ends the naked value; the '/' is unread so that state 0 sees it
// again
naked(attributes, bookmarks);
bookmarks[0] = bookmarks[4];
fScannerPosition--;
state = 0;
}
break;
case 4: // within single quoted attribute value
if (EOF == ch) {
single_quote(attributes, bookmarks);
done = true; // complain?
} else if ('\'' == ch) {
single_quote(attributes, bookmarks);
bookmarks[0] = bookmarks[5] + 1;
state = 0;
}
break;
case 5: // within double quoted attribute value
if (EOF == ch) {
double_quote(attributes, bookmarks);
done = true; // complain?
} else if ('"' == ch) {
double_quote(attributes, bookmarks);
bookmarks[0] = bookmarks[6] + 1;
state = 0;
}
break;
case 6: // undecided for state 0 or 2
// we have read white spaces after an attribute name
if (EOF == ch) {
// same as last else clause
standalone(attributes, bookmarks);
bookmarks[0] = bookmarks[6];
// mPage.ungetCharacter(mCursor);
--fScannerPosition;
state = 0;
} else if (Character.isWhitespace(ch)) {
// proceed
} else if ('=' == ch) {// yepp. the white spaces belonged to the
// equal.
bookmarks[2] = bookmarks[6];
bookmarks[3] = bookmarks[7];
state = 2;
} else {
// white spaces were not ended by equal
// meaning the attribute was a stand alone attribute
// now: create the stand alone attribute and rewind
// the cursor to the end of the white spaces
// and restart scanning as whitespace attribute.
standalone(attributes, bookmarks);
bookmarks[0] = bookmarks[6];
--fScannerPosition;
state = 0;
}
break;
default:
throw new IllegalStateException("how did we get in state " + state);
}
}
// only text which ends with '>' is accepted as a tag, e.g. not the case
// where the scan stopped in front of a '<'
if (fSource[fScannerPosition - 1] != '>') {
fScannerPosition = start;
return null;
}
return (makeTag(start, fScannerPosition, attributes));
} catch (IndexOutOfBoundsException e) {
// end of the source reached before the tag was closed
if (state == 3) {
// within naked attribute value
naked(attributes, bookmarks);
}
}
fScannerPosition = start;
return null;
}
/**
 * Parse the attributes found in the raw source between <code>start</code>
 * (inclusive) and <code>end</code> (exclusive).
 * <p>
 * The scanner position is set to <code>start</code> and advanced while
 * scanning. The method is a small state machine:
 * <ul>
 * <li>0 - outside of any attribute</li>
 * <li>1 - within an attribute name</li>
 * <li>2 - an equals sign was read</li>
 * <li>3 - within a naked (unquoted) attribute value</li>
 * <li>4 - within a single quoted attribute value</li>
 * <li>5 - within a double quoted attribute value</li>
 * <li>6 - white space after an attribute name; undecided between 0 and 2</li>
 * </ul>
 * Before each character is read its position is stored in
 * <code>bookmarks[state + 1]</code>, so that <code>bookmarks[1]</code> and
 * <code>bookmarks[2]</code> delimit the attribute name and the higher slots
 * delimit the value for the respective value state.
 * <p>
 * Because <code>ch</code> is a <code>char</code> it can never be equal to
 * {@link #EOF} (<code>-1</code>); the <code>EOF</code> comparisons inside the
 * loop are therefore never true. Reaching <code>end</code> inside an
 * attribute value is handled after the loop instead.
 *
 * @param start
 *          the offset at which scanning starts
 * @param end
 *          the offset (exclusive) at which scanning stops
 * @return the list of parsed attributes (possibly empty), or
 *         <code>null</code> if an <code>IndexOutOfBoundsException</code>
 *         occurred while scanning
 */
protected List<NodeAttribute> parseAttributes(int start, int end) {
  boolean done = false;
  char ch;
  int state = 0;
  ArrayList<NodeAttribute> attributes = new ArrayList<NodeAttribute>();
  fScannerPosition = start;
  int[] bookmarks = new int[8];
  bookmarks[0] = fScannerPosition;
  try {
    while (!done && fScannerPosition < end) {
      bookmarks[state + 1] = fScannerPosition;
      ch = fSource[fScannerPosition++];
      switch (state) {
      case 0: // outside of any attribute
        if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
          if ('<' == ch) {
            // don't consume the opening angle
            bookmarks[state + 1] = --fScannerPosition;
          }
          whitespace(attributes, bookmarks);
          done = true;
        } else if (!Character.isWhitespace(ch)) {
          whitespace(attributes, bookmarks);
          state = 1;
        }
        break;
      case 1: // within attribute name
        if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
          if ('<' == ch) {
            // don't consume the opening angle
            bookmarks[state + 1] = --fScannerPosition;
          }
          standalone(attributes, bookmarks);
          done = true;
        } else if (Character.isWhitespace(ch)) {
          // white space might be followed by the next attribute or by an
          // equal sign; remember where the name ended and decide in
          // state 6 (bookmarks[0] is set there if applicable).
          // see Bug #891058 Bug in lexer.
          bookmarks[6] = bookmarks[2];
          state = 6;
        } else if ('=' == ch) {
          state = 2;
        }
        break;
      case 2: // equals hit
        if ((EOF == ch) || ('>' == ch)) {
          empty(attributes, bookmarks);
          done = true;
        } else if ('\'' == ch) {
          state = 4;
          // position of the opening single quote
          bookmarks[4] = bookmarks[3];
        } else if ('"' == ch) {
          state = 5;
          // position of the opening double quote
          bookmarks[5] = bookmarks[3];
        } else if (Character.isWhitespace(ch)) {
          // collect white space after "=" into the assignment string;
          // do nothing
          // see Bug #891058 Bug in lexer.
        } else {
          state = 3;
        }
        break;
      case 3: // within naked attribute value
        if ((EOF == ch) || ('>' == ch)) {
          naked(attributes, bookmarks);
          done = true;
        } else if (Character.isWhitespace(ch)) {
          naked(attributes, bookmarks);
          bookmarks[0] = bookmarks[4];
          state = 0;
        }
        break;
      case 4: // within single quoted attribute value
        if (EOF == ch) {
          single_quote(attributes, bookmarks);
          done = true; // complain?
        } else if ('\'' == ch) {
          single_quote(attributes, bookmarks);
          bookmarks[0] = bookmarks[5] + 1;
          state = 0;
        }
        break;
      case 5: // within double quoted attribute value
        if (EOF == ch) {
          double_quote(attributes, bookmarks);
          done = true; // complain?
          // } else if ('\\' == ch && fSource[fScannerPosition] == '"') {
          // fScannerPosition++;
        } else if ('"' == ch) {
          double_quote(attributes, bookmarks);
          bookmarks[0] = bookmarks[6] + 1;
          state = 0;
        }
        break;
      // patch for lexer state correction by
      // Gernot Fricke
      // See Bug # 891058 Bug in lexer.
      case 6: // undecided for state 0 or 2
        // we have read white space after an attribute name
        if (EOF == ch) {
          // same as last else clause
          standalone(attributes, bookmarks);
          bookmarks[0] = bookmarks[6];
          // mPage.ungetCharacter(mCursor);
          --fScannerPosition;
          state = 0;
        } else if (Character.isWhitespace(ch)) {
          // proceed
        } else if ('=' == ch) {
          // yepp. the white space belonged to the equal.
          bookmarks[2] = bookmarks[6];
          bookmarks[3] = bookmarks[7];
          state = 2;
        } else {
          // white space was not ended by an equal sign, meaning the
          // attribute was a stand alone attribute.
          // now: create the stand alone attribute, rewind the cursor by
          // one character and restart scanning in state 0.
          standalone(attributes, bookmarks);
          bookmarks[0] = bookmarks[6];
          --fScannerPosition;
          state = 0;
        }
        break;
      default:
        throw new IllegalStateException("how did we get in state " + state);
      }
    }
    // The range ended in the middle of an attribute value: close the value
    // at the current position. "done" is checked because a naked value
    // terminated by '>' has already been emitted while the state is still 3;
    // flushing again would add the same attribute twice. Quoted values use
    // their own bookmark slots, so they must not be emitted through naked().
    if (!done) {
      if (state == 3) {
        bookmarks[4] = fScannerPosition;
        naked(attributes, bookmarks);
      } else if (state == 4) {
        bookmarks[5] = fScannerPosition;
        single_quote(attributes, bookmarks);
      } else if (state == 5) {
        bookmarks[6] = fScannerPosition;
        double_quote(attributes, bookmarks);
      }
    }
    return attributes;
  } catch (IndexOutOfBoundsException ignored) {
    // the requested range runs past the source; report failure with null
  }
  return null;
}
/**
 * Build a tag node for the source range <code>start</code> to
 * <code>end</code>.
 *
 * @param start
 *          The starting point of the node.
 * @param end
 *          The ending point of the node.
 * @param attributes
 *          The attributes parsed from the tag.
 * @return The new tag node, or <code>null</code> if <code>end - start</code>
 *         is less than 2.
 */
protected WikiTagNode makeTag(int start, int end,
    ArrayList<NodeAttribute> attributes) {
  final int span = end - start;
  // anything shorter than two characters is treated as an error
  final boolean tooShort = span < 2;
  return tooShort ? null : new WikiTagNode(start, end, attributes);
}
/**
* Generate a whitespace 'attribute'.
* <p>
* The implementation is disabled (see the commented-out code in the body),
* so this method currently does nothing and neither argument is used.
*
* @param attributes
* The list so far (left unchanged).
* @param bookmarks
* The array of positions (not read).
*/
private void whitespace(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
// Disabled: would have added an attribute for the span
// bookmarks[0]..bookmarks[1] whenever that span is not empty.
// if (bookmarks[1] > bookmarks[0])
// attributes.addElement(new PageAttribute(fSource,-1, -1, bookmarks[0],
// bookmarks[1], (char) 0));
}
/**
 * Append a standalone attribute (a name without a value, e.g.
 * <code>font</code>) to the list.
 *
 * @param attributes
 *          The list so far; the new attribute is added at its end.
 * @param bookmarks
 *          The array of positions; slots 1 and 2 are read.
 */
private void standalone(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
  final int nameStart = bookmarks[1];
  final int nameEnd = bookmarks[2];
  // no value: both value positions are passed as -1 and the quote as 0
  final NodeAttribute attribute =
      new NodeAttribute(fSource, nameStart, nameEnd, -1, -1, (char) 0);
  attributes.add(attribute);
}
/**
 * Append an empty attribute (a name followed by an equals sign but no value,
 * e.g. <code>color=</code>) to the list.
 *
 * @param attributes
 *          The list so far; the new attribute is added at its end.
 * @param bookmarks
 *          The array of positions; slots 1 and 2 are read.
 */
private void empty(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
  final int nameStart = bookmarks[1];
  final int nameEnd = bookmarks[2];
  // the value start is placed one past the end of the name; the value end
  // is passed as -1
  final int valueStart = nameEnd + 1;
  final NodeAttribute attribute =
      new NodeAttribute(fSource, nameStart, nameEnd, valueStart, -1, (char) 0);
  attributes.add(attribute);
}
/**
 * Append an unquoted attribute (e.g. <code>size=1</code>) to the list.
 *
 * @param attributes
 *          The list so far; the new attribute is added at its end.
 * @param bookmarks
 *          The array of positions; slots 1 to 4 are read.
 */
private void naked(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
  final int nameStart = bookmarks[1];
  final int nameEnd = bookmarks[2];
  final int valueStart = bookmarks[3];
  final int valueEnd = bookmarks[4];
  // quote character 0: the value is not quoted
  final NodeAttribute attribute =
      new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, (char) 0);
  attributes.add(attribute);
}
/**
 * Append a single-quoted attribute (e.g. <code>width='100%'</code>) to the
 * list.
 *
 * @param attributes
 *          The list so far; the new attribute is added at its end.
 * @param bookmarks
 *          The array of positions; slots 1, 2, 4 and 5 are read.
 */
private void single_quote(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
  final int nameStart = bookmarks[1];
  final int nameEnd = bookmarks[2];
  // slot 4 is advanced by one before being used as the value start
  final int valueStart = bookmarks[4] + 1;
  final int valueEnd = bookmarks[5];
  final NodeAttribute attribute =
      new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, '\'');
  attributes.add(attribute);
}
/**
 * Append a double-quoted attribute (e.g.
 * <code>CONTENT="Test Development"</code>) to the list.
 *
 * @param attributes
 *          The list so far; the new attribute is added at its end.
 * @param bookmarks
 *          The array of positions; slots 1, 2, 5 and 6 are read.
 */
private void double_quote(ArrayList<NodeAttribute> attributes, int[] bookmarks) {
  final int nameStart = bookmarks[1];
  final int nameEnd = bookmarks[2];
  // slot 5 is advanced by one before being used as the value start
  final int valueStart = bookmarks[5] + 1;
  final int valueEnd = bookmarks[6];
  final NodeAttribute attribute =
      new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, '"');
  attributes.add(attribute);
}
/**
 * Check whether an opening <code>nowiki</code>, <code>source</code>,
 * <code>math</code> or <code>span</code> tag starts at <code>start</code>
 * and, if so, look for the matching closing tag.
 *
 * @param start
 *          the offset at which the tag is parsed
 * @return the result of {@link #readUntilIgnoreCase(int, String, String)} for
 *         the closing tag if one of the special tags was parsed; otherwise
 *         <code>-1</code>, in which case the scanner position is reset to the
 *         value it had on entry
 */
protected int readSpecialWikiTags(int start) {
  final int savedPosition = fScannerPosition;
  try {
    // a leading '/' marks a closing tag, which is not handled here
    final boolean isOpeningTag = fSource[start] != '/';
    final WikiTagNode tagNode = isOpeningTag ? parseTag(start) : null;
    if (tagNode != null && !tagNode.isEmptyXmlTag()) {
      final String tagName = tagNode.getTagName();
      // <div> is deliberately left out: it could be nested
      final String[] specialTags = { "nowiki", "source", "math", "span" };
      for (String special : specialTags) {
        if (tagName.equals(special)) {
          return readUntilIgnoreCase(fScannerPosition, "</", special + ">");
        }
      }
    }
  } catch (IndexOutOfBoundsException ignored) {
    // do nothing
  }
  fScannerPosition = savedPosition;
  return -1;
}
/**
 * Search the source, beginning at <code>start</code>, for the concatenation
 * of <code>startString</code> and <code>endString</code>. The search itself
 * is delegated to <code>Util.indexOfIgnoreCase</code>.
 *
 * @param start
 *          the offset at which the search begins
 * @param startString
 *          the start string which should be searched in exact case mode
 * @param endString
 *          the end string which should be searched in ignore case mode
 * @return the index reported by <code>Util.indexOfIgnoreCase</code> plus the
 *         lengths of <code>startString</code> and <code>endString</code>, or
 *         <code>-1</code> if that index is <code>-1</code>
 */
protected final int readUntilIgnoreCase(int start, String startString,
    String endString) {
  final int matchIndex =
      Util.indexOfIgnoreCase(fStringSource, startString, endString, start);
  return (matchIndex == -1)
      ? -1
      : matchIndex + startString.length() + endString.length();
}
/**
 * Scan forward from <code>fromIndex</code> over letter characters looking
 * for <code>testChar</code>. Scanning stops at the first character that is
 * neither <code>testChar</code> nor a letter, or at the end of the source.
 *
 * @param testChar
 *          the test character
 * @param fromIndex
 *          read from this offset
 * @return the offset of <code>testChar</code>, or <code>-1</code> if a
 *         non-letter character or the end of the source was reached first
 */
protected int indexOfUntilNoLetter(char testChar, int fromIndex) {
  for (int pos = fromIndex; pos < fSource.length; pos++) {
    final char current = fSource[pos];
    if (current == testChar) {
      return pos;
    }
    if (!Character.isLetter(current)) {
      return -1;
    }
  }
  return -1;
}
/**
 * Pop nodes from the wiki model's stack until one of the following holds:
 * a node with the same name as <code>node</code> has been popped, the node on
 * top of the stack is listed in the parents string of <code>node</code>
 * (as <code>|name|</code>), or the stack is empty. Useful for closing HTML
 * tags.
 *
 * @param node
 *          the token whose name and parents string control the reduction
 */
protected void reduceStackUntilToken(TagToken node) {
  final String allowedParents = node.getParents();
  while (fWikiModel.stackSize() > 0) {
    final TagToken top = fWikiModel.peekNode();
    final boolean sameName = node.getName().equals(top.getName());
    if (!sameName && allowedParents != null
        && allowedParents.contains("|" + top.getName() + "|")) {
      // the top node is an allowed parent: leave it on the stack
      return;
    }
    fWikiModel.popNode();
    if (sameName) {
      // the matching node itself has been removed
      return;
    }
  }
}
}