// XomwBlockLevelPass.java example
//
// Explorer
// xowa-master
/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com

XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.

You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.

The terms of each license can be found in the source code repository:

GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*;
import gplx.core.btries.*;
import gplx.langs.htmls.*;
/**
* This is the part of the wikitext parser which handles automatic paragraphs
* and conversion of start-of-line prefixes to HTML lists.
*/
public class XomwBlockLevelPass {
	// true while the innermost definition-list item that was opened is a <dt> (as opposed to a <dd>)
	private boolean DTopen = false;
	// true while the lines being processed are considered to be inside a <pre> element
	private boolean inPre = false;
	// one of the LAST_SECTION_* constants: which auto-generated section (none / <p> / <pre>) is currently open
	private int lastSection = LAST_SECTION_NONE;
	// false only until the first line has been emitted verbatim; see execute()
	private boolean linestart;
//		private $text;
	private final    Bry_bfr tmp = Bry_bfr_.New();
	private final    Btrie_rv trv = new Btrie_rv();
	// XOMW: out-params of findColonNoLinks (MW passes $before / $after by reference)
	private byte[] find_colon_no_links__before, find_colon_no_links__after;

	// State constants for the definition list colon extraction
	private static final int
	  COLON_STATE_TEXT = 0
	, COLON_STATE_TAG = 1
	, COLON_STATE_TAGSTART = 2
	, COLON_STATE_CLOSETAG = 3
	, COLON_STATE_TAGSLASH = 4
	, COLON_STATE_COMMENT = 5
	, COLON_STATE_COMMENTDASH = 6
	, COLON_STATE_COMMENTDASHDASH = 7
	;

//		public static function doBlockLevels($text, $linestart) {
//			$pass = new self($text, $linestart);
//			return $pass->execute();
//		}
	/**
	* Make lists from lines starting with ':', '*', '#', etc.
	* The text is read from pbfr.Src() and the rendered HTML is written to pbfr.Trg().
	*
	* @param pctx parser context; only forwarded to execute()
	* @param pbfr buffer pair holding the source text and receiving the output
	* @param linestart Whether or not this is at the start of a line.
	*/
	public void doBlockLevels(XomwParserCtx pctx, XomwParserBfr pbfr, boolean linestart) {
		this.linestart = linestart;
		execute(pctx, pbfr, linestart);
	}

//		/**
//		* Private constructor
//		*/
//		private function __construct($text, $linestart) {
//			$this->text = $text;
//			$this->linestart = $linestart;
//		}

	/**
	* If a pre or p is open, return the corresponding close tag and update
	* the state. If no tag is open, return an empty String.
	* Always resets inPre and lastSection.
	* @return the close tag followed by a newline, or Bry_.Empty
	*/
	private byte[] closeParagraph() {
		byte[] result = Bry_.Empty;
		if (this.lastSection != LAST_SECTION_NONE) {
			result = tmp.Add(lastSection == LAST_SECTION_PARA ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs).Add_byte_nl().To_bry_and_clear(); // $result = '</' . $this->lastSection . ">\n";
		}
		this.inPre = false;
		this.lastSection = LAST_SECTION_NONE;
		return result;
	}

	/**
	* getCommon() returns the length of the longest common substring
	* of both arguments, starting at the beginning of both.
	*
	* @param st1 first byte String
	* @param st2 second byte String
	*
	* @return number of leading bytes that are identical in st1 and st2
	*/
	private int getCommon(byte[] st1, byte[] st2) {
		int shorter = Math.min(st1.length, st2.length);

		int i;
		for (i = 0; i < shorter; ++i) {
			if (st1[i] != st2[i]) {
				break;
			}
		}
		return i;
	}

	/**
	* Open the list item element identified by the prefix character.
	* Closes any open paragraph first; sets DTopen when a ';' item is opened.
	*
	* @param c prefix character: one of '*', '#', ':', ';'
	*
	* @return close-paragraph markup (if any) followed by the opening tags,
	*         or an "ERR 1" HTML comment for an unknown character
	*/
	private byte[] openList(byte c) {
		byte[] result = this.closeParagraph();

		if      (c == Byte_ascii.Star)
			result = Bry_.Add(result, Bry_.new_a7("<ul><li>"));
		else if (c == Byte_ascii.Hash)
			result = Bry_.Add(result, Bry_.new_a7("<ol><li>"));
		else if (c == Byte_ascii.Colon)
			result = Bry_.Add(result, Bry_.new_a7("<dl><dd>"));
		else if (c == Byte_ascii.Semic) {
			result = Bry_.Add(result, Bry_.new_a7("<dl><dt>"));
			this.DTopen = true;
		}
		else {
			// same as MW: the close-paragraph markup is discarded in the error case
			result = Bry_.new_a7("<!-- ERR 1 -->");
		}

		return result;
	}

	/**
	* Close the current list item and open the next one.
	* For definition lists the closing tag depends on DTopen, and DTopen is
	* updated to reflect the item that is being opened.
	* @param c prefix character: one of '*', '#', ':', ';'
	*
	* @return the closing tag, a newline and the opening tag,
	*         or an "ERR 2" HTML comment for an unknown character
	*/
	private byte[] nextItem(byte c) {
		if (c == Byte_ascii.Star || c == Byte_ascii.Hash) {
			return Bry_.new_a7("</li>\n<li>");
		}
		else if (c == Byte_ascii.Colon || c == Byte_ascii.Semic) {
			byte[] close = Bry_.new_a7("</dd>\n");
			if (this.DTopen) {
				close = Bry_.new_a7("</dt>\n");
			}
			if (c == Byte_ascii.Semic) {
				this.DTopen = true;
				return Bry_.Add(close, Bry_.new_a7("<dt>"));
			}
			else {
				this.DTopen = false;
				return Bry_.Add(close, Bry_.new_a7("<dd>"));
			}
		}
		return Bry_.new_a7("<!-- ERR 2 -->");
	}

	/**
	* Close the current list item identified by the prefix character.
	* Callers in this class pass characters taken from a prefix in which ';' was
	* already replaced by ':', so ';' is not handled here.
	* @param c prefix character: one of '*', '#', ':'
	*
	* @return the closing tags, or an "ERR 3" HTML comment for an unknown character
	*/
	private byte[] closeList(byte c) {
		byte[] text = null;
		if (c == Byte_ascii.Star) {
			text = Bry_.new_a7("</li></ul>");
		}
		else if (c == Byte_ascii.Hash) {
			text = Bry_.new_a7("</li></ol>");
		}
		else if (c == Byte_ascii.Colon) {
			if (this.DTopen) {
				this.DTopen = false;
				text = Bry_.new_a7("</dt></dl>");
			}
			else {
				text = Bry_.new_a7("</dd></dl>");
			}
		}
		else {
			return Bry_.new_a7("<!-- ERR 3 -->");
		}
		return text;
	}

	/**
	* Execute the pass: walk the source text line by line, emitting list markup
	* for lines starting with '*', '#', ':' or ';' and wrapping other text in
	* p / pre sections.
	*
	* Reads from pbfr.Src(), clears and writes to pbfr.Trg(), and calls pbfr.Switch().
	*
	* @param pctx parser context; not used by this method
	* @param pbfr buffer pair holding the source text and receiving the output
	* @param linestart false if the text does not begin at the start of a line;
	*                  in that case the first line is copied to the output unprocessed
	*/
	public void execute(XomwParserCtx pctx, XomwParserBfr pbfr, boolean linestart) {
		// honor the argument even when execute() is called directly instead of through doBlockLevels()
		this.linestart = linestart;

		// XO.PBFR
		Bry_bfr src_bfr = pbfr.Src();
		byte[] src = src_bfr.Bfr();
		int src_bgn = 0;
		int src_end = src_bfr.Len();
		Bry_bfr bfr = pbfr.Trg();
		pbfr.Switch();

		// XO.STATIC
		InitStatics();

		// clear state
		this.inPre = false;
		this.lastSection = LAST_SECTION_NONE;
		byte[] prefix2 = null;
		bfr.Clear();

		// Parsing through the text line by line.  The main thing
		// happening here is handling of block-level elements p, pre,
		// and making lists from lines starting with * # : etc.
		byte[] lastPrefix = Bry_.Empty;
		this.DTopen = false;
		boolean inBlockElem = false;
		int prefixLen = 0;
		byte pendingPTag = PARA_STACK_NONE;
		boolean inBlockquote = false;

		// PORTED.SPLIT: $textLines = StringUtils::explode("\n", $text);
		int lineBgn = src_bgn;
		while (lineBgn < src_end) {
			int lineEnd = Bry_find_.Find_fwd(src, Byte_ascii.Nl, lineBgn);
			// Clamp to src_end: the search above is not bounded by src_end, so a hit past it must be ignored.
			// NOTE(review): presumably src (the buffer's backing array) can be longer than Len(); confirm against Bry_bfr
			if (lineEnd == Bry_find_.Not_found || lineEnd > src_end)
				lineEnd = src_end;

			// Fix up linestart
			if (!this.linestart) {
				bfr.Add_mid(src, lineBgn, lineEnd);
				this.linestart = true;
				// move on to the next line (PHP: "continue" inside foreach); without this the same line would be processed again
				lineBgn = lineEnd + 1;
				continue;
			}
			// * = ul
			// # = ol
			// ; = dt
			// : = dd
			int lastPrefixLen = lastPrefix.length;

			// PORTED.BGN: preCloseMatch = preg_match('/<\\/pre/i', $oLine); preOpenMatch = preg_match('/<pre/i', $oLine);
			int preCur = lineBgn;
			boolean preCloseMatch = false;
			boolean preOpenMatch = false;
			while (true) {
				if (preCur >= lineEnd)
					break;
				Object o = pre_trie.Match_at(trv, src, preCur, lineEnd);
				if (o == null)
					preCur++;
				else {
					int pre_tid = Int_.cast(o);
					if (pre_tid == PRE_BGN)
						preOpenMatch = true;
					else if (pre_tid == PRE_END)
						preCloseMatch = true;
					preCur = trv.Pos();
				}
			}
			// PORTED.END

			byte[] prefix = null, t = null;
			// If not in a <pre> element, scan for and figure out what prefixes are there.
			if (!this.inPre) {
				// Multiple prefixes may abut each other for nested lists.
				prefixLen = XophpString.strspn_fwd__ary(src, block_chars_ary, lineBgn, lineEnd, lineEnd); // strspn($oLine, '*#:;');
				prefix = XophpString.substr(src, lineBgn, prefixLen);

				// eh?
				// ; and : are both from definition-lists, so they're equivalent
				//  for the purposes of determining whether or not we need to open/close
				//  elements.
				// substr($inputLine, $prefixLength);
				prefix2 = Bry_.Replace(prefix, Byte_ascii.Semic, Byte_ascii.Colon);
				t = Bry_.Mid(src, lineBgn + prefixLen, lineEnd);
				this.inPre = preOpenMatch;
			}
			else {
				// Don't interpret any other prefixes in preformatted text
				prefixLen = 0;
				prefix = prefix2 = Bry_.Empty;
				t = Bry_.Mid(src, lineBgn, lineEnd);
			}

			// List generation
			byte[] term = null, t2 = null;
			int commonPrefixLen = -1;
			if (prefixLen > 0 && Bry_.Eq(lastPrefix, prefix2)) {
				// Same as the last item, so no need to deal with nesting or opening stuff
				bfr.Add(this.nextItem(XophpString.substr_byte(prefix, -1)));
				pendingPTag = PARA_STACK_NONE;

				// prefixLen > 0 is already guaranteed by the enclosing condition
				if (prefix[prefixLen - 1] == Byte_ascii.Semic) {
					// The one nasty exception: definition lists work like this:
					// ; title : definition text
					// So we check for : in the remainder text to split up the
					// title and definition, without b0rking links.
					term = t2 = Bry_.Empty;
					if (this.findColonNoLinks(t, term, t2) != Bry_find_.Not_found) {
						term = find_colon_no_links__before;
						t2   = find_colon_no_links__after;
						t = t2;
						bfr.Add(term).Add(nextItem(Byte_ascii.Colon));
					}
				}
			}
			else if (prefixLen > 0 || lastPrefixLen > 0) {
				// We need to open or close prefixes, or both.

				// Either open or close a level...
				commonPrefixLen = this.getCommon(prefix, lastPrefix);
				pendingPTag = PARA_STACK_NONE;

				// Close all the prefixes which aren't shared.
				while (commonPrefixLen < lastPrefixLen) {
					bfr.Add(this.closeList(lastPrefix[lastPrefixLen - 1]));
					--lastPrefixLen;
				}

				// Continue the current prefix if appropriate.
				if (prefixLen <= commonPrefixLen && commonPrefixLen > 0) {
					bfr.Add(this.nextItem(prefix[commonPrefixLen - 1]));
				}

				// Open prefixes where appropriate.
				if (Bry_.Len_gt_0(lastPrefix) && prefixLen > commonPrefixLen) {
					bfr.Add_byte_nl();
				}
				while (prefixLen > commonPrefixLen) {
					byte c = XophpString.substr_byte(prefix, commonPrefixLen, 1);
					bfr.Add(this.openList(c));

					if (c == Byte_ascii.Semic) {
						// @todo FIXME: This is dupe of code above
						if (findColonNoLinks(t, term, t2) != Bry_find_.Not_found) {
							term = find_colon_no_links__before;
							t2   = find_colon_no_links__after;
							t = t2;
							bfr.Add(term).Add(nextItem(Byte_ascii.Colon));
						}
					}
					++commonPrefixLen;
				}
				if (prefixLen == 0 && Bry_.Len_gt_0(lastPrefix)) {
					bfr.Add_byte_nl();
				}
				lastPrefix = prefix2;
			}

			// If we have no prefixes, go to paragraph mode.
			if (0 == prefixLen) {
				// No prefix (not in list)--go to paragraph mode
				// @todo consider using a stack for nestable elements like span, table and div
				int tLen = t.length;

				// XO.MW.PORTED.BGN:
				boolean openMatch = XophpPreg.match(openMatchTrie, trv, t, 0, tLen) != null;
				boolean closeMatch = XophpPreg.match(closeMatchTrie, trv, t, 0, tLen) != null;
				// XO.MW.PORTED.END
				if (openMatch || closeMatch) {
					pendingPTag = PARA_STACK_NONE;
					// @todo bug 5718: paragraph closed
					bfr.Add(this.closeParagraph());
					if (preOpenMatch && !preCloseMatch) {
						this.inPre = true;
					}
					int bqOffset = 0;
					// PORTED:preg_match('/<(\\/?)blockquote[\s>]/i', t, $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset)
					while (true) {
						Object o = XophpPreg.match(blockquoteTrie, trv, t, bqOffset, tLen);
						if (o == null) { // no more blockquotes found; exit
							break;
						}
						else {
							byte[] bq_bry = (byte[])o;
							inBlockquote = bq_bry[1] != Byte_ascii.Slash; // is this a close tag?
							bqOffset = trv.Pos();
						}
					}
					// PORTED:END
					inBlockElem = !closeMatch;
				}
				else if (!inBlockElem && !this.inPre) {
					// Test emptiness by length; comparing the trimmed array by reference to Bry_.Empty
					// only works if Trim happens to return that exact instance.
					boolean tIsBlank = Bry_.Trim(t).length == 0;
					if (XophpString.substr_byte(t, 0) == Byte_ascii.Space
						&& (this.lastSection == LAST_SECTION_PRE || !tIsBlank)
						&& !inBlockquote
					) {
						// pre
						if (this.lastSection != LAST_SECTION_PRE) {
							pendingPTag = PARA_STACK_NONE;
							bfr.Add(closeParagraph()).Add(Gfh_tag_.Pre_lhs);
							this.lastSection = LAST_SECTION_PRE;
						}
						t = Bry_.Mid(t, 1);
					}
					else {
						// paragraph
						if (tIsBlank) {
							if (pendingPTag != PARA_STACK_NONE) {
								ParaStackAdd(bfr, pendingPTag);
								bfr.Add_str_a7("<br />");
								pendingPTag = PARA_STACK_NONE;
								this.lastSection = LAST_SECTION_PARA;
							}
							else {
								if (this.lastSection != LAST_SECTION_PARA) {
									bfr.Add(this.closeParagraph());
									this.lastSection = LAST_SECTION_NONE;
									pendingPTag = PARA_STACK_BGN;
								}
								else {
									pendingPTag = PARA_STACK_MID;
								}
							}
						}
						else {
							if (pendingPTag != PARA_STACK_NONE) {
								ParaStackAdd(bfr, pendingPTag);
								pendingPTag = PARA_STACK_NONE;
								this.lastSection = LAST_SECTION_PARA;
							}
							else if (lastSection != LAST_SECTION_PARA) {
								bfr.Add(this.closeParagraph()).Add(Gfh_tag_.P_lhs);
								this.lastSection = LAST_SECTION_PARA;
							}
						}
					}
				}
			}
			// somewhere above we forget to get out of pre block (bug 785)
			if (preCloseMatch && this.inPre) {
				this.inPre = false;
			}
			// while a paragraph tag is pending, the (blank) line itself is not emitted
			if (pendingPTag == PARA_STACK_NONE) {
				bfr.Add(t);
				if (prefixLen == 0) {
					bfr.Add_byte_nl();
				}
			}

			lineBgn = lineEnd + 1;
		}

		// close whatever lists are still open after the last line
		while (prefixLen > 0) {
			bfr.Add(this.closeList(prefix2[prefixLen - 1]));
			--prefixLen;
			if (prefixLen > 0) {
				bfr.Add_byte_nl();
			}
		}
		// close a trailing p / pre section
		if (this.lastSection != LAST_SECTION_NONE) {
			bfr.Add(this.lastSection == LAST_SECTION_PARA ? Gfh_tag_.P_rhs : Gfh_tag_.Pre_rhs);
			this.lastSection = LAST_SECTION_NONE;
		}
	}

	/**
	* Split up a String on ':', ignoring any occurrences inside tags
	* to prevent illegal overlapping.
	*
	* XOMW: MW passes $before / $after by reference; XO instead stores the two halves in
	* find_colon_no_links__before / find_colon_no_links__after and depends on callers
	* to read them when the result is not Bry_find_.Not_found.
	*
	* @param str The String to split
	* @param before unused; kept to mirror the MW signature
	* @param after unused; kept to mirror the MW signature
	* @return The position of the ':', or Bry_find_.Not_found if none found
	*/
	private int findColonNoLinks(byte[] str, byte[] before, byte[] after) {
		int len = str.length;
		int colonPos = XophpString.strpos(str, Byte_ascii.Colon, 0, len);
		if (colonPos == Bry_find_.Not_found) {
			// Nothing to find!
			return Bry_find_.Not_found;
		}

		int ltPos = XophpString.strpos(str, Byte_ascii.Angle_bgn, 0, len);
		if (ltPos == Bry_find_.Not_found || ltPos > colonPos) {
			// Easy; no tag nesting to worry about
			find_colon_no_links__before = XophpString.substr(str, 0, colonPos);
			find_colon_no_links__after = XophpString.substr(str, colonPos + 1);
			return colonPos;
		}

		// Ugly state machine to walk through avoiding tags.
		int state = COLON_STATE_TEXT;
		int level = 0;	// nesting depth: incremented on "<tag>", decremented on "</tag>"
		for (int i = 0; i < len; i++) {
			byte c = str[i];

			switch (state) {
				case COLON_STATE_TEXT:
					switch (c) {
						case Byte_ascii.Angle_bgn:
							// Could be either a <start> tag or an </end> tag
							state = COLON_STATE_TAGSTART;
							break;
						case Byte_ascii.Colon:
							if (level == 0) {
								// We found it!
								find_colon_no_links__before = XophpString.substr(str, 0, i);
								find_colon_no_links__after = XophpString.substr(str, i + 1);
								return i;
							}
							// Embedded in a tag; don't break it.
							break;
						default:
							// Skip ahead looking for something interesting
							colonPos = XophpString.strpos(str, Byte_ascii.Colon, i, len);
							if (colonPos == Bry_find_.Not_found) {
								// Nothing else interesting
								return Bry_find_.Not_found;
							}
							ltPos = XophpString.strpos(str, Byte_ascii.Angle_bgn, i, len);
							if (level == 0) {
								if (ltPos == Bry_find_.Not_found || colonPos < ltPos) {
									// We found it!
									find_colon_no_links__before = XophpString.substr(str, 0, colonPos);
									find_colon_no_links__after = XophpString.substr(str, colonPos + 1);
									// return the colon's position (the split point), not the scan index
									return colonPos;
								}
							}
							if (ltPos == Bry_find_.Not_found) {
								// Nothing else interesting to find; abort!
								// We're nested, but there's no close tags left. Abort!
								i = len;	// break 2
								break;
							}
							// Skip ahead to next tag start
							i = ltPos;
							state = COLON_STATE_TAGSTART;
							break;
						}
					break;
				case COLON_STATE_TAG:
					// In a <tag>
					switch (c) {
						case Byte_ascii.Angle_end:
							level++;
							state = COLON_STATE_TEXT;
							break;
						case Byte_ascii.Slash:
							// Slash may be followed by >?
							state = COLON_STATE_TAGSLASH;
							break;
						default:
							// ignore
							break;
					}
					break;
				case COLON_STATE_TAGSTART:
					switch (c) {
						case Byte_ascii.Slash:
							state = COLON_STATE_CLOSETAG;
							break;
						case Byte_ascii.Bang:
							state = COLON_STATE_COMMENT;
							break;
						case Byte_ascii.Angle_end:
							// Illegal early close? This shouldn't happen D:
							state = COLON_STATE_TEXT;
							break;
						default:
							state = COLON_STATE_TAG;
							break;
					}
					break;
				case COLON_STATE_CLOSETAG:
					// In a </tag>
					if (c == Byte_ascii.Angle_end) {
						level--;
						if (level < 0) {
							Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; too many close tags");
							return Bry_find_.Not_found;
						}
						state = COLON_STATE_TEXT;
					}
					break;
				case COLON_STATE_TAGSLASH:
					if (c == Byte_ascii.Angle_end) {
						// Yes, a self-closed tag <blah/>
						state = COLON_STATE_TEXT;
					}
					else {
						// Probably we're jumping the gun, and this is an attribute
						state = COLON_STATE_TAG;
					}
					break;
				case COLON_STATE_COMMENT:
					if (c == Byte_ascii.Dash) {
						state = COLON_STATE_COMMENTDASH;
					}
					break;
				case COLON_STATE_COMMENTDASH:
					if (c == Byte_ascii.Dash) {
						state = COLON_STATE_COMMENTDASHDASH;
					}
					else {
						state = COLON_STATE_COMMENT;
					}
					break;
				case COLON_STATE_COMMENTDASHDASH:
					// a comment is terminated by "-->", so '>' (not '<') ends it
					if (c == Byte_ascii.Angle_end) {
						state = COLON_STATE_TEXT;
					}
					else {
						state = COLON_STATE_COMMENT;
					}
					break;
			default:
				throw Err_.new_wo_type("State machine error");
			}
		}
		if (level > 0) {
			Gfo_usr_dlg_.Instance.Warn_many("", "", "Invalid input; not enough close tags (level ~{0}, state ~{1})", level, state);
			return Bry_find_.Not_found;
		}
		return Bry_find_.Not_found;
	}

	private static final byte
	  LAST_SECTION_NONE = 0    // ''
	, LAST_SECTION_PARA = 1    // p
	, LAST_SECTION_PRE  = 2    // pre
	;
	private static final byte
	  PARA_STACK_NONE = 0	// false
	, PARA_STACK_BGN  = 1	// <p>
	, PARA_STACK_MID  = 2	// </p><p>
	;
	private static final int PRE_BGN = 0, PRE_END = 1;
	private static Btrie_slim_mgr pre_trie;
	// Doubles as the "statics are initialized" flag: volatile, and assigned last in InitStatics(),
	// so a thread that sees it non-null also sees the fully built tries.
	private static volatile boolean[] block_chars_ary;
	// Lookup table indexed by byte value; true for the list-prefix characters * # : ;
	private static boolean[] Block_chars_ary__new() {
		boolean[] rv = new boolean[256];
		rv[Byte_ascii.Star] = true;
		rv[Byte_ascii.Hash] = true;
		rv[Byte_ascii.Colon] = true;
		rv[Byte_ascii.Semic] = true;
		return rv;
	}
	private static Btrie_slim_mgr openMatchTrie, closeMatchTrie, blockquoteTrie;
	// Lazily builds the shared tries and prefix table on first use (double-checked on block_chars_ary).
	private static void InitStatics() {
		if (block_chars_ary != null) return;
		synchronized (XomwBlockLevelPass.class) {
			if (block_chars_ary != null) return;	// another thread finished while this one waited for the lock
			openMatchTrie = Btrie_slim_mgr.ci_a7().Add_many_str
			( "<table", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<pre", "<tr"
			, "<p", "<ul", "<ol", "<dl", "<li", "</tr", "</td", "</th");
			closeMatchTrie = Btrie_slim_mgr.ci_a7().Add_many_str
			( "</table", "</h1", "</h2", "</h3", "</h4", "</h5", "</h6"
			, "<td", "<th", "<blockquote", "</blockquote", "<div", "</div", "<hr", "</pre", "</p", "</mw:"
			, XomwParser.MARKER_PREFIX_STR + "-pre"
			, "</li", "</ul", "</ol", "</dl", "<center", "</center");
			blockquoteTrie = Btrie_slim_mgr.ci_a7().Add_many_str("<blockquote", "</blockquote");
			pre_trie = Btrie_slim_mgr.ci_a7().Add_str_int("<pre", PRE_BGN).Add_str_int("</pre", PRE_END);
			// publish the flag only after everything else is in place
			block_chars_ary = Block_chars_ary__new();
		}
	}
	// Writes the pending paragraph markup: "<p>" for PARA_STACK_BGN, "</p><p>" for PARA_STACK_MID.
	private static void ParaStackAdd(Bry_bfr bfr, int id) {
		switch (id) {
			case PARA_STACK_BGN: bfr.Add_str_a7("<p>"); break;
			case PARA_STACK_MID: bfr.Add_str_a7("</p><p>"); break;
			default:              throw Err_.new_unhandled_default(id);
		}
	}
}