/* XOWA: the XOWA Offline Wiki Application Copyright (C) 2012-2017 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. You may use XOWA according to either of these licenses as is most appropriate for your project on a case-by-case basis. The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers.tables; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; import gplx.xowa.mediawiki.includes.parsers.*; import gplx.xowa.parsers.htmls.*; import gplx.xowa.mediawiki.includes.libs.*; import gplx.xowa.parsers.uniqs.*; public class Xomw_table_wkr implements gplx.core.brys.Bry_split_wkr {// THREAD.UNSAFE: caching for repeated calls private final Bry_bfr tmp; private Bry_bfr bfr; private final XomwSanitizer sanitizer; private final XomwStripState strip_state; private final List_adp td_history = List_adp_.New() // Is currently a td tag open? , last_tag_history = List_adp_.New() // Save history of last lag activated (td, th or caption) , tr_history = List_adp_.New() // Is currently a tr tag open? , tr_attributes = List_adp_.New() // history of tr attributes , has_opened_tr = List_adp_.New() // Did this table open a <tr> element? ; private int indent_level = 0; // indent level of the table private byte[] first_2 = new byte[2]; public Xomw_table_wkr(Bry_bfr tmp, XomwSanitizer sanitizer, XomwStripState stripState) { this.tmp = tmp; this.sanitizer = sanitizer; this.strip_state = stripState; } public void doTableStuff(XomwParserCtx pctx, XomwParserBfr pbfr) { Bry_bfr src_bfr = pbfr.Src(); byte[] src = src_bfr.Bfr(); int src_bgn = 0; int src_end = src_bfr.Len(); this.bfr = pbfr.Trg(); pbfr.Switch(); indent_level = 0; Bry_split_.Split(src, src_bgn, src_end, Byte_ascii.Nl, Bool_.N, this); // PORTED.SPLIT: $lines = StringUtils::explode("\n", $text); // Closing open td, tr && table while (td_history.Len() > 0) { if (XophpArray.popBoolOrN(td_history)) { bfr.Add_str_a7("</td>\n"); } if (XophpArray.popBoolOrN(tr_history)) { bfr.Add_str_a7("</tr>\n"); } if (!XophpArray.popBoolOrN(has_opened_tr)) { bfr.Add_str_a7("<tr><td></td></tr>\n"); } bfr.Add_str_a7("</table>\n"); } // Remove trailing line-ending (b/c) if (bfr.Get_at_last_or_nil_if_empty() == Byte_ascii.Nl) { bfr.Del_by_1(); } // special case: don't return empty table if ( bfr.Len() == Len__tb__empty && Bry_.Eq(bfr.Bfr(), 0, Len__tb__empty, Html__tb__empty)) { bfr.Clear(); return; } } public int Split(byte[] src, int itm_bgn, int itm_end) { byte[] out_line = Bry_.Mid(src, itm_bgn, itm_end); // MW: "$outLine" byte[] line = Bry_.Trim(out_line); // MW: "$line" int line_len = line.length; if (line_len == 0) { // empty line, go to next line bfr.Add(out_line).Add_byte_nl(); return Bry_split_.Rv__ok; } byte first_char = line[0]; first_2[0] = line[0]; first_2[1] = line_len == 1 ? Byte_ascii.Null : line[1]; // PORTED: preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches) byte[] tblw_atrs = null; boolean tblw_bgn_found = false; int colons_end = Bry_find_.Find_fwd_while(src, 0, line_len, Byte_ascii.Colon); int tblw_bgn = Bry_find_.Find_fwd_while(line, colons_end, line_len, Byte_ascii.Space); int tblw_atrs_bgn = tblw_bgn + 2; if (Bry_.Eq(line, tblw_bgn, tblw_atrs_bgn, Wtxt__tb__bgn)) { tblw_bgn_found = true; tblw_atrs = (tblw_atrs_bgn == line_len) ? Bry_.Empty : Bry_.Mid(line, tblw_atrs_bgn, line_len); } if (tblw_bgn_found) { // First check if we are starting a new table indent_level = colons_end; tblw_atrs = strip_state.unstripBoth(tblw_atrs); // PORTED: out_line = str_repeat('<dl><dd>', $indent_level) . "<table{atrs}>"; for (int j = 0; j < indent_level; j++) tmp.Add(Html__dl__bgn); tmp.Add_str_a7("<table"); sanitizer.fixTagAttributes(tmp, Name__table, tblw_atrs); tmp.Add_byte(Byte_ascii.Angle_end); out_line = tmp.To_bry_and_clear(); td_history.Add(false); last_tag_history.Add(Bry_.Empty); tr_history.Add(false); tr_attributes.Add(Bry_.Empty); has_opened_tr.Add(false); } else if (td_history.Len() == 0) { // Don't do any of the following bfr.Add(out_line).Add_byte_nl(); return Bry_split_.Rv__ok; } else if (Bry_.Eq(first_2, Wtxt__tb__end)) { // We are ending a table line = tmp.Add_str_a7("</table>").Add_mid(line, 2, line.length).To_bry_and_clear(); byte[] last_tag = XophpArray.popBryOrNull(last_tag_history); if (!XophpArray.popBoolOrN(has_opened_tr)) { line = tmp.Add_str_a7("<tr><td></td></tr>").Add(line).To_bry_and_clear(); } if (XophpArray.popBoolOrN(tr_history)) { line = tmp.Add_str_a7("</tr>").Add(line).To_bry_and_clear(); } if (XophpArray.popBoolOrN(td_history)) { line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(line).To_bry_and_clear(); } XophpArray.popBryOrNull(tr_attributes); // PORTED:$outLine = $line . str_repeat( '</dd></dl>', $indent_level ); tmp.Add(line); for (int j = 0; j < indent_level; j++) tmp.Add(Html__dl__end); out_line = tmp.To_bry_and_clear(); } else if (Bry_.Eq(first_2, Wtxt__tr)) { // Now we have a table row line = Bry_.Mid(line, 2); // PORTED: $line = preg_replace('#^\|-+#', '', $line); // Whats after the tag is now only attributes byte[] atrs = strip_state.unstripBoth(line); sanitizer.fixTagAttributes(tmp, Name__tr, atrs); atrs = tmp.To_bry_and_clear(); XophpArray.popBryOrNull(tr_attributes); tr_attributes.Add(atrs); line = Bry_.Empty; byte[] last_tag = XophpArray.popBryOrNull(last_tag_history); XophpArray.popBoolOrN(has_opened_tr); has_opened_tr.Add(true); if (XophpArray.popBoolOrN(tr_history)) { line = Html__tr__end; } if (XophpArray.popBoolOrN(td_history)) { line = tmp.Add_str_a7("</").Add(last_tag).Add_byte(Byte_ascii.Gt).Add(line).To_bry_and_clear(); } out_line = line; tr_history.Add(false); td_history.Add(false); last_tag_history.Add(Bry_.Empty); } else if ( first_char == Byte_ascii.Pipe || first_char == Byte_ascii.Bang || Bry_.Eq(first_2, Wtxt__caption) ) { // This might be cell elements, td, th or captions if (Bry_.Eq(first_2, Wtxt__caption)) { first_char = Byte_ascii.Plus; line = Bry_.Mid(line, 2); } else { line = Bry_.Mid(line, 1); } // Implies both are valid for table headings. if (first_char == Byte_ascii.Bang) { XomwStringUtils.replaceMarkup(line, 0, line.length, Wtxt__th2, Wtxt__td2); // $line = StringUtils::replaceMarkup('!!', '||', $line); } // Split up multiple cells on the same line. // FIXME : This can result in improper nesting of tags processed // by earlier parser steps. byte[][] cells = Bry_split_.Split(line, Wtxt__td2); if (cells.length == 0) cells = Cells__empty; // handle "\n|\n" which should still generate "<tr><td></td></tr>", not ""; see TEST out_line = Bry_.Empty; byte[] previous = null; // Loop through each table cell int cells_len = cells.length; for (int j = 0; j < cells_len; j++) { byte[] cell = cells[j]; previous = Bry_.Empty; if (first_char != Byte_ascii.Plus) { byte[] tr_after = XophpArray.popBryOrNull(tr_attributes); if (!XophpArray.popBoolOrN(tr_history)) { previous = tmp.Add_str_a7("<tr").Add(tr_after).Add_str_a7(">\n").To_bry_and_clear(); } tr_history.Add(true); tr_attributes.Add(Bry_.Empty); XophpArray.popBoolOrN(has_opened_tr); has_opened_tr.Add(true); } byte[] last_tag = XophpArray.popBryOrNull(last_tag_history); if (XophpArray.popBoolOrN(td_history)) { previous = tmp.Add_str_a7("</").Add(last_tag).Add_str_a7(">\n").Add(previous).To_bry_and_clear(); } if (first_char == Byte_ascii.Pipe) { last_tag = Name__td; } else if (first_char == Byte_ascii.Bang) { last_tag = Name__th; } else if (first_char == Byte_ascii.Plus) { last_tag = Name__caption; } else { last_tag = Bry_.Empty; } last_tag_history.Add(last_tag); // A cell could contain both parameters and data byte[][] cell_data = Bry_split_.Split_w_max(cell, Byte_ascii.Pipe, 2); // Bug 553: Note that a '|' inside an invalid link should not // be mistaken as delimiting cell parameters byte[] cell_data_0 = cell_data[0]; byte[] cell_data_1 = cell_data[1]; if (Bry_find_.Find_fwd(cell_data_0, Wtxt__lnki__bgn) != Bry_find_.Not_found) { cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell).To_bry_and_clear(); } else if (cell_data_1 == null) { cell = tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag).Add_byte(Byte_ascii.Angle_end).Add(cell_data_0).To_bry_and_clear(); } else { byte[] atrs = strip_state.unstripBoth(cell_data_0); tmp.Add(previous).Add_byte(Byte_ascii.Angle_bgn).Add(last_tag); sanitizer.fixTagAttributes(tmp, last_tag, atrs); tmp.Add_byte(Byte_ascii.Angle_end).Add(cell_data_1); cell = tmp.To_bry_and_clear(); } out_line = Bry_.Add(out_line, cell); td_history.Add(true); } } bfr.Add(out_line).Add_byte_nl(); return Bry_split_.Rv__ok; } // public function doTableStuff($text) { // // $lines = StringUtils::explode("\n", $text); // $out = ''; // $td_history = []; # Is currently a td tag open? // $last_tag_history = []; # Save history of last lag activated (td, th or caption) // $tr_history = []; # Is currently a tr tag open? // $tr_attributes = []; # history of tr attributes // $has_opened_tr = []; # Did this table open a <tr> element? // $indent_level = 0; # indent level of the table // // foreach ($lines as $outLine) { // $line = trim($outLine); // // if ($line === '') { # empty line, go to next line // $out .= $outLine . "\n"; // continue; // } // // $first_character = $line[0]; // $first_two = substr($line, 0, 2); // $matches = []; // // if (preg_match('/^(:*)\s*\{\|(.*)$/', $line, $matches)) { // # First check if we are starting a new table // $indent_level = strlen($matches[1]); // // $attributes = this.mStripState->unstripBoth($matches[2]); // $attributes = Sanitizer::fixTagAttributes($attributes, 'table'); // // $outLine = str_repeat('<dl><dd>', $indent_level) . "<table{$attributes}>"; // array_push($td_history, false); // array_push($last_tag_history, ''); // array_push($tr_history, false); // array_push($tr_attributes, ''); // array_push($has_opened_tr, false); // } elseif (count($td_history) == 0) { // # Don't do any of the following // $out .= $outLine . "\n"; // continue; // } elseif ($first_two === '|}') { // # We are ending a table // $line = '</table>' . substr($line, 2); // $last_tag = array_pop($last_tag_history); // // if (!array_pop($has_opened_tr)) { // $line = "<tr><td></td></tr>{$line}"; // } // // if (array_pop($tr_history)) { // $line = "</tr>{$line}"; // } // // if (array_pop($td_history)) { // $line = "</{$last_tag}>{$line}"; // } // array_pop($tr_attributes); // $outLine = $line . str_repeat('</dd></dl>', $indent_level); // } elseif ($first_two === '|-') { // # Now we have a table row // $line = preg_replace('#^\|-+#', '', $line); // // # Whats after the tag is now only attributes // $attributes = this.mStripState->unstripBoth($line); // $attributes = Sanitizer::fixTagAttributes($attributes, 'tr'); // array_pop($tr_attributes); // array_push($tr_attributes, $attributes); // // $line = ''; // $last_tag = array_pop($last_tag_history); // array_pop($has_opened_tr); // array_push($has_opened_tr, true); // // if (array_pop($tr_history)) { // $line = '</tr>'; // } // // if (array_pop($td_history)) { // $line = "</{$last_tag}>{$line}"; // } // // $outLine = $line; // array_push($tr_history, false); // array_push($td_history, false); // array_push($last_tag_history, ''); // } elseif ($first_character === '|' // || $first_character === '!' // || $first_two === '|+' // ) { // # This might be cell elements, td, th or captions // if ($first_two === '|+') { // $first_character = '+'; // $line = substr($line, 2); // } else { // $line = substr($line, 1); // } // // // Implies both are valid for table headings. // if ($first_character === '!') { // $line = StringUtils::replaceMarkup('!!', '||', $line); // } // // # Split up multiple cells on the same line. // # FIXME : This can result in improper nesting of tags processed // # by earlier parser steps. // $cells = explode('||', $line); // // $outLine = ''; // // # Loop through each table cell // foreach ($cells as $cell) { // $previous = ''; // if ($first_character !== '+') { // $tr_after = array_pop($tr_attributes); // if (!array_pop($tr_history)) { // $previous = "<tr{$tr_after}>\n"; // } // array_push($tr_history, true); // array_push($tr_attributes, ''); // array_pop($has_opened_tr); // array_push($has_opened_tr, true); // } // // $last_tag = array_pop($last_tag_history); // // if (array_pop($td_history)) { // $previous = "</{$last_tag}>\n{$previous}"; // } // // if ($first_character === '|') { // $last_tag = 'td'; // } elseif ($first_character === '!') { // $last_tag = 'th'; // } elseif ($first_character === '+') { // $last_tag = 'caption'; // } else { // $last_tag = ''; // } // // array_push($last_tag_history, $last_tag); // // # A cell could contain both parameters and data // $cell_data = explode('|', $cell, 2); // // # T2553: Note that a '|' inside an invalid link should not // # be mistaken as delimiting cell parameters // # Bug T153140: Neither should language converter markup. // if (preg_match('/\[\[|-\{/', $cell_data[0]) === 1) { // $cell = "{$previous}<{$last_tag}>{$cell}"; // } elseif (count($cell_data) == 1) { // $cell = "{$previous}<{$last_tag}>{$cell_data[0]}"; // } else { // $attributes = this.mStripState->unstripBoth($cell_data[0]); // $attributes = Sanitizer::fixTagAttributes($attributes, $last_tag); // $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}"; // } // // $outLine .= $cell; // array_push($td_history, true); // } // } // $out .= $outLine . "\n"; // } // // # Closing open td, tr && table // while (count($td_history) > 0) { // if (array_pop($td_history)) { // $out .= "</td>\n"; // } // if (array_pop($tr_history)) { // $out .= "</tr>\n"; // } // if (!array_pop($has_opened_tr)) { // $out .= "<tr><td></td></tr>\n"; // } // // $out .= "</table>\n"; // } // // # Remove trailing line-ending (b/c) // if (substr($out, -1) === "\n") { // $out = substr($out, 0, -1); // } // // # special case: don't return empty table // if ($out === "<table>\n<tr><td></td></tr>\n</table>") { // $out = ''; // } // // return $out; // } private static final byte[] Wtxt__tb__bgn = Bry_.new_a7("{|") , Wtxt__tb__end = Bry_.new_a7("|}") , Wtxt__tr = Bry_.new_a7("|-") , Wtxt__caption = Bry_.new_a7("|+") , Wtxt__th2 = Bry_.new_a7("!!") , Wtxt__td2 = Bry_.new_a7("||") , Wtxt__lnki__bgn = Bry_.new_a7("[[") , Name__table = Bry_.new_a7("table") , Name__tr = Bry_.new_a7("tr") , Name__td = Bry_.new_a7("td") , Name__th = Bry_.new_a7("th") , Name__caption = Bry_.new_a7("caption") , Html__tr__end = Bry_.new_a7("</tr>") , Html__dl__bgn = Bry_.new_a7("<dl><dd>") , Html__dl__end = Bry_.new_a7("</dd></dl>") , Html__tb__empty = Bry_.new_a7("<table>\n<tr><td></td></tr>\n</table>") ; private static final int Len__tb__empty = Html__tb__empty.length; private static final byte[][] Cells__empty = new byte[][] {Bry_.Empty}; }