/* XOWA: the XOWA Offline Wiki Application Copyright (C) 2012-2017 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. You may use XOWA according to either of these licenses as is most appropriate for your project on a case-by-case basis. The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.mediawiki.includes.parsers; import gplx.*; import gplx.xowa.*; import gplx.xowa.mediawiki.*; import gplx.xowa.mediawiki.includes.*; public class Xomw_regex_parser { private Bry_bfr tmp; public byte[][] Rslt() {return rslt;} private byte[][] rslt; public Xomw_regex_parser Add_ary(String... ary) {return Set_or_add(Parse_ary(ary));} private byte[][] Parse_ary(String... ary) { if (tmp == null) tmp = Bry_bfr_.New(); int ary_len = ary.length; byte[][] rv = new byte[ary_len][]; for (int i = 0; i < ary_len; i++) { rv[i] = Compile_itm(tmp, Bry_.new_u8(ary[i])); } return rv; } public Xomw_regex_parser Add_rng(String bgn, String end) {return Set_or_add(Parse_rng(bgn, end));} private byte[][] Parse_rng(String bgn, String end) { if (tmp == null) tmp = Bry_bfr_.New(); byte[] bgn_bry = Compile_itm(tmp, Bry_.new_u8(bgn)); int bgn_val = gplx.core.intls.Utf16_.Decode_to_int(bgn_bry, 0); byte[] end_bry = Compile_itm(tmp, Bry_.new_u8(end)); int end_val = gplx.core.intls.Utf16_.Decode_to_int(end_bry, 0); int rv_len = end_val - bgn_val + 1; byte[][] rv = new byte[rv_len][]; for (int i = 0; i < rv_len; i++) { rv[i] = gplx.core.intls.Utf16_.Encode_int_to_bry(i + bgn_val); } return rv; } private Xomw_regex_parser Set_or_add(byte[][] val) { rslt = rslt == null ? val : Bry_.Ary_add(rslt, val); return this; } private static byte[] Compile_itm(Bry_bfr tmp, byte[] src) { // parse each itm int src_end = src.length; int cur = 0; int prv = cur; boolean dirty = false; while (true) { // eos if (cur == src_end) { if (dirty) tmp.Add_mid(src, prv, src_end); break; } // look at byte byte b = src[cur]; switch (b) { // escape case Byte_ascii.Backslash: int nxt = cur + 1; if (nxt >= src_end) throw Err_.new_wo_type("regex escape failed: no more chars left", "src", src, "pos", nxt); byte nxt_byte = src[nxt]; switch (nxt_byte) { case Byte_ascii.Ltr_s: // \s -> " " src = Byte_ascii.Space_bry; cur = src_end; break; case Byte_ascii.Ltr_x: // \ u -> utf8 sequence in hex-dec; EX: "\xc2\xad" -> new byte[] {194, 160} // read next two bytes dirty = true; nxt++; if (nxt + 2 > src_end) throw Err_.new_wo_type("utf8 escape failed: no more chars left", "src", src, "pos", nxt); tmp.Add_byte((byte)gplx.core.encoders.Hex_utl_.Parse_or(src, nxt, nxt + 2, -1)); cur = nxt + 2; prv = cur; break; default: throw Err_.new_wo_type("regex escape failed: unknown char", "src", src, "pos", nxt); } break; default: // handles ascii only if (b > 127) throw Err_.new_wo_type("regex compiled failed: unknown char", "src", src, "pos", cur); cur++; break; } } // set item return dirty ? tmp.To_bry_and_clear() : src; } }