/*
XOWA: the XOWA Offline Wiki Application
Copyright (C) 2012-2017 gnosygnu@gmail.com
XOWA is licensed under the terms of the General Public License (GPL) Version 3,
or alternatively under the terms of the Apache License Version 2.0.
You may use XOWA according to either of these licenses as is most appropriate
for your project on a case-by-case basis.
The terms of each license can be found in the source code repository:
GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt
Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt
*/
package gplx.xowa.addons.wikis.searchs.searchers; import gplx.*; import gplx.xowa.*; import gplx.xowa.addons.*; import gplx.xowa.addons.wikis.*; import gplx.xowa.addons.wikis.searchs.*;
import gplx.core.btries.*;
import gplx.xowa.addons.wikis.searchs.searchers.crts.*;
/**
 * Holds a search phrase in two forms: the text as supplied ({@link #Orig}) and the
 * lower-cased, optionally wildcarded text ({@link #Compiled}).
 * Instances are normally built through {@link #New}.
 */
public class Srch_search_phrase {
  /**
   * Stores the given values as-is; no copying or validation is done.
   *
   * @param wildcard whether the phrase is treated as a wildcard search
   * @param orig     phrase text before lower-casing / wildcarding
   * @param compiled phrase text after lower-casing / wildcarding
   * @param syms     scanner symbols (wildcard, escape, space, ...) used for this phrase
   */
  public Srch_search_phrase(boolean wildcard, byte[] orig, byte[] compiled, Srch_crt_scanner_syms syms) {
    this.Orig = orig;
    this.Compiled = compiled;
    this.Wildcard = wildcard;
    this.Syms = syms;
  }
  public final boolean Wildcard;
  public final byte[] Orig;       // EX: "Earth"
  public final byte[] Compiled;   // EX: "earth*"
  public final Srch_crt_scanner_syms Syms;

  /**
   * Builds a phrase from raw search text.
   *
   * If the last byte of {@code orig} is the wildcard symbol, it is removed from the stored
   * original and auto-wildcarding is forced on. The remaining text is lower-cased through
   * {@code case_mgr} and then passed to {@link #Auto_wildcard}.
   * Note that only the last byte is inspected: a trailing wildcard symbol is stripped even
   * when the byte before it is the escape symbol.
   *
   * @param case_mgr      used to lower-case the phrase
   * @param syms          scanner symbols
   * @param auto_wildcard whether a wildcard should be added automatically
   * @param orig          raw search text
   * @return the new phrase; its {@code Wildcard} field is the effective auto-wildcard flag
   */
  public static Srch_search_phrase New(gplx.xowa.langs.cases.Xol_case_mgr case_mgr, Srch_crt_scanner_syms syms, boolean auto_wildcard, byte[] orig) {
    int orig_len = orig.length;
    if (   orig_len > 0   // if "*" at end, remove and change to wildcard; needed for Special:Search which will send in "earth*" but "earth" needed for highlighting
        && orig[orig_len - 1] == syms.Wild()) {
      orig = Bry_.Mid(orig, 0, orig_len - 1);
      auto_wildcard = true;
    }
    byte[] lcase = case_mgr.Case_build_lower(orig);
    lcase = Auto_wildcard(lcase, auto_wildcard, syms);
    return new Srch_search_phrase(auto_wildcard, orig, lcase, syms);
  }

  /**
   * Adds a wildcard symbol after the last word of {@code raw} when that is permitted.
   *
   * The text is scanned from the end. Trailing close-paren tokens are skipped, so the
   * wildcard is placed before them. The scan stops at the first byte that is not a
   * close-paren:
   * <ul>
   * <li>a byte that is not a scanner symbol and not the wildcard symbol: wildcard goes after it</li>
   * <li>an escape symbol directly preceded by another escape symbol: wildcard goes after it; EX: "\\" -> "\\*"</li>
   * <li>a wildcard, quote, space, not, and, or, open-paren, or lone escape symbol: no wildcard is
   *     added, unless that symbol is directly preceded by the escape symbol</li>
   * </ul>
   * No wildcard is added if the same word (bytes back to the previous space symbol) already
   * contains an unescaped wildcard.
   *
   * @param raw           lower-cased search text; never modified
   * @param auto_wildcard when false, {@code raw} is returned unchanged
   * @param syms          scanner symbols and the trie used to classify them
   * @return {@code raw} itself when nothing is added; otherwise a new array with one wildcard inserted
   */
  public static byte[] Auto_wildcard(byte[] raw, boolean auto_wildcard, Srch_crt_scanner_syms syms) {
    // honor the flag on every path; previously a wildcard was still inserted before trailing ")" when the flag was off
    if (!auto_wildcard) return raw;

    Btrie_slim_mgr trie = syms.Trie();
    int raw_len = raw.length;
    int insert_pos = -1;  // idx of byte after which wildcard will be inserted
    int fail_pos = -1;    // idx of symbol that blocks wildcarding (unless it turns out to be escaped)
    for (int i = raw_len - 1; i > -1; --i) {
      byte b = raw[i];
      byte tid = trie.Match_byte_or(b, raw, i, i + 1, Byte_.Max_value_127);
      if (tid == Byte_.Max_value_127) {  // unknown sym
        if (b == syms.Wild())            // wildcard is not tokenized
          fail_pos = i;
        else                             // alphanum-char
          insert_pos = i;
        break;
      }
      switch (tid) {
        case Srch_crt_tkn.Tid__quote:
        case Srch_crt_tkn.Tid__space:
        case Srch_crt_tkn.Tid__not:
        case Srch_crt_tkn.Tid__and:
        case Srch_crt_tkn.Tid__or:
        case Srch_crt_tkn.Tid__paren_bgn:
          fail_pos = i;  // these symbols will not auto-wildcard, unless they are escaped
          i = -1;        // force loop to end; "break" below only exits the switch
          break;
        case Srch_crt_tkn.Tid__escape:
          if (i > 0 && raw[i - 1] == syms.Escape())  // an escaped escape can be wildcarded; EX: "\\" -> "\\*"
            insert_pos = i;
          else
            fail_pos = i;
          i = -1;        // force loop to end
          break;
        case Srch_crt_tkn.Tid__paren_end:
          break;         // skip trailing ")" and keep scanning leftwards
      }
    }

    // check if preceded by escape
    if (insert_pos == -1) {
      if (   fail_pos > 0
          && raw[fail_pos - 1] == syms.Escape()) {
        insert_pos = fail_pos;
      }
      else
        return raw;
    }

    // check if word already has wildcard; EX: "a*b" x> "a*b*"
    for (int i = insert_pos - 1; i > -1; --i) {
      byte b = raw[i];
      if (b == syms.Wild()) {
        int prv_pos = i - 1;
        if (prv_pos > -1 && raw[prv_pos] == syms.Escape()) {  // ignore escaped wildcard
          i = prv_pos;  // loop decrement then steps past the escape symbol
          continue;
        }
        return raw;     // existing wildcard cancels auto-wildcard
      }
      else if (b == syms.Space()) {  // stop looking when word ends
        break;
      }
      // else alphanum; keep going
    }

    // add wildcard
    int wildcard_pos = insert_pos + 1;
    if (wildcard_pos == raw_len)  // insert point is at the end; simple append
      return Bry_.Add(raw, syms.Wild());

    // insert point is before trailing bytes (close-parens); splice wildcard in
    byte[] rv = new byte[raw_len + 1];
    System.arraycopy(raw, 0, rv, 0, wildcard_pos);
    rv[wildcard_pos] = syms.Wild();
    System.arraycopy(raw, wildcard_pos, rv, wildcard_pos + 1, raw_len - wildcard_pos);
    return rv;
  }
}