/* XOWA: the XOWA Offline Wiki Application Copyright (C) 2012-2017 gnosygnu@gmail.com XOWA is licensed under the terms of the General Public License (GPL) Version 3, or alternatively under the terms of the Apache License Version 2.0. You may use XOWA according to either of these licenses as is most appropriate for your project on a case-by-case basis. The terms of each license can be found in the source code repository: GPLv3 License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-GPLv3.txt Apache License: https://github.com/gnosygnu/xowa/blob/master/LICENSE-APACHE2.txt */ package gplx.xowa.bldrs.wms.dumps; import gplx.*; import gplx.xowa.*; import gplx.xowa.bldrs.*; import gplx.xowa.bldrs.wms.*; import gplx.core.ios.*; import gplx.xowa.wikis.domains.*; public class Xowm_dump_file { public Xowm_dump_file(String domain_str, String dump_date, String dump_type_str) { this.dump_date = dump_date; this.dump_type_str = dump_type_str; this.domain_itm = Xow_domain_itm_.parse(Bry_.new_u8(domain_str)); this.dump_abrv = Xow_abrv_wm_.To_abrv(domain_itm); this.file_name = Bld_file_name(dump_abrv, dump_date, dump_type_str); } public Xow_domain_itm Domain_itm() {return domain_itm;} private final Xow_domain_itm domain_itm; // EX: en.wikipedia.org public byte[] Dump_abrv() {return dump_abrv;} private final byte[] dump_abrv; // EX: enwiki public String Dump_date() {return dump_date;} private String dump_date; // EX: 20150807 public String Dump_type_str() {return dump_type_str;} private final String dump_type_str; // EX: pages-articles public String Server_url() {return server_url;} private String server_url; // EX: https://dumps.wikimedia.org public String File_url() {return file_url;} private String file_url; // EX: https://dumps.wikimedia.org/enwiki/20150807/enwiki-20150807-pages-articles.xml.bz2 public String File_name() {return file_name;} private String file_name; // EX: enwiki-20150807-pages-articles.xml.bz2 public long File_len() {return file_len;} private long file_len; // EX: 10 GB public DateAdp File_modified() {return file_modified;} private DateAdp file_modified; // EX: 2015-08-10 20:12:34 public void Dump_date_(String v) {dump_date = v;} public void Server_url_(String server_url) { this.server_url = server_url; String dump_dir_url = String_.new_u8(Xowm_dump_file_.Bld_dump_dir_url(Bry_.new_u8(server_url), dump_abrv, Bry_.new_u8(dump_date))); this.file_url = dump_dir_url + file_name; } public boolean Connect() { IoEngine_xrg_downloadFil args = Io_mgr.Instance.DownloadFil_args("", Io_url_.Empty); boolean rv = Connect_exec(args, file_url); // WMF changed dumping approach to partial dumps; this sometimes causes /latest/ to be missing page_articles; try to get earlier dump; DATE:2015-07-09 if ( !rv // not found && String_.In(server_url, Xowm_dump_file_.Server_wmf_http, Xowm_dump_file_.Server_wmf_https) // server is dumps.wikimedia.org && String_.Eq(dump_date, Xowm_dump_file_.Date_latest) // request dump was latest ) { Xoa_app_.Usr_dlg().Warn_many("", "", "wmf.dump:latest not found; url=~{0}", file_url); byte[] abrv_wm_bry = Xow_abrv_wm_.To_abrv(domain_itm); String new_dump_root = Xowm_dump_file_.Server_wmf_https + String_.new_u8(abrv_wm_bry) + "/"; // EX: http://dumps.wikimedia.org/enwiki/ byte[] wiki_dump_dirs_src = args.Exec_as_bry(new_dump_root); if (wiki_dump_dirs_src == null) {Xoa_app_.Usr_dlg().Warn_many("", "", "could not connect to dump server; url=~{0}", new_dump_root); return false;} String[] dates = gplx.xowa.bldrs.wms.dump_pages.Xowmf_wiki_dump_dirs_parser.Parse(domain_itm.Domain_bry(), wiki_dump_dirs_src); int dates_len = dates.length; for (int i = dates_len - 1; i > -1; --i) { String new_dump_date = dates[i]; if (String_.Eq(new_dump_date, Xowm_dump_file_.Date_latest)) continue; // skip latest; assume it is bad String new_dump_file = String_.Replace(file_name, Xowm_dump_file_.Date_latest, new_dump_date); // replace "-latest-" with "-20150602-"; String new_file_url = new_dump_root + new_dump_date + "/" + new_dump_file; rv = Connect_exec(args, new_file_url); if (rv) { Xoa_app_.Usr_dlg().Note_many("", "", "wmf.dump:dump found; url=~{0}", new_file_url); dump_date = new_dump_date; file_name = new_dump_file; file_url = new_file_url; break; } else Xoa_app_.Usr_dlg().Warn_many("", "", "wmf.dump:dump not found; url=~{0}", new_file_url); } } return rv; } private boolean Connect_exec(IoEngine_xrg_downloadFil args, String cur_file_url) { boolean rv = args.Src_last_modified_query_(true).Exec_meta(cur_file_url); long tmp_file_len = args.Src_content_length(); DateAdp tmp_file_modified = args.Src_last_modified(); Xoa_app_.Usr_dlg().Note_many("", "", "wmf.dump:connect rslts; url=~{0} result=~{1} fil_len=~{2} file_modified=~{3} server_url=~{4} dump_date=~{5}", cur_file_url, rv, tmp_file_len, tmp_file_modified == null ? "<<NULL>>" : tmp_file_modified.XtoStr_fmt_yyyy_MM_dd_HH_mm_ss(), server_url, dump_date); if (rv) { if (tmp_file_modified != null && tmp_file_modified.Year() <= 1970) return false; // url has invalid file; note that dumps.wikimedia.org currently returns back an HTML page with "404 not found"; rather than try to download and parse this (since content may change), use the date_modified which always appears to be UnixTime 0; DATE:2015-07-21 file_len = tmp_file_len; file_modified = tmp_file_modified; } return rv; } private static String Bld_file_name(byte[] dump_abrv, String dump_date, String dump_type_str) { byte[] dump_type_bry = Bry_.new_u8(dump_type_str); int dump_type_int = Xowm_dump_type_.parse_by_file(dump_type_bry); byte[] dump_file_ext = Xowm_dump_file_.Ext_xml_bz2; switch (dump_type_int) { case Xowm_dump_type_.Int__page_props: case Xowm_dump_type_.Int__categorylinks: case Xowm_dump_type_.Int__image: case Xowm_dump_type_.Int__pagelinks: dump_file_ext = Xowm_dump_file_.Ext_sql_gz; break; } return String_.new_u8(Xowm_dump_file_.Bld_dump_file_name(dump_abrv, Bry_.new_u8(dump_date), dump_type_bry, dump_file_ext)); } }