package org.wikibrain.core.cmd;

import org.wikibrain.core.lang.Language;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Matches database dump files published by Wikipedia at
 * http://dumps.wikimedia.org based on their names.
 *
 * Each constant pairs a short name with one or more regular expressions that
 * are tried in order. The matcher can also extract the piece number of a file
 * (when a dump is split into multiple numbered pieces) and the language code
 * that precedes "wiki" in the file name.
 *
 * @author Yulun Li, Shilad Sen
 */
public enum FileMatcher {

    MULTISTREAM("multistream",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-articles-multistream.xml.bz2")),
    MULTISTREAM_INDEX("multistream_index",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-articles-multistream-index\\.txt\\.bz2")),
    EDIT_HISTORY_7z("edit_history_7z",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-history(\\d+).xml-.+\\.7z"),
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-history.xml.7z")),
    EDIT_HISTORY_bz2("edit_history_bz2",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-history(\\d+).xml-.+\\.bz2"),
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-history.xml.bz2")),
    LOG("log_events",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-logging.xml.gz")),
    META_CURRENT("current_meta",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-current(\\d+).xml-.+\\.bz2"),
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-meta-current.xml.bz2")),
    ARTICLES("articles",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-articles(\\d+)\\.xml-.+\\.bz2"),
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pages-articles.xml.bz2")),
    STUB_ARTICLES("stub_articles",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-stub-articles\\d*.xml.gz")),
    STUB_META_CURRENT("stub_meta_current",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-stub-meta-current\\d*.xml.gz")),
    STUB_META_HISTORY("stub_meta_histories",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-stub-meta-history\\d*.xml.gz")),
    // The second ABSTRACT pattern has a capturing group inside a negative
    // lookbehind; that group never holds a value on a successful match
    // (getNumber() accounts for this).
    ABSTRACT("abstracts",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-abstract(\\d+)\\.xml"),
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-abstract.*(?<!(\\.xml\\-rss))\\.xml")),
    TITLES("titles",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-all-titles-in-ns0.gz")),
    INTERLINK("interwiki_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-iwlinks.sql.gz")),
    REDIRECT_LIST("redirect_lists",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-redirect.sql.gz")),
    PROTECTED_TITLES("protected_titles",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-protected_titles.sql.gz")),
    NAME_PAIRS("name-pairs",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-page_props.sql.gz")),
    PAGE_RESTRICTIONS("page_restrictions",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-page_restrictions.sql.gz")),
    PAGE_BASE("base_page_datas",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-page.sql.gz")),
    CATEGORY("categories",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-category.sql.gz")),
    USER_GROUP("user_groups",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-user_groups.sql.gz")),
    INTERWIKI("interwiki_prefixes",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-interwiki.sql.gz")),
    INTERLANG_LINKS("interlang_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-langlinks.sql.gz")),
    EXTERNAL_LINKS("external_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-externallinks.sql.gz")),
    TEMPLATE_LINKS("template_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-templatelinks.sql.gz")),
    IMAGE_LINKS("image_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-imagelinks.sql.gz")),
    CATEGORY_LINKS("category_links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-categorylinks.sql.gz")),
    LINK_SQL("links",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-pagelinks.sql.gz")),
    OLD_MEDIA_META("old_media_metas",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-oldimage.sql.gz")),
    CURRENT_MEDIA_META("current_media_metas",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-image.sql.gz")),
    SITE_STATS("site_stats",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-site_stats.sql.gz")),
    FLAGGED_REVISION("flagged_revisions",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-flaggedrevs.sql.gz")),
    FLAGGED_PAGES("flagged_pages",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-flaggedpages.sql.gz")),
    WIKIDATA_ITEMS("wikidata_items",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-wb_items_per_site.sql.gz")),
    MD5("md5_checksums",
            Pattern.compile(".*?([a-zA-Z_-]+)wiki.+-md5sums.txt"));

    private final String name;
    private final Pattern[] patterns;

    /**
     * Constructs a new file matcher.
     *
     * @param name     Name of the matcher, as used by {@link #getByName(String)}.
     * @param patterns Patterns that are tried in consecutive order until
     *                 at least one match is found.
     */
    FileMatcher(String name, Pattern... patterns) {
        this.name = name;
        this.patterns = patterns;
    }

    /**
     * Given a list of strings containing urls, returns the strings that fully
     * match the first pattern (in declaration order) that matches anything.
     * Later patterns are only consulted when earlier ones match nothing.
     *
     * @param links Candidate links.
     * @return The matching links, in input order; empty if no pattern matches.
     */
    public List<String> match(List<String> links) {
        List<String> result = new ArrayList<String>();
        for (Pattern p : patterns) {
            for (String link : links) {
                if (p.matcher(link).matches()) {
                    result.add(link);
                }
            }
            if (!result.isEmpty()) {
                break;      // an earlier pattern takes precedence over later ones
            }
        }
        return result;
    }

    /**
     * Returns the files whose absolute paths fully match the first pattern
     * (in declaration order) that matches anything.
     *
     * @param paths Candidate files.
     * @return The matching files, in input order; empty if no pattern matches.
     */
    public List<File> matchFiles(List<File> paths) {
        List<File> result = new ArrayList<File>();
        for (Pattern p : patterns) {
            for (File file : paths) {
                if (p.matcher(file.getAbsolutePath()).matches()) {
                    result.add(file);
                }
            }
            if (!result.isEmpty()) {
                break;      // an earlier pattern takes precedence over later ones
            }
        }
        return result;
    }

    /**
     * @param link A link as returned by this.match() (e.g. enwiki-latest-abstract10.xml)
     * @return The index of the file (e.g. 10). If there is no index, returns 1.
     * @throws IllegalStateException if the link matches none of this matcher's patterns.
     */
    public int getNumber(String link) {
        for (Pattern p : patterns) {
            Matcher m = p.matcher(link);
            if (!m.matches()) {
                continue;
            }
            // The index, when a pattern captures one, is always the last group.
            // A pattern may declare a second group that does not take part in
            // the match (the lookbehind group of ABSTRACT); group() then
            // returns null, which means "no index".
            String index = (m.groupCount() >= 2) ? m.group(m.groupCount()) : null;
            if (index == null) {
                return 1;   // Wikipedia file indexes start at 1
            }
            return Integer.parseInt(index);
        }
        throw new IllegalStateException(
                "Link does not match any pattern of " + name + ": " + link);
    }

    /**
     * Extracts the language from the code that immediately precedes "wiki"
     * in the file name of the link (e.g. "en" for
     * enwiki-latest-interwiki.sql.gz).
     *
     * The search starts in the file-name portion of the link so that a later
     * "wiki" inside the dump type (as in "-interwiki.sql.gz") is not mistaken
     * for the language marker. If the file name holds no usable "wiki", the
     * last "wiki" anywhere in the link is used.
     *
     * @param link A link or path to a dump file.
     * @return The result of {@code Language.getByLangCode} for the extracted code.
     * @throws IllegalStateException if the link contains no "wiki" preceded
     *         by at least one character.
     */
    public Language getLanguage(String link) {
        int nameStart = Math.max(link.lastIndexOf('/'), link.lastIndexOf('\\')) + 1;

        // First "wiki" in the file name that has at least one character before it.
        int end = link.indexOf("wiki", nameStart + 1);
        if (end < 0) {
            end = link.lastIndexOf("wiki");
        }
        if (end < 1) {
            throw new IllegalStateException("No language detected for " + link);
        }

        // Walk backwards over the characters that may form a language code.
        int beg = end - 1;
        while (beg >= 0 && isLangChar(link.charAt(beg))) {
            beg--;
        }
        return Language.getByLangCode(link.substring(beg + 1, end));
    }

    /** Returns true if the character may appear in a language code. */
    private boolean isLangChar(char c) {
        return Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '-';
    }

    /** @return The short name of this matcher (e.g. "articles"). */
    public String getName() {
        return name;
    }

    /**
     * Looks up a matcher by its name, ignoring case.
     *
     * @param name Name as returned by {@link #getName()}.
     * @return The matcher with that name, or null if there is none.
     */
    public static FileMatcher getByName(String name) {
        for (FileMatcher linkMatcher : FileMatcher.values()) {
            if (linkMatcher.getName().equalsIgnoreCase(name)) {
                return linkMatcher;
            }
        }
        return null;
    }

    /**
     * Looks up a matcher for every name in the list.
     *
     * @param listNames Names as returned by {@link #getName()}.
     * @return One entry per input name, in order; the entry is null for a
     *         name that has no matcher.
     */
    public static List<FileMatcher> getListByNames(List<String> listNames) {
        List<FileMatcher> listMatchers = new ArrayList<FileMatcher>();
        for (String name : listNames) {
            listMatchers.add(getByName(name));
        }
        return listMatchers;
    }

    /** @return The names of all matchers, in declaration order. */
    public static List<String> getAllNames() {
        List<String> result = new ArrayList<String>();
        for (FileMatcher matcher : FileMatcher.values()) {
            result.add(matcher.getName());
        }
        return result;
    }
}