// License: GPL. For details, see LICENSE file.
package org.wikipedia;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.openstreetmap.josm.Main;
import org.openstreetmap.josm.data.coor.LatLon;
import org.openstreetmap.josm.data.osm.OsmPrimitive;
import org.openstreetmap.josm.gui.datatransfer.ClipboardUtils;
import org.openstreetmap.josm.tools.HttpClient;
import org.openstreetmap.josm.tools.Utils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.wikipedia.data.WikidataEntry;
import org.wikipedia.data.WikipediaEntry;
import org.wikipedia.tools.XPath;

public final class WikipediaApp {

    public static final Pattern WIKIDATA_PATTERN = Pattern.compile("Q\\d+");
    private static final XPath X_PATH = XPath.getInstance();

    private final String wikipediaLang;
    private final String siteId;

    private WikipediaApp(final String wikipediaLang) {
        // FIXME: the proper way to get any wiki's site id is through an API call:
        // https://zh-yue.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general
        // Use the "wikiid" value. The value may be cached, as it will never change.
        String siteId = wikipediaLang.replace('-', '_');
        switch (siteId) {
            case "be_tarask":
                siteId = "be_x_old";
                break;
        }
        this.wikipediaLang = wikipediaLang;
        this.siteId = siteId + "wiki";
    }
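    // Illustrative sketch of the siteinfo lookup described in the FIXME above, written
    // in the style of the rest of this class. The helper name is hypothetical and the
    // method is not called anywhere; the "wikiid" attribute on the <general> element
    // follows the documented shape of the siteinfo XML response, so treat this as a
    // starting point rather than verified behavior for every wiki.
    private static String fetchSiteIdFromApi(final String wikipediaLang) {
        try {
            final String url = "https://" + wikipediaLang + ".wikipedia.org/w/api.php"
                    + "?action=query&meta=siteinfo&siprop=general&format=xml";
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                // the <general> element carries the site id as its "wikiid" attribute
                final Node general = X_PATH.evaluateNode("//general", xml);
                return general == null ? null : X_PATH.evaluateString("@wikiid", general);
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }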
    public static WikipediaApp forLanguage(final String wikipediaLang) {
        return new WikipediaApp(wikipediaLang);
    }

    static String getMediawikiLocale(Locale locale) {
        if (!locale.getCountry().isEmpty()) {
            return locale.getLanguage() + "-" + locale.getCountry().toLowerCase();
        } else {
            return locale.getLanguage();
        }
    }

    public String getSiteUrl() {
        if ("wikidata".equals(wikipediaLang)) {
            return "https://www.wikidata.org";
        } else {
            return "https://" + wikipediaLang + ".wikipedia.org";
        }
    }

    private static HttpClient.Response connect(String url) throws IOException {
        final HttpClient.Response response = HttpClient.create(new URL(url))
                .setReasonForRequest("Wikipedia")
                .connect();
        if (response.getResponseCode() != 200) {
            throw new IOException("Server responded with HTTP " + response.getResponseCode());
        }
        return response;
    }

    public List<WikipediaEntry> getEntriesFromCoordinates(LatLon min, LatLon max) {
        try {
            // construct the geosearch URL (gsbbox is top|left|bottom|right)
            final String url = getSiteUrl() + "/w/api.php"
                    + "?action=query"
                    + "&list=geosearch"
                    + "&format=xml"
                    + "&gslimit=500"
                    + "&gsbbox=" + max.lat() + "|" + min.lon() + "|" + min.lat() + "|" + max.lon();
            // parse the XML document
            try (final InputStream in = connect(url).getContent()) {
                final Document doc = newDocumentBuilder().parse(in);
                final List<WikipediaEntry> entries = X_PATH.evaluateNodes("//gs", doc).stream()
                        .map(node -> {
                            final String name = X_PATH.evaluateString("@title", node);
                            final LatLon latLon = new LatLon(
                                    X_PATH.evaluateDouble("@lat", node),
                                    X_PATH.evaluateDouble("@lon", node));
                            if ("wikidata".equals(wikipediaLang)) {
                                return new WikidataEntry(name, null, latLon, null);
                            } else {
                                return new WikipediaEntry(wikipediaLang, name, latLon);
                            }
                        }).collect(Collectors.toList());
                if ("wikidata".equals(wikipediaLang)) {
                    // re-collect to obtain a List<WikipediaEntry> from the List<WikidataEntry>
                    return getLabelForWikidata(entries, Locale.getDefault()).stream().collect(Collectors.toList());
                } else {
                    return entries;
                }
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public static List<WikidataEntry> getWikidataEntriesForQuery(final String languageForQuery,
                                                                 final String query, final Locale localeForLabels) {
        try {
            final String url = "https://www.wikidata.org/w/api.php"
                    + "?action=wbsearchentities"
                    + "&language=" + languageForQuery
                    + "&strictlanguage=false"
                    + "&search=" + Utils.encodeUrl(query)
                    + "&limit=50"
                    + "&format=xml";
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                final List<WikidataEntry> r = X_PATH.evaluateNodes("//entity", xml).stream()
                        .map(node -> new WikidataEntry(X_PATH.evaluateString("@id", node), null, null, null))
                        .collect(Collectors.toList());
                return getLabelForWikidata(r, localeForLabels);
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public List<WikipediaEntry> getEntriesFromCategory(String category, int depth) {
        try {
            final String url = "https://tools.wmflabs.org/cats-php/"
                    + "?lang=" + wikipediaLang
                    + "&depth=" + depth
                    + "&cat=" + Utils.encodeUrl(category);
            try (final BufferedReader reader = connect(url).getContentReader()) {
                return reader.lines()
                        .map(line -> new WikipediaEntry(wikipediaLang, line.trim().replace("_", " ")))
                        .collect(Collectors.toList());
            }
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    public static List<WikipediaEntry> getEntriesFromClipboard(final String wikipediaLang) {
        return Pattern.compile("[\\n\\r]+")
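    // Example usage (illustrative; the coordinates are arbitrary sample values):
    //
    //     WikipediaApp app = WikipediaApp.forLanguage("en");
    //     List<WikipediaEntry> entries = app.getEntriesFromCoordinates(
    //             new LatLon(48.19, 16.35),   // south-west corner
    //             new LatLon(48.22, 16.39));  // north-east corner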
                .splitAsStream(ClipboardUtils.getClipboardStringContent())
                .map(x -> new WikipediaEntry(wikipediaLang, x))
                .collect(Collectors.toList());
    }

    public void updateWIWOSMStatus(List<WikipediaEntry> entries) {
        if (entries.size() > 20) {
            partitionList(entries, 20).forEach(this::updateWIWOSMStatus);
            return;
        }
        final Map<String, Boolean> status = new HashMap<>();
        if (!entries.isEmpty()) {
            final String url = "https://tools.wmflabs.org/wiwosm/osmjson/getGeoJSON.php?action=check&lang=" + wikipediaLang;
            try {
                final String articles = entries.stream().map(i -> i.article).collect(Collectors.joining(","));
                final String requestBody = "articles=" + Utils.encodeUrl(articles);
                try (final BufferedReader reader = HttpClient.create(new URL(url), "POST")
                        .setReasonForRequest("Wikipedia")
                        .setHeader("Content-Type", "application/x-www-form-urlencoded")
                        .setRequestBody(requestBody.getBytes(StandardCharsets.UTF_8))
                        .connect().getContentReader()) {
                    reader.lines().forEach(line -> {
                        // each line has the form: [article]\t[0|1]
                        final String[] x = line.split("\t");
                        if (x.length == 2) {
                            status.put(x[0], "1".equals(x[1]));
                        } else {
                            Main.error("Unknown element " + line);
                        }
                    });
                }
            } catch (Exception ex) {
                throw new RuntimeException(ex);
            }
        }
        for (WikipediaEntry i : entries) {
            i.setWiwosmStatus(status.get(i.article));
        }
    }

    public Stream<String> getWikipediaArticles(final OsmPrimitive p) {
        if ("wikidata".equals(wikipediaLang)) {
            return Stream.of(p.get("wikidata")).filter(Objects::nonNull);
        }
        return Stream
                .of("wikipedia", "wikipedia:" + wikipediaLang)
                .map(key -> WikipediaEntry.parseTag(key, p.get(key)))
                .filter(Objects::nonNull)
                .filter(wp -> wikipediaLang.equals(wp.lang))
                .map(wp -> wp.article);
    }

    /**
     * Returns a map mapping Wikipedia articles to Wikidata ids.
     */
    public Map<String, String> getWikidataForArticles(Collection<String> articles) {
        return articles.stream()
                .distinct()
                .collect(Collectors.groupingBy(new Function<String, Integer>() {
                    final AtomicInteger group = new AtomicInteger();
                    final AtomicInteger count = new AtomicInteger();
                    final AtomicInteger length = new AtomicInteger();

                    @Override
                    public Integer apply(String o) {
                        // start a new chunk after max. 50 titles or max. 2048 URL-encoded
                        // title characters (to avoid HTTP 414 "URI too long")
                        if (count.incrementAndGet() > 50
                                || length.addAndGet(Utils.encodeUrl(o).length()) > 2048) {
                            // this element opens the new chunk, so count it and its
                            // encoded length towards the new chunk's limits
                            count.set(1);
                            length.set(Utils.encodeUrl(o).length());
                            return group.incrementAndGet();
                        } else {
                            return group.get();
                        }
                    }
                }))
                .values()
                .stream()
                .flatMap(chunk -> resolveWikidataItems(chunk).entrySet().stream())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
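    // Example usage (illustrative; the returned id is shown for illustration only):
    //
    //     Map<String, String> ids = WikipediaApp.forLanguage("en")
    //             .getWikidataForArticles(Arrays.asList("Douglas Adams"));
    //     // e.g. {"Douglas Adams"="Q42"}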
    /**
     * Gets the Wikidata ids for the given articles. For any titles that stay unresolved,
     * resolves them (normalizes them and follows redirects) and tries getting the
     * Wikidata ids again.
     */
    private Map<String, String> resolveWikidataItems(Collection<String> articles) {
        final Map<String, String> result = getWikidataForArticles0(articles);
        final List<String> unresolved = articles.stream()
                .filter(title -> !result.containsKey(title))
                .collect(Collectors.toList());
        if (!unresolved.isEmpty()) {
            final Map<String, String> redirects = resolveRedirectsForArticles(unresolved);
            final Map<String, String> result2 = getWikidataForArticles0(redirects.values());
            redirects.forEach((original, resolved) -> {
                if (result2.containsKey(resolved)) {
                    result.put(original, result2.get(resolved));
                }
            });
        }
        return result;
    }

    private Map<String, String> getWikidataForArticles0(Collection<String> articles) {
        if (articles.isEmpty()) {
            return Collections.emptyMap();
        }
        try {
            final String url = "https://www.wikidata.org/w/api.php"
                    + "?action=wbgetentities"
                    + "&props=sitelinks"
                    + "&sites=" + siteId
                    + "&sitefilter=" + siteId
                    + "&format=xml"
                    + "&titles=" + articles.stream().map(Utils::encodeUrl).collect(Collectors.joining("|"));
            final Map<String, String> r = new TreeMap<>();
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                X_PATH.evaluateNodes("//entity", xml).forEach(node -> {
                    final String wikidata = X_PATH.evaluateString("./@id", node);
                    final String wikipedia = X_PATH.evaluateString("./sitelinks/sitelink/@title", node);
                    if (WIKIDATA_PATTERN.matcher(wikidata).matches()) {
                        // non-existing entries result in negative ids such as "-1"
                        r.put(wikipedia, wikidata);
                    }
                });
            }
            return r;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
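    // The XPaths in getWikidataForArticles0 assume a wbgetentities response of roughly
    // this shape (simplified; additional attributes and fields omitted):
    //
    //     <entities>
    //         <entity id="Q42">
    //             <sitelinks>
    //                 <sitelink site="enwiki" title="Douglas Adams"/>
    //             </sitelinks>
    //         </entity>
    //     </entities>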
    /**
     * Given a list of Wikipedia titles, returns a map of the corresponding normalized
     * title names, or, if a title is a redirect page, of the redirect target.
     */
    private Map<String, String> resolveRedirectsForArticles(Collection<String> articles) {
        try {
            final String url = getSiteUrl() + "/w/api.php"
                    + "?action=query"
                    + "&redirects"
                    + "&format=xml"
                    + "&titles=" + articles.stream().map(Utils::encodeUrl).collect(Collectors.joining("|"));
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                // add both redirects and normalization results to the same kind of map
                final Collector<Node, ?, Map<String, String>> fromToCollector = Collectors.toMap(
                        node -> X_PATH.evaluateString("./@from", node),
                        node -> X_PATH.evaluateString("./@to", node)
                );
                final Map<String, String> normalized = X_PATH.evaluateNodes("//normalized/n", xml)
                        .stream()
                        .collect(fromToCollector);
                final Map<String, String> redirects = X_PATH.evaluateNodes("//redirects/r", xml)
                        .stream()
                        .collect(fromToCollector);
                // Only return keys that were originally requested. A title may be both
                // normalized and redirected, so apply normalization first.
                return articles.stream()
                        .collect(Collectors.toMap(Function.identity(), title -> {
                            final String normalizedTitle = normalized.getOrDefault(title, title);
                            return redirects.getOrDefault(normalizedTitle, normalizedTitle);
                        }));
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public List<String> getCategoriesForPrefix(final String prefix) {
        try {
            final String url = getSiteUrl() + "/w/api.php"
                    + "?action=query"
                    + "&list=prefixsearch"
                    + "&format=xml"
                    + "&psnamespace=14"
                    + "&pslimit=50"
                    + "&pssearch=" + Utils.encodeUrl(prefix);
            // parse the XML document
            try (final InputStream in = connect(url).getContent()) {
                final Document doc = newDocumentBuilder().parse(in);
                return X_PATH.evaluateNodes("//ps/@title", doc).stream()
                        .map(Node::getNodeValue)
                        // strip the namespace prefix, e.g. "Category:Foo" -> "Foo"
                        .map(value -> value.contains(":") ? value.split(":", 2)[1] : value)
                        .collect(Collectors.toList());
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public static String getLabelForWikidata(String wikidataId, Locale locale, String... preferredLanguage) {
        try {
            final List<WikidataEntry> entry = Collections.singletonList(new WikidataEntry(wikidataId, null, null, null));
            return getLabelForWikidata(entry, locale, preferredLanguage).get(0).label;
        } catch (IndexOutOfBoundsException ignore) {
            return null;
        }
    }
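    // Example usage (illustrative):
    //
    //     String label = WikipediaApp.getLabelForWikidata("Q42", Locale.getDefault());
    //     // "Douglas Adams" for an English default locale; null if the id is unknown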
    static List<WikidataEntry> getLabelForWikidata(List<? extends WikipediaEntry> entries,
                                                   Locale locale, String... preferredLanguage) {
        if (entries.size() > 50) {
            return partitionList(entries, 50).stream()
                    .flatMap(chunk -> getLabelForWikidata(chunk, locale, preferredLanguage).stream())
                    .collect(Collectors.toList());
        } else if (entries.isEmpty()) {
            return Collections.emptyList();
        }
        try {
            final String url = "https://www.wikidata.org/w/api.php"
                    + "?action=wbgetentities"
                    + "&props=labels|descriptions"
                    + "&ids=" + entries.stream().map(x -> x.article).collect(Collectors.joining("|"))
                    + "&format=xml";
            // candidate languages in order of preference; the trailing null matches any language
            final Collection<String> languages = new ArrayList<>();
            if (locale != null) {
                languages.add(getMediawikiLocale(locale));
                languages.add(getMediawikiLocale(new Locale(locale.getLanguage())));
            }
            languages.addAll(Arrays.asList(preferredLanguage));
            languages.add("en");
            languages.add(null);
            final List<WikidataEntry> r = new ArrayList<>(entries.size());
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                for (final WikipediaEntry entry : entries) {
                    final Node entity = X_PATH.evaluateNode("//entity[@id='" + entry.article + "']", xml);
                    if (entity == null) {
                        continue;
                    }
                    r.add(new WikidataEntry(
                            entry.article,
                            getFirstField(languages, "label", entity),
                            entry.coordinate,
                            getFirstField(languages, "description", entity)
                    ));
                }
            }
            return r;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    private static String getFirstField(Collection<String> languages, String field, Node entity) {
        return languages.stream()
                .map(language -> X_PATH.evaluateString(language != null
                        ? ".//" + field + "[@language='" + language + "']/@value"
                        : ".//" + field + "/@value", entity))
                .filter(label -> label != null && !label.isEmpty())
                .findFirst()
                .orElse(null);
    }

    public Collection<WikipediaEntry> getInterwikiArticles(String article) {
        try {
            final String url = getSiteUrl() + "/w/api.php"
                    + "?action=query"
                    + "&prop=langlinks"
                    + "&titles=" + Utils.encodeUrl(article)
                    + "&lllimit=500"
                    + "&format=xml";
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                return X_PATH.evaluateNodes("//ll", xml).stream()
                        .map(node -> {
                            final String lang = X_PATH.evaluateString("@lang", node);
                            final String name = node.getTextContent();
                            return new WikipediaEntry(lang, name);
                        }).collect(Collectors.toList());
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public LatLon getCoordinateForArticle(String article) {
        try {
            final String url = getSiteUrl() + "/w/api.php"
                    + "?action=query"
                    + "&prop=coordinates"
                    + "&titles=" + Utils.encodeUrl(article)
                    + "&format=xml";
            try (final InputStream in = connect(url).getContent()) {
                final Document xml = newDocumentBuilder().parse(in);
                final Node node = X_PATH.evaluateNode("//coordinates/co", xml);
                if (node == null) {
                    return null;
                } else {
                    return new LatLon(X_PATH.evaluateDouble("@lat", node), X_PATH.evaluateDouble("@lon", node));
                }
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public static <T> List<List<T>> partitionList(final List<T> list, final int size) {
        return new AbstractList<List<T>>() {
            @Override
            public List<T> get(int index) {
                final int fromIndex = index * size;
                final int toIndex = Math.min(fromIndex + size, list.size());
                return list.subList(fromIndex, toIndex);
            }

            @Override
            public int size() {
                return (int) Math.ceil(((float) list.size()) / size);
            }
        };
    }
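    // Example (illustrative): partitionList(Arrays.asList(1, 2, 3, 4, 5), 2) yields
    // the sublist views [1, 2], [3, 4], [5] without copying the backing list.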
    private static DocumentBuilder newDocumentBuilder() {
        try {
            return DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            Main.warn("Cannot create DocumentBuilder");
            Main.warn(e);
            throw new RuntimeException(e);
        }
    }
}