/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2016 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.constants; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.wikipediacleaner.api.HttpUtils; import org.wikipediacleaner.api.constants.wiki.AbstractWikiSettings; /** * Analysis of URL representing articles. */ public class ArticleUrl { private static Charset utf8Charset = null; private static Charset iso88591Charset = null; static { utf8Charset = Charset.forName("UTF8"); iso88591Charset = Charset.forName("ISO-8859-1"); } /** * @param wiki Wiki. * @param url URL. * @return Information about article if URL is a link to the given wiki. */ public static ArticleUrl isArticleUrl(EnumWikipedia wiki, String url) { if ((wiki == null) || (url == null)) { return null; } ArticleUrl result = null; // Analysis based on Wiki settings AbstractWikiSettings settings = wiki.getSettings(); if (settings != null) { for (String directPath : settings.getArticleDirectPath()) { if (result == null) { result = isArticleDirectUrl(url, directPath); } } for (String paramPath : settings.getArticleParamPath()) { if (result == null) { result = isArticleParamUrl(url, paramPath, "title"); } } } // Analysis based on Wiki configuration WikiConfiguration wikiConf = wiki.getWikiConfiguration(); if (wikiConf != null) { String server = wikiConf.getServer(); String articlePath = wikiConf.getArticlePath(); if ((result == null) && (server != null) && (articlePath != null)) { result = isArticleDirectUrl(url, server + articlePath); } String script = wikiConf.getScript(); if ((result == null) && (server != null) && (script != null)) { result = isArticleParamUrl(url, server + script, "title"); } } return result; } /** * @param url URL. * @return Corresponding URI. */ private static URI getURI(String url) { if (url == null) { return null; } // Cleanup URL while (url.endsWith("|")) { url = url.substring(0, url.length() - 1); } // Create URI URI uri = null; try { uri = new URI(url); } catch (URISyntaxException e) { return null; } // Various checks if (!uri.isAbsolute() || uri.isOpaque()) { return null; } // Check scheme String scheme = uri.getScheme(); if (scheme == null) { return null; } if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https")) { return null; } return uri; } /** * @param url URL. * @param base Base URL for accessing articles. * @return Information about article if URL is a link following the base URL. */ private static ArticleUrl isArticleDirectUrl(String url, String base) { if ((url == null) || (base == null)) { return null; } // Cleanup URL while (url.endsWith("|")) { url = url.substring(0, url.length() - 1); } // Create URI URI uri = getURI(url); if (uri == null) { return null; } // Check that URL is coherent with provided base StringBuilder buffer = new StringBuilder(); buffer.append("//"); buffer.append(uri.getAuthority()); buffer.append(HttpUtils.parseEncodedString(uri.getPath(), utf8Charset, iso88591Charset)); int paramIndex = base.indexOf("$1"); if (paramIndex < 0) { return null; } String bufferStr = buffer.toString(); if (bufferStr.length() <= paramIndex) { return null; } if (!bufferStr.startsWith(base.substring(0, paramIndex))) { return null; } // Retrieve optional attributes Map<String, String> paramValues = null; if (uri.getQuery() != null) { String[] params = uri.getQuery().split("\\&"); if ((params != null) && (params.length > 0)) { paramValues = new HashMap<String, String>(); for (String param : params) { String[] value = param.split("\\=", 2); if (value.length >= 1) { if (value.length >= 2) { paramValues.put(value[0], value[1]); } else { paramValues.put(value[0], null); } } } } } return new ArticleUrl(bufferStr.substring(paramIndex), paramValues, uri.getFragment()); } /** * @param url URL. * @param base Base URL for accessing articles. * @param paramName Parameter name for title. * @return Information about article if URL is a link following the base URL. */ private static ArticleUrl isArticleParamUrl(String url, String base, String paramName) { if ((url == null) || (base == null) || (paramName == null)) { return null; } // Cleanup URL while (url.endsWith("|")) { url = url.substring(0, url.length() - 1); } // Create URI URI uri = getURI(url); if (uri == null) { return null; } // Check that URL is coherent with provided base StringBuilder buffer = new StringBuilder(); buffer.append("//"); buffer.append(uri.getAuthority()); buffer.append(HttpUtils.parseEncodedString(uri.getRawPath(), utf8Charset, iso88591Charset)); String bufferStr = buffer.toString(); if (!bufferStr.equals(base)) { return null; } // Check parameters if (uri.getQuery() == null) { return null; } String[] params = uri.getQuery().split("\\&"); if ((params == null) || (params.length == 0)) { return null; } String title = null; Map<String, String> paramValues = new HashMap<String, String>(); for (String param : params) { String[] value = param.split("\\=", 2); if (value.length >= 1) { if (paramName.equals(value[0])) { if (value.length >= 2) { title = value[1]; } } else { if (value.length >= 2) { paramValues.put(value[0], value[1]); } else { paramValues.put(value[0], null); } } } } if (title == null) { return null; } return new ArticleUrl(title, paramValues, uri.getFragment()); } /** Article title */ private final String title; /** Attributes */ private final Map<String, String> attributes; /** Fragment */ private final String fragment; /** * @param title Article title. * @param attributes Attributes. */ private ArticleUrl(String title, Map<String, String> attributes, String fragment) { String tmpTitle = title; if (tmpTitle != null) { tmpTitle = tmpTitle.replaceAll("\\_", " "); tmpTitle = tmpTitle.replaceAll(" ", " "); if (tmpTitle.endsWith("/")) { tmpTitle = tmpTitle.substring(0, tmpTitle.length() - 1); } } this.title = tmpTitle; this.attributes = (attributes != null) ? Collections.unmodifiableMap(attributes) : null; this.fragment = fragment; } /** * @return Article title. */ public String getTitle() { return title; } /** * @return Attributes. */ public Map<String, String> getAttributes() { return attributes; } /** * @return Fragment (anchor). */ public String getFragment() { return fragment; } /** * @return Title with optional fragment. */ public String getTitleAndFragment() { if (fragment == null) { return title; } return title + "#" + fragment; } }