DomainValidatorTestIT.java example

Explorer
josm-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openstreetmap.josm.data.validation.routines;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.IDN;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;
import org.openstreetmap.josm.Main;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;

/**
 * Integration tests for the DomainValidator.
 *
 * @version $Revision: 1723861 $
 */
public class DomainValidatorTestIT {

    /**
     * Checks the internal TLD tables of {@link DomainValidator} against the lists published by IANA.
     * <ul>
     * <li>Verifies that the internal arrays are strictly sorted and lower-case
     *     (the test fails immediately if they are not, as later checks rely on it)</li>
     * <li>Downloads (or reuses a recent local copy of) http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *     and http://www.iana.org/domains/root/db</li>
     * <li>Fails if the text list contains a TLD that the validator does not accept;
     *     the missing entries are printed in a form that can be pasted into the arrays</li>
     * <li>Fails if the internal tables (except LOCAL_TLDS) contain entries that IANA does not list</li>
     * </ul>
     * @throws Exception if an error occurs
     */
    @Test
    public void testIanaTldList() throws Exception {
        // Check the arrays first as this affects later checks
        // Doing this here makes it easier when updating the lists
        boolean arraysOk = true;
        for (String list : new String[]{"INFRASTRUCTURE_TLDS", "COUNTRY_CODE_TLDS", "GENERIC_TLDS", "LOCAL_TLDS"}) {
            // Deliberately no short-circuit: report the problems of every array in one run
            arraysOk &= isSortedLowerCase(list);
        }
        // A silent return here would let the test pass although a problem was detected
        assertTrue("Fix arrays before retrying; cannot continue", arraysOk);

        Set<String> ianaTlds = new HashSet<>(); // keep for comparison with array contents
        DomainValidator dv = DomainValidator.getInstance();
        final String txtUrl = "http://data.iana.org/TLD/tlds-alpha-by-domain.txt";
        File txtFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.txt");
        long timestamp;
        try {
            timestamp = download(txtFile, txtUrl, 0L);
        } catch (ConnectException e) {
            Main.error(e);
            // Try again one more time in case of random network issue
            timestamp = download(txtFile, txtUrl, 0L);
        }
        final File htmlFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.html");
        // N.B. sometimes the html file may be updated a day or so after the txt file
        // if the txt file contains entries not found in the html file, try again in a day or two
        download(htmlFile, "http://www.iana.org/domains/root/db", timestamp);

        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
            String line;
            final String header;
            line = br.readLine(); // header
            if (line != null && line.startsWith("# Version ")) {
                header = line.substring(2); // drop the leading "# "
            } else {
                throw new IOException("File does not have expected Version header");
            }
            final boolean generateUnicodeTlds = false; // Change this to generate Unicode TLDs as well

            // Parse html page to get entries
            Map<String, String[]> htmlInfo = getHtmlInfo(htmlFile);
            // Both maps: missing entry -> text printed as trailing comment (other spelling + registry comment)
            Map<String, String> missingTLD = new TreeMap<>();
            Map<String, String> missingCC = new TreeMap<>();
            while ((line = br.readLine()) != null) {
                if (!line.startsWith("#")) {
                    final String unicodeTld; // only different from asciiTld if that was punycode
                    final String asciiTld = line.toLowerCase(Locale.ENGLISH);
                    if (line.startsWith("XN--")) {
                        unicodeTld = IDN.toUnicode(line);
                    } else {
                        unicodeTld = asciiTld;
                    }
                    if (!dv.isValidTld(asciiTld)) {
                        String[] info = htmlInfo.get(asciiTld);
                        if (info != null) {
                            String type = info[0];
                            String comment = info[1];
                            if ("country-code".equals(type)) { // Which list to use?
                                missingCC.put(asciiTld, unicodeTld + " " + comment);
                                if (generateUnicodeTlds) {
                                    missingCC.put(unicodeTld, asciiTld + " " + comment);
                                }
                            } else {
                                missingTLD.put(asciiTld, unicodeTld + " " + comment);
                                if (generateUnicodeTlds) {
                                    missingTLD.put(unicodeTld, asciiTld + " " + comment);
                                }
                            }
                        } else {
                            System.err.println("Expected to find HTML info for "+ asciiTld);
                        }
                    }
                    ianaTlds.add(asciiTld);
                    // Don't merge these conditions; generateUnicodeTlds is final so needs to be separate to avoid a warning
                    if (generateUnicodeTlds) {
                        if (!unicodeTld.equals(asciiTld)) {
                            ianaTlds.add(unicodeTld);
                        }
                    }
                }
            }
            // List html entries not in TLD text list (sorted copy gives a stable report order)
            for (String key : (new TreeMap<>(htmlInfo)).keySet()) {
                if (!ianaTlds.contains(key)) {
                    if (isNotInRootZone(key)) {
                        System.out.println("INFO: HTML entry not yet in root zone: "+key);
                    } else {
                        System.err.println("WARN: Expected to find text entry for html: "+key);
                    }
                }
            }
            if (!missingTLD.isEmpty()) {
                printMap(header, missingTLD, "TLD");
                fail("missing TLD");
            }
            if (!missingCC.isEmpty()) {
                printMap(header, missingCC, "CC");
                fail("missing CC");
            }
        }
        // Check if internal tables contain any additional entries
        assertTrue(isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds));
        assertTrue(isInIanaList("COUNTRY_CODE_TLDS", ianaTlds));
        assertTrue(isInIanaList("GENERIC_TLDS", ianaTlds));
        // Don't check local TLDS assertTrue(isInIanaList("LOCAL_TLDS", ianaTlds));
    }

    /**
     * Prints the given entries to standard output, one per line, formatted as quoted
     * array elements followed by a trailing comment.
     *
     * @param header source description printed in a "Taken from" line; omitted when {@code null}
     * @param map entries to print: the key becomes the quoted element, the value its trailing comment
     * @param string name of the list the entries are missing from (used in the title line only)
     */
    private static void printMap(final String header, Map<String, String> map, String string) {
        System.out.println("Entries missing from "+ string +" List\n");
        if (header != null) {
            System.out.println("        // Taken from " + header);
        }
        for (final Map.Entry<String, String> entry : map.entrySet()) {
            final String element = "        \"" + entry.getKey() + "\"";
            System.out.println(element + ", // " + entry.getValue());
        }
        System.out.println("\nDone");
    }

    @SuppressFBWarnings(value = "PERFORMANCE")
    /*
     * Extracts the TLD entries from a saved copy of the IANA root zone database page.
     *
     * For every line containing a link to /domains/root/db/<name>.html the following lines are
     * expected to hold a <td> cell with the type and a <td> cell with a comment; blank lines before
     * the type cell, an HTML comment between the two cells and a comment cell wrapped over several
     * lines are tolerated.
     *
     * Returns a map from lower-cased domain name to {type, comment}. The comment is "??" when the
     * comment cell could not be parsed. Entries whose comment contains "Not assigned" or "Retired",
     * and entries of type "test", are left out.
     *
     * A file that ends in the middle of an entry is handled gracefully (no exception, no endless loop).
     *
     * Throws IOException if the file cannot be read.
     */
    private static Map<String, String[]> getHtmlInfo(final File f) throws IOException {
        final Map<String, String[]> info = new HashMap<>();

        final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
        // The type cell and the comment cell use the same markup
        final Pattern cell = Pattern.compile("\\s+<td>([^<]+)</td>");
        // Compiled once here instead of once per line via String.matches
        final Pattern blank = Pattern.compile("^\\s*$");
        final Pattern commentStart = Pattern.compile("\\s+<!--.*");
        final Pattern commentEnd = Pattern.compile(".*-->.*");
        final Pattern cellEnd = Pattern.compile(".*</td>.*");

        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = domain.matcher(line);
                if (!m.lookingAt()) {
                    continue;
                }
                final String dom = m.group(1);
                String com = "??";
                line = br.readLine();
                while (line != null && blank.matcher(line).matches()) { // extra blank lines introduced
                    line = br.readLine();
                }
                if (line == null) {
                    // File ended right after the domain line: nothing more to parse
                    System.err.println("Unexpected end of file after domain: " + dom);
                    break;
                }
                Matcher t = cell.matcher(line);
                if (!t.lookingAt()) {
                    System.err.println("Unexpected type: " + line);
                    continue;
                }
                final String typ = t.group(1);
                line = br.readLine();
                if (line != null && commentStart.matcher(line).matches()) {
                    // Skip an HTML comment placed between the type and the comment cell
                    while (line != null && !commentEnd.matcher(line).matches()) {
                        line = br.readLine();
                    }
                    line = br.readLine();
                }
                // Should have comment; is it wrapped?
                while (line != null && !cellEnd.matcher(line).matches()) {
                    final String continuation = br.readLine();
                    if (continuation == null) {
                        break; // file ended inside the cell; appending "null" would loop forever
                    }
                    line += " " + continuation;
                }
                if (line != null) {
                    Matcher n = cell.matcher(line);
                    if (n.lookingAt()) {
                        com = n.group(1);
                    }
                }
                // Don't save unused entries
                if (!com.contains("Not assigned") && !com.contains("Retired") && !typ.equals("test")) {
                    info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
                }
            }
        }
        return info;
    }

    /**
     * Downloads a URL into a local cache file unless a sufficiently fresh copy is already present.
     * <p>
     * A readable cache file modified less than an hour ago is reused without contacting the server.
     * For an older cache file a conditional request (If-Modified-Since) is sent and the cached copy
     * is kept when the server answers 304.
     * Unfortunately the server does not seem to honour If-Modified-Since for the Html page.
     * <p>
     * NOTE(review): {@code timestamp} is not used by this implementation; the original comment
     * suggests it was meant to skip the download when the cached file is newer than the txt file.
     *
     * @param f the local cache file
     * @param tldurl the URL to download
     * @param timestamp currently ignored (see note above)
     * @return the last-modified time of the cache file, in milliseconds since the epoch
     * @throws IOException if the connection or the file transfer fails
     */
    private static long download(File f, String tldurl, long timestamp) throws IOException {
        final int HOUR = 60*60*1000; // an hour in ms
        final long modTime;
        // For testing purposes, don't download files more than once an hour
        if (f.canRead()) {
            modTime = f.lastModified();
            if (modTime > System.currentTimeMillis()-HOUR) {
                System.out.println("Skipping download - found recent " + f);
                return modTime;
            }
        } else {
            modTime = 0;
        }
        HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
        if (modTime > 0) {
            // Let the JDK build the header: a hand-formatted date would use the default locale
            // and time zone, which is not a valid HTTP-date on non-English / non-GMT systems
            hc.setIfModifiedSince(modTime);
            // Formatter is used for the log message only
            SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH);
            System.out.println("Found " + f + " with date " + sdf.format(new Date(modTime)));
        }
        if (hc.getResponseCode() == HttpURLConnection.HTTP_NOT_MODIFIED) {
            System.out.println("Already have most recent " + tldurl);
        } else {
            System.out.println("Downloading " + tldurl);
            byte[] buff = new byte[1024];
            // The input stream is opened first, so a failing request does not truncate the cache file
            try (InputStream is = hc.getInputStream();
                 FileOutputStream fos = new FileOutputStream(f)) {
                int len;
                while ((len = is.read(buff)) != -1) {
                    fos.write(buff, 0, len);
                }
            }
            System.out.println("Done");
        }
        return f.lastModified();
    }

    /**
     * Check whether the domain is in the root zone currently.
     * Reads the URL http://www.iana.org/domains/root/db/*domain*.html
     * (using a local disk cache in the java.io.tmpdir directory)
     * and checks for the string "This domain is not present in the root zone at this time."
     * @param domain the domain to check
     * @return true if the string is found; false if it is not found or if the page
     *         could not be downloaded or read (the I/O error is printed, not rethrown)
     */
    private static boolean isNotInRootZone(String domain) {
        String tldurl = "http://www.iana.org/domains/root/db/" + domain + ".html";
        File rootCheck = new File(System.getProperty("java.io.tmpdir"), "tld_" + domain + ".html");
        try {
            download(rootCheck, tldurl, 0L);
            // try-with-resources closes the reader exactly once on every path, including the early return
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(new FileInputStream(rootCheck), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    if (inputLine.contains("This domain is not present in the root zone at this time.")) {
                        return true;
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: a page that cannot be fetched is treated as "marker not found"
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Closes the given resource without propagating a failure.
     * An {@link IOException} thrown by {@code close()} is printed to the error stream.
     *
     * @param in the resource to close; {@code null} is ignored
     */
    private static void closeQuietly(Closeable in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // isInIanaList and isSorted are split into two methods.
    // If/when access to the arrays is possible without reflection, the intermediate
    // methods can be dropped
    /**
     * Reads the named static {@code String[]} field of {@link DomainValidator} by reflection
     * and checks that each of its entries is contained in the given set.
     * A private field is made accessible for the duration of the check only.
     *
     * @param arrayName name of the static array field declared in DomainValidator
     * @param ianaTlds the set of known TLDs
     * @return true if every array entry is contained in {@code ianaTlds}
     * @throws Exception if the field does not exist or cannot be read
     */
    private static boolean isInIanaList(String arrayName, Set<String> ianaTlds) throws Exception {
        final Field tableField = DomainValidator.class.getDeclaredField(arrayName);
        final boolean wasPrivate = Modifier.isPrivate(tableField.getModifiers());
        if (wasPrivate) {
            tableField.setAccessible(true);
        }
        final String[] entries = (String[]) tableField.get(null); // static field: no instance needed
        final boolean result;
        try {
            result = isInIanaList(arrayName, entries, ianaTlds);
        } finally {
            // Restore the original accessibility
            if (wasPrivate) {
                tableField.setAccessible(false);
            }
        }
        return result;
    }

    /**
     * Checks that every entry of the array is contained in the given set.
     * Each entry that is not found is reported on standard output; all entries are always checked.
     *
     * @param name the array name, used in the report only
     * @param array the entries to check
     * @param ianaTlds the set of known TLDs
     * @return true if all entries were found in {@code ianaTlds}
     */
    private static boolean isInIanaList(String name, String[] array, Set<String> ianaTlds) {
        boolean allKnown = true;
        for (final String entry : array) {
            if (ianaTlds.contains(entry)) {
                continue;
            }
            System.out.println(name + " contains unexpected value: " + entry);
            allKnown = false;
        }
        return allKnown;
    }

    /**
     * Reads the named static {@code String[]} field of {@link DomainValidator} by reflection
     * and checks that it is strictly sorted and lower-case.
     * A private field is made accessible for the duration of the check only.
     *
     * @param arrayName name of the static array field declared in DomainValidator
     * @return the result of the sorted/lower-case check on the array
     * @throws Exception if the field does not exist or cannot be read
     */
    private static boolean isSortedLowerCase(String arrayName) throws Exception {
        final Field tableField = DomainValidator.class.getDeclaredField(arrayName);
        final boolean wasPrivate = Modifier.isPrivate(tableField.getModifiers());
        if (wasPrivate) {
            tableField.setAccessible(true);
        }
        final String[] entries = (String[]) tableField.get(null); // static field: no instance needed
        final boolean result;
        try {
            result = isSortedLowerCase(arrayName, entries);
        } finally {
            // Restore the original accessibility
            if (wasPrivate) {
                tableField.setAccessible(false);
            }
        }
        return result;
    }

    /**
     * Tells whether a string is unchanged by conversion to lower case (English locale).
     *
     * @param string the string to check
     * @return true if the string equals its lower-cased form
     */
    private static boolean isLowerCase(String string) {
        return string.equals(string.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Check if an array is strictly sorted - and lowerCase.
     * Every violation (out of order, duplicate, non lower-case) found while comparing
     * neighbouring entries is reported on standard output.
     *
     * @param name the array name, used in the reports only
     * @param array the array to check; an empty array is trivially valid
     * @return true if the entries are in strictly ascending order and all lower-case
     */
    private static boolean isSortedLowerCase(String name, String[] array) {
        final int length = array.length;
        if (length == 0) {
            return true; // nothing to compare; also avoids indexing array[-1] below
        }
        boolean sorted = true;
        boolean strictlySorted = true;
        // The loop below only inspects entries 0..length-2, so check the last entry here
        boolean lowerCase = isLowerCase(array[length-1]);
        for (int i = 0; i < length-1; i++) { // compare all but last entry with next
            final String entry = array[i];
            final String nextEntry = array[i+1];
            final int cmp = entry.compareTo(nextEntry);
            if (cmp > 0) { // out of order
                System.out.println("Out of order entry: " + entry + " < " + nextEntry + " in " + name);
                sorted = false;
            } else if (cmp == 0) {
                strictlySorted = false;
                System.out.println("Duplicated entry: " + entry + " in " + name);
            }
            if (!isLowerCase(entry)) {
                System.out.println("Non lowerCase entry: " + entry + " in " + name);
                lowerCase = false;
            }
        }
        return sorted && strictlySorted && lowerCase;
    }
}