/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.openstreetmap.josm.data.validation.routines;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.IDN;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
import org.openstreetmap.josm.Main;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
/**
* Integration tests for the DomainValidator.
*
* @version $Revision: 1723861 $
*/
public class DomainValidatorTestIT {

    /**
     * Download and process local copy of http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * Check if the internal TLD table is up to date
     * Check if the internal TLD tables have any spurious entries
     * @throws Exception if an error occurs
     */
    @Test
    public void testIanaTldList() throws Exception {
        // Check the arrays first as this affects later checks
        // Doing this here makes it easier when updating the lists
        boolean ok = true;
        for (String list : new String[]{"INFRASTRUCTURE_TLDS", "COUNTRY_CODE_TLDS", "GENERIC_TLDS", "LOCAL_TLDS"}) {
            ok &= isSortedLowerCase(list);
        }
        if (!ok) {
            // A plain return here would make JUnit report the test as passed
            System.out.println("Fix arrays before retrying; cannot continue");
            fail("Fix arrays before retrying; cannot continue");
        }
        Set<String> ianaTlds = new HashSet<>(); // keep for comparison with array contents
        DomainValidator dv = DomainValidator.getInstance();
        File txtFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.txt");
        long timestamp;
        try {
            timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
        } catch (ConnectException e) {
            Main.error(e);
            // Try again one more time in case of random network issue
            timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt", 0L);
        }
        final File htmlFile = new File(System.getProperty("java.io.tmpdir"), "tlds-alpha-by-domain.html");
        // N.B. sometimes the html file may be updated a day or so after the txt file
        // if the txt file contains entries not found in the html file, try again in a day or two
        download(htmlFile, "http://www.iana.org/domains/root/db", timestamp);
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), StandardCharsets.UTF_8))) {
            String line;
            final String header;
            line = br.readLine(); // header
            if (line != null && line.startsWith("# Version ")) {
                header = line.substring(2);
            } else {
                throw new IOException("File does not have expected Version header");
            }
            final boolean generateUnicodeTlds = false; // Change this to generate Unicode TLDs as well
            // Parse html page to get entries
            Map<String, String[]> htmlInfo = getHtmlInfo(htmlFile);
            // Both maps go from the missing entry to the text printed as its trailing comment
            Map<String, String> missingTLD = new TreeMap<>();
            Map<String, String> missingCC = new TreeMap<>();
            while ((line = br.readLine()) != null) {
                if (!line.startsWith("#")) {
                    final String unicodeTld; // only different from asciiTld if that was punycode
                    final String asciiTld = line.toLowerCase(Locale.ENGLISH);
                    if (line.startsWith("XN--")) {
                        unicodeTld = IDN.toUnicode(line);
                    } else {
                        unicodeTld = asciiTld;
                    }
                    if (!dv.isValidTld(asciiTld)) {
                        String[] info = htmlInfo.get(asciiTld);
                        if (info != null) {
                            String type = info[0];
                            String comment = info[1];
                            if ("country-code".equals(type)) { // Which list to use?
                                missingCC.put(asciiTld, unicodeTld + " " + comment);
                                if (generateUnicodeTlds) {
                                    missingCC.put(unicodeTld, asciiTld + " " + comment);
                                }
                            } else {
                                missingTLD.put(asciiTld, unicodeTld + " " + comment);
                                if (generateUnicodeTlds) {
                                    missingTLD.put(unicodeTld, asciiTld + " " + comment);
                                }
                            }
                        } else {
                            System.err.println("Expected to find HTML info for "+ asciiTld);
                        }
                    }
                    ianaTlds.add(asciiTld);
                    // Don't merge these conditions; generateUnicodeTlds is final so needs to be separate to avoid a warning
                    if (generateUnicodeTlds) {
                        if (!unicodeTld.equals(asciiTld)) {
                            ianaTlds.add(unicodeTld);
                        }
                    }
                }
            }
            // List html entries not in TLD text list
            for (String key : (new TreeMap<>(htmlInfo)).keySet()) {
                if (!ianaTlds.contains(key)) {
                    if (isNotInRootZone(key)) {
                        System.out.println("INFO: HTML entry not yet in root zone: "+key);
                    } else {
                        System.err.println("WARN: Expected to find text entry for html: "+key);
                    }
                }
            }
            if (!missingTLD.isEmpty()) {
                printMap(header, missingTLD, "TLD");
                fail("missing TLD");
            }
            if (!missingCC.isEmpty()) {
                printMap(header, missingCC, "CC");
                fail("missing CC");
            }
        }
        // Check if internal tables contain any additional entries
        assertTrue(isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds));
        assertTrue(isInIanaList("COUNTRY_CODE_TLDS", ianaTlds));
        assertTrue(isInIanaList("GENERIC_TLDS", ianaTlds));
        // Don't check local TLDS assertTrue(isInIanaList("LOCAL_TLDS", ianaTlds));
    }

    /**
     * Prints the missing entries to standard output, one per line, formatted as
     * quoted array elements followed by a trailing comment.
     * @param header the version header of the IANA text file; skipped if {@code null}
     * @param map the missing entries, mapped to the text printed as their comment
     * @param string the name of the list the entries are missing from
     */
    private static void printMap(final String header, Map<String, String> map, String string) {
        System.out.println("Entries missing from "+ string +" List\n");
        if (header != null) {
            System.out.println(" // Taken from " + header);
        }
        for (Map.Entry<String, String> me : map.entrySet()) {
            System.out.println(" \"" + me.getKey() + "\", // " + me.getValue());
        }
        System.out.println("\nDone");
    }

    /**
     * Parses the saved IANA root zone database HTML page.
     * @param f the HTML file to read
     * @return a map from lower-cased domain to a two-element array {type, comment};
     *         entries whose comment contains "Not assigned" or "Retired", or whose type is "test",
     *         are left out
     * @throws IOException if the file cannot be read
     */
    @SuppressFBWarnings(value = "PERFORMANCE")
    private static Map<String, String[]> getHtmlInfo(final File f) throws IOException {
        final Map<String, String[]> info = new HashMap<>();
        final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
        final Pattern type = Pattern.compile("\\s+<td>([^<]+)</td>");
        final Pattern comment = Pattern.compile("\\s+<td>([^<]+)</td>");
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = domain.matcher(line);
                if (m.lookingAt()) {
                    String dom = m.group(1);
                    String typ = "??";
                    String com = "??";
                    line = br.readLine();
                    while (line != null && line.matches("^\\s*$")) { // extra blank lines introduced
                        line = br.readLine();
                    }
                    if (line == null) {
                        // File ended right after a domain link; there is nothing left to parse
                        System.err.println("Unexpected end of file after domain: " + dom);
                        break;
                    }
                    Matcher t = type.matcher(line);
                    if (t.lookingAt()) {
                        typ = t.group(1);
                        line = br.readLine();
                        if (line != null && line.matches("\\s+<!--.*")) {
                            while (line != null && !line.matches(".*-->.*")) {
                                line = br.readLine();
                            }
                            line = br.readLine();
                        }
                        // Should have comment; is it wrapped?
                        while (line != null && !line.matches(".*</td>.*")) {
                            final String next = br.readLine();
                            if (next == null) {
                                // End of file inside a cell: stop instead of appending "null" forever
                                break;
                            }
                            line += " " + next;
                        }
                        if (line != null) {
                            Matcher n = comment.matcher(line);
                            if (n.lookingAt()) {
                                com = n.group(1);
                            }
                        }
                        // Don't save unused entries
                        if (!com.contains("Not assigned") && !com.contains("Retired") && !typ.equals("test")) {
                            info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
                        }
                    } else {
                        System.err.println("Unexpected type: " + line);
                    }
                }
            }
        }
        return info;
    }

    /**
     * Download a file if it is more recent than our cached copy.
     * A readable cached copy modified less than an hour ago is reused without any request;
     * an older copy is revalidated with an If-Modified-Since header.
     * Unfortunately the server does not seem to honour If-Modified-Since for the Html page.
     * @param f the local cache file
     * @param tldurl the URL to download
     * @param timestamp the modification time of the txt file; currently not used by this method
     * @return the last-modified time of the local file
     * @throws IOException if the download fails
     */
    private static long download(File f, String tldurl, long timestamp) throws IOException {
        final int HOUR = 60*60*1000; // an hour in ms
        final long modTime;
        // For testing purposes, don't download files more than once an hour
        if (f.canRead()) {
            modTime = f.lastModified();
            if (modTime > System.currentTimeMillis()-HOUR) {
                System.out.println("Skipping download - found recent " + f);
                return modTime;
            }
        } else {
            modTime = 0;
        }
        HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
        if (modTime > 0) {
            // HTTP dates use English day/month names and GMT, whatever the platform defaults are
            SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH); //Sun, 06 Nov 1994 08:49:37 GMT
            sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
            String since = sdf.format(new Date(modTime));
            hc.addRequestProperty("If-Modified-Since", since);
            System.out.println("Found " + f + " with date " + since);
        }
        if (hc.getResponseCode() == 304) {
            System.out.println("Already have most recent " + tldurl);
        } else {
            System.out.println("Downloading " + tldurl);
            byte[] buff = new byte[1024];
            // Open the input first: if it fails, the cached file is not truncated
            try (InputStream is = hc.getInputStream();
                 FileOutputStream fos = new FileOutputStream(f)) {
                int len;
                while ((len = is.read(buff)) != -1) {
                    fos.write(buff, 0, len);
                }
            }
            System.out.println("Done");
        }
        return f.lastModified();
    }

    /**
     * Check whether the domain is in the root zone currently.
     * Reads the URL http://www.iana.org/domains/root/db/*domain*.html
     * (using a local disk cache)
     * and checks for the string "This domain is not present in the root zone at this time."
     * @param domain the domain to check
     * @return true if the string is found; false if it is not found or the page cannot be read
     */
    private static boolean isNotInRootZone(String domain) {
        String tldurl = "http://www.iana.org/domains/root/db/" + domain + ".html";
        BufferedReader in = null;
        try {
            File rootCheck = new File(System.getProperty("java.io.tmpdir"), "tld_" + domain + ".html");
            download(rootCheck, tldurl, 0L);
            in = new BufferedReader(new InputStreamReader(new FileInputStream(rootCheck), StandardCharsets.UTF_8));
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                if (inputLine.contains("This domain is not present in the root zone at this time.")) {
                    return true;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Single close point for every exit path
            closeQuietly(in);
        }
        return false;
    }

    /**
     * Closes the resource, printing rather than propagating any {@link IOException}.
     * @param in the resource to close; may be {@code null}
     */
    private static void closeQuietly(Closeable in) {
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // isInIanaList and isSortedLowerCase are split into two methods each.
    // If/when access to the arrays is possible without reflection, the intermediate
    // methods (and getStaticStringArray) can be dropped

    /**
     * Reads a static {@code String[]} field of {@link DomainValidator} by reflection.
     * A private field is made accessible for the read and reset afterwards, even if the read fails.
     * @param arrayName the name of the static field
     * @return the array held by the field
     * @throws Exception if the field does not exist or cannot be read
     */
    private static String[] getStaticStringArray(String arrayName) throws Exception {
        Field f = DomainValidator.class.getDeclaredField(arrayName);
        final boolean isPrivate = Modifier.isPrivate(f.getModifiers());
        if (isPrivate) {
            f.setAccessible(true);
        }
        try {
            return (String[]) f.get(null);
        } finally {
            if (isPrivate) {
                f.setAccessible(false);
            }
        }
    }

    /**
     * Checks that every entry of the named {@link DomainValidator} array is in the IANA list.
     * @param arrayName the name of the static array field
     * @param ianaTlds the TLDs read from the IANA text file
     * @return true if no unexpected entry was found
     * @throws Exception if the array cannot be read by reflection
     */
    private static boolean isInIanaList(String arrayName, Set<String> ianaTlds) throws Exception {
        return isInIanaList(arrayName, getStaticStringArray(arrayName), ianaTlds);
    }

    /**
     * Checks that every entry of the array is in the IANA list, printing each one that is not.
     * @param name the array name used in messages
     * @param array the entries to check
     * @param ianaTlds the TLDs read from the IANA text file
     * @return true if no unexpected entry was found
     */
    private static boolean isInIanaList(String name, String[] array, Set<String> ianaTlds) {
        boolean ok = true;
        for (String entry : array) {
            if (!ianaTlds.contains(entry)) {
                System.out.println(name + " contains unexpected value: " + entry);
                ok = false;
            }
        }
        return ok;
    }

    /**
     * Checks that the named {@link DomainValidator} array is strictly sorted and lower case.
     * @param arrayName the name of the static array field
     * @return true if the array is strictly sorted and all entries are lower case
     * @throws Exception if the array cannot be read by reflection
     */
    private static boolean isSortedLowerCase(String arrayName) throws Exception {
        return isSortedLowerCase(arrayName, getStaticStringArray(arrayName));
    }

    /**
     * Checks whether the string is unchanged by lower-casing with the English locale.
     * @param string the string to check
     * @return true if the string equals its lower-case form
     */
    private static boolean isLowerCase(String string) {
        return string.equals(string.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Check if an array is strictly sorted - and lowerCase.
     * Every offending entry is printed, so all problems show up in one run.
     * @param name the array name used in messages
     * @param array the entries to check
     * @return true if the array is strictly sorted and all entries are lower case;
     *         an empty array counts as sorted
     */
    private static boolean isSortedLowerCase(String name, String[] array) {
        final int length = array.length;
        if (length == 0) {
            return true; // nothing to compare, and array[length-1] below would be out of bounds
        }
        boolean sorted = true;
        boolean strictlySorted = true;
        boolean lowerCase = isLowerCase(array[length-1]); // Check the last entry
        for (int i = 0; i < length-1; i++) { // compare all but last entry with next
            final String entry = array[i];
            final String nextEntry = array[i+1];
            final int cmp = entry.compareTo(nextEntry);
            if (cmp > 0) { // out of order
                System.out.println("Out of order entry: " + entry + " < " + nextEntry + " in " + name);
                sorted = false;
            } else if (cmp == 0) {
                strictlySorted = false;
                System.out.println("Duplicated entry: " + entry + " in " + name);
            }
            if (!isLowerCase(entry)) {
                System.out.println("Non lowerCase entry: " + entry + " in " + name);
                lowerCase = false;
            }
        }
        return sorted && strictlySorted && lowerCase;
    }
}