package uk.bl.scope; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.lang3.StringUtils; import models.FieldUrl; import models.LookupEntry; import models.Target; import play.Logger; import uk.bl.Const; import uk.bl.exception.ActException; import uk.bl.exception.WhoisException; import uk.bl.wa.whois.JRubyWhois; import uk.bl.wa.whois.record.WhoisContact; import uk.bl.wa.whois.record.WhoisResult; import com.avaje.ebean.Ebean; import com.avaje.ebean.SqlRow; import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.model.CityResponse; /** * This class implements scope rule engine. * A Target is in scope if any of the following statements is true: * * Legal Deposit * ============= * 1. Manual rules settings in Target edit page * 1.1. The Target is known to be hosted in the UK (manual boolean field). * 1.2. The Target features an page that specified a UK postal address * (a manual boolean field plus a text field to hold a specific URL that contains the address). * 1.3. The Target is known to be a UK publication, according to correspondence with a curator * (a manual boolean field plus a text field to hold details of the correspondence). * 1.4. The Target is known to be a UK publication, in the professional judgement of a curator * (a manual boolean field plus a text field to hold the justification). * 1.5. If no LD criteria met - checking result for scope is negativeTarget * (a manual boolean field). * * 2. By permission * The Target is one for which we have a license that gives us permission to crawl the site * (and make it available), even if the Target does not fall under any Legal Deposit criteria. * * 3. All URLs for this Target meet at least one of the following automated criteria: * 3.1 The authority of the URI (i.e. the hostname) end with '.uk' or other acceptable TLD (e.g. '.scot'). * 3.2 The IP address associated with the URI is geo-located in the UK * (using this GeoIP2 database, in a manner similar to our H3 GeoIP module). * 3.3 Use whois lookup service to check whether the given domain name is associated with a UK registrant. * */ public enum Scope { INSTANCE; private static final String UK_DOMAIN = ".uk"; private static final String LONDON_DOMAIN = ".london"; private static final String SCOT_DOMAIN = ".scot"; private static final String WALES_DOMAIN = ".wales"; private static final String CYMRU_DOMAIN = ".cymru"; public static List<String> DOMAINS; static { DOMAINS = new ArrayList<String>(); DOMAINS.add(UK_DOMAIN); DOMAINS.add(LONDON_DOMAIN); DOMAINS.add(SCOT_DOMAIN); DOMAINS.add(WALES_DOMAIN); DOMAINS.add(CYMRU_DOMAIN); } public static final String GEO_IP_SERVICE = "GeoLite2-City.mmdb"; public static final String UK_COUNTRY_CODE = "GB"; public static final String HTTP = "http://"; public static final String HTTPS = "https://"; public static final String WWW = "www."; public static final String END_STR = "/"; private static final int WHOIS_TIMEOUT = 15; // Whois lookup timeout (seconds) public static boolean WHOIS_ENABLED = false; // Should whois be used at all? public static DatabaseReader databaseReader; static { // A File object pointing to your GeoIP2 or GeoLite2 database File database = new File(GEO_IP_SERVICE); // This creates the DatabaseReader object, which should be reused across // lookups. try { databaseReader = new DatabaseReader.Builder(database).build(); } catch (IOException e) { Logger.warn("Can't read database file. " + e); } } /** * This method is the rule engine for checking if a given URL is in scope. * * @param url The search URL * @param nidUrl The identifier URL in the project domain model * @param whether to include by-permission as acceptable * * @return true if in scope * @throws WhoisException */ public boolean check(String url, Target target, boolean includedByPermission) { url = normalizeUrl(url); Logger.debug("Scope.check url: " + url); /** * Check if given URL is already in project database in a table LookupEntry. * If this is in return associated value, otherwise process lookup using expert rules. */ /* boolean inProjectDb = false; if (url != null && url.length() > 0) { // List<LookupEntry> lookupEntryCount = LookupEntry.filterByName(url); LookupEntry resLookupEntry = LookupEntry.findBySiteName(url); if (resLookupEntry != null && !resLookupEntry.name.toLowerCase().equals(Const.NONE)) { // if (lookupEntryCount.size() > 0) { inProjectDb = true; res = LookupEntry.getValueByUrl(url); Logger.debug("check lookup entry for '" + url + "' is in database with value: " + res); } } return res; Logger.debug("URL not in database - calculate scope"); */ // read Target fields with manual entries and match to the given NID URL (Rules 1.1 - 1.5) if (target != null && target.checkManualScope() ) { return true; } // Rule 2: by permission if (includedByPermission && target != null && target.checkLicense() ) { return true; } // Rule 3.1: check domain name if (url != null && url.length() > 0 && checkScopeDomain(url)) { return true; } // Rule 3.2: check geo IP if ( url != null && url.length() > 0 && checkGeoIp(url) ) { return true; } // Rule 3.3: check whois lookup service if ( url != null && url.length() > 0 ) { if(checkWhois(url, target) ) { return true; } } return false; } /** * * Checks if a Target is in NPLD scope by running each of it's URL fields through the checks. * * @param target * @return * @throws WhoisException */ public boolean check(Target target, boolean includedByPermission ) { for( FieldUrl url : target.fieldUrls) { if( ! check( url.url, target, includedByPermission) ) { return false; } } return true; } /** * This method queries geo IP from database * * Synchronized in case the underlying database is not thread-safe. * * @param ip - The host IP * @return true if in UK domain */ public synchronized boolean queryDb(String ip) { boolean res = false; try { // Find city by given IP CityResponse response = databaseReader.city(InetAddress.getByName(ip)); Logger.info(response.getCountry().getIsoCode()); Logger.info(response.getCountry().getName()); // Check country code in city response if (response.getCountry().getIsoCode().equals(UK_COUNTRY_CODE)) { res = true; } } catch (Exception e) { Logger.warn("GeoIP error. " + e); } Logger.debug("Geo IP query result: " + res); return res; } /** * This method normalizes passed URL that it is appropriate for IP calculation. * @param url The passed URL * @return normalized URL */ public static String normalizeUrl(String url, boolean slash) { String res = url; if (res != null && res.length() > 0) { //if (!res.contains(WWW) && !res.contains(HTTP) && !res.contains(HTTPS)) { // res = WWW + res; //} if (!res.contains(HTTP)) { if (!res.contains(HTTPS)) { res = HTTP + res; } } if (slash && !res.endsWith(END_STR)) { res = res + END_STR; } } // Logger.debug("normalized URL: " + res); return res; } public static String normalizeUrl(String url) { return normalizeUrl(url, true); } public static String normalizeUrlNoSlash(String url) { return normalizeUrl(url, false); } /** * This method comprises rule engine for checking if a given URL is in scope for rules * associated with Domain analysis. * @param url The search URL * @param nidUrl The identifier URL in the project domain model * @return true if in scope * @throws WhoisException */ public static boolean checkScopeDomain(String ourl) { // Grab the domain part: String domain; try { domain = getDomainFromUrl(normalizeUrl(ourl)); Logger.debug("Checking domain: "+domain); } catch (ActException e) { Logger.error("Exception when normalising "+ourl, e); return false; } // Rule 3.1: check domain name ends with an acceptable suffix: if ( domain != null ) { domain = domain.toLowerCase(); for( String okd : DOMAINS ) { if ( domain.endsWith(okd)) { return true; } } } return false; } /** * This method extracts host from the given URL and checks geo IP using geo IP database. * @param url * @return true if in UK domain */ public boolean checkGeoIp(String url) { boolean res = false; String ip = getIpFromUrl(url); Logger.debug("ip: " + ip); res = queryDb(ip); return res; } /** * Check parsed WHOIS result for UK/GB. * * @param whoIsRes * @return */ public static boolean isUKRegistrant( WhoisResult whoIsRes ) { boolean isUK = false; for( WhoisContact c : whoIsRes.getRegistrantContacts() ) { if( "uk".equalsIgnoreCase(c.getCountry_code()) || "gb".equalsIgnoreCase(c.getCountry_code()) ) { isUK = true; break; } if( "united kingdom".equalsIgnoreCase(c.getCountry()) || "great britain".equalsIgnoreCase(c.getCountry()) ) { isUK = true; break; } } return isUK; } /** * This method extracts domain name from the given URL and checks country or country code * in response using whois lookup service. * @param url * @return true if in UK domain * @throws WhoisException */ public boolean checkWhois(String url, Target target) { if( WHOIS_ENABLED != true ) { Logger.warn("WHOIS is currently disabled!"); return false; } // Perform whois check: Logger.info("Performing whois lookup on "+url); boolean res = false; try { System.getProperties().put("JRUBY_OPTS", "--1.9"); JRubyWhois whoIs = new JRubyWhois(); Logger.debug("checkWhois: " + url); WhoisResult whoIsRes = whoIs.lookup(getDomainFromUrl(url), WHOIS_TIMEOUT); res = isUKRegistrant(whoIsRes); Logger.debug("isUKRegistrant?: " + res); if( whoIsRes.getRegistrantContacts() != null ) { for( WhoisContact wrc : whoIsRes.getRegistrantContacts()) { Logger.debug("WhoIsRes: "+wrc.getName()+" "+wrc.getCountry()+" "+wrc.getCountry_code()); } } if( target != null ) ScopeLookupEntries.storeInProjectDb(url, "WHOIS", res, target); } catch (Exception e) { Logger.warn("whois lookup message: " + e.getMessage(),e); if( target != null ) ScopeLookupEntries.storeInProjectDb(url, "WHOIS", false, target); } Logger.debug("whois res: " + res); return res; } /** * This method converts URL to IP address. * @param url * @return IP address as a string */ public String getIpFromUrl(String url) { String ip = ""; InetAddress address; try { address = InetAddress.getByName(new URL(url).getHost()); ip = address.getHostAddress(); } catch (UnknownHostException e) { Logger.debug("ip calculation unknown host error for url=" + url + ". " + e.getMessage()); } catch (MalformedURLException e) { Logger.debug("ip calculation error for url=" + url + ". " + e.getMessage()); } return ip; } /** * Actually gets the host. * * @param url * @return * @throws ActException */ public static String getDomainFromUrl(String url) throws ActException { URL uri; try { uri = new URL(url); Logger.debug("getDomainFromUrl: "+uri); String domain = uri.getHost(); Logger.debug("getDomainFromUrl GOT: "+domain); if (StringUtils.isNotEmpty(domain)) { return domain.startsWith(WWW) ? domain.substring(4) : domain; } } catch (MalformedURLException e) { throw new ActException(e); } return null; } public boolean isUkHosting(String url) { if (this.checkGeoIp(url)) { return true; } return false; } public boolean isInScopeUkRegistration(String url, Target target) throws WhoisException { return checkWhois(url, target); } // UK GeoIP public boolean isUkHosting(Target target) { for (FieldUrl fieldUrl : target.fieldUrls) { if (!this.checkGeoIp(fieldUrl.url)) return false; } return true; } // UK Domain public static boolean isTopLevelDomain(Target target) { for (FieldUrl fieldUrl : target.fieldUrls) { if( !checkScopeDomain(fieldUrl.url)) return false; } return true; } public boolean isUkRegistration(Target target) { for (FieldUrl fieldUrl : target.fieldUrls) { if (!checkWhois(fieldUrl.url, target)) return false; } return true; } /** * * @param number * @return * @throws WhoisException */ public WhoIsData checkWhois(int number) throws WhoisException { Logger.debug("checkWhoisThread: " + number); boolean res = false; List<Target> targets = new ArrayList<Target>(); int ukRegistrantCount = 0; int nonUKRegistrantCount = 0; int failedCount = 0; JRubyWhois whoIs = new JRubyWhois(); List<Target> targetList = Target.findLastActive(number); Logger.debug("targetList: " + targetList.size()); Iterator<Target> itr = targetList.iterator(); while (itr.hasNext()) { Target target = itr.next(); for (FieldUrl fieldUrl : target.fieldUrls) { try { // Logger.debug("checkWhoisThread URL: " + target.field_url + ", last update: " + String.valueOf(target.lastUpdate)); WhoisResult whoIsRes = whoIs.lookup(getDomainFromUrl(fieldUrl.url)); // Logger.debug("whoIsRes: " + whoIsRes); // DOMAIN A UK REGISTRANT? res = isUKRegistrant(whoIsRes); if (res) ukRegistrantCount++; else nonUKRegistrantCount++; // Logger.debug("isUKRegistrant?: " + res); // STORE Logger.debug("CHECK TO SAVE " + target.fieldUrl()); ScopeLookupEntries.storeInProjectDb(fieldUrl.url, "WHOIS", res, target); // ASSIGN TO TARGET target.isUkRegistration = res; ukRegistrantCount++; } catch (Exception e) { Logger.debug("whois lookup message: " + e.getMessage()); // store in project DB // FAILED - UNCHECKED ScopeLookupEntries.storeInProjectDb(fieldUrl.url, "WHOIS", false, target); // FALSE - WHAT'S DIFF BETWEEN THAT AND NON UK? create a transient field? target.isUkRegistration = false; failedCount++; } } Ebean.update(target); targets.add(target); } // List<Target> result = Target.find.select("title").where().eq(Const.ACTIVE, true).orderBy(Const.LAST_UPDATE + " " + Const.DESC).setMaxRows(number).findList(); // LookupEntry.find.fetch("target").where().select("name").select("target.title") // // StringBuilder lookupSql = new StringBuilder("select l.name as lookup_name, t.title as title, t.updated_at as target_date, l.updated_at as lookup_date, (l.updated_at::timestamp - t.updated_at::timestamp) as diff from Lookup_entry l, Target t "); lookupSql.append(" where l.name in (select f.url from field_url as f, target tar where tar.active = true and tar.id = f.target_id order by tar.updated_at desc "); lookupSql.append(" limit ").append(number).append(") and l.target_id = t.id order by diff desc"); List<SqlRow> results = Ebean.createSqlQuery(lookupSql.toString()).findList(); // for (SqlRow row : results) { // Logger.debug("row: " + row.getString("name") + " - " + row.get("diff")); // } // List<LookupEntry> lookupEntries = LookupEntry.find.where().in("name", result).findList(); // StringBuilder builder = new StringBuilder("name in (select tar.field_url from target tar where tar.active = true order by tar.last_update desc)"); // List<LookupEntry> lookupEntries = LookupEntry.find.where().raw(builder.toString()).findList(); // Logger.debug("lookupEntries: " + lookupEntries.size()); WhoIsData whoIsData = new WhoIsData(targets, results, ukRegistrantCount, nonUKRegistrantCount, failedCount); // Logger.debug("whois res: " + res); return whoIsData; } /** * This method extracts domain name from the given URL and checks country or country code * in response using whois lookup service. * @param number The number of targets for which the elapsed time since the last check is greatest * @return true if in UK domain * @throws ActException * @throws WhoisException */ public boolean checkWhoisThread(int number) throws ActException { Logger.debug("checkWhoisThread: " + number); boolean res = false; JRubyWhois whoIs = new JRubyWhois(); List<Target> targetList = Target.findLastActive(number); Logger.debug("targetList: " + targetList.size()); Iterator<Target> itr = targetList.iterator(); while (itr.hasNext()) { Target target = itr.next(); for (FieldUrl fieldUrl : target.fieldUrls) { Logger.debug("checkWhoisThread URL: " + target.fieldUrl() + ", last update: " + String.valueOf(target.updatedAt)); WhoisResult whoIsRes = whoIs.lookup(getDomainFromUrl(fieldUrl.url)); Logger.debug("whoIsRes: " + whoIsRes); // DOMAIN A UK REGISTRANT? res = isUKRegistrant(whoIsRes); Logger.debug("isUKRegistrant?: " + res); // STORE ScopeLookupEntries.storeInProjectDb(fieldUrl.url, "WHOIS", res, target); // ASSIGN TO TARGET target.isUkRegistration = res; // Logger.debug("whois lookup message: " + e.getMessage()); // // store in project DB // // FAILED - UNCHECKED // storeInProjectDb(fieldUrl.url, false); // // FALSE - WHAT'S DIFF BETWEEN THAT AND NON UK? create a transient field? // target.isInScopeUkRegistration = false; } Ebean.update(target); } // Logger.debug("whois res: " + res); return res; } }