/** * */ package uk.bl.wa.extract; /* * #%L * warc-indexer * $Id:$ * $HeadURL:$ * %% * Copyright (C) 2013 - 2014 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.HashSet; import java.util.Set; import org.apache.tika.metadata.Metadata; import uk.bl.wa.parsers.HtmlFeatureParser; import com.google.common.collect.ImmutableList; import com.google.common.net.InternetDomainName; /** * @author AnJackson * */ public class LinkExtractor { public static final String MALFORMED_HOST = "malformed.host"; /** * * @param url * @return */ public static String extractHost(String url) { String host = "unknown.host"; org.apache.commons.httpclient.URI uri = null; // Attempt to parse: try { uri = new org.apache.commons.httpclient.URI(url,false); // Extract domain: host = uri.getHost(); if( host == null ) host = MALFORMED_HOST; } catch ( Exception e ) { // Return a special hostname if parsing failed: host = MALFORMED_HOST; } return host; } /** * * @param input * @param charset * @param baseUri * @param includeImgLinks * @return * @throws IOException */ public static Set<String> extractPublicSuffixes( Metadata metadata ) throws IOException { String[] links = metadata.getValues(HtmlFeatureParser.LINK_LIST); Set<String> suffixes = new HashSet<String>(); for( String link : links ) { String suffix = extractPublicSuffix(link); if( suffix != null ) { suffixes.add(suffix); } } return suffixes; } /** * Extract the public suffix, but compensate for the fact that the library we are * using considers 'uk' to be the public suffix, rather than e.g. 'co.uk' * * @param url e.g. http://this.that.google.com/tootles * @return e.g. "com", or "co.uk". NULL if there was a parsing error. */ public static String extractPublicSuffix( String url ) { String host; try { host = new URI(url).getHost(); } catch (URISyntaxException e) { return null; } return extractPublicSuffixFromHost(host); } public static String extractPublicSuffixFromHost( String host ) { if( host == null ) return null; // Parse out the public suffix: InternetDomainName domainName; try { domainName = InternetDomainName.from(host); } catch( Exception e ) { return null; } InternetDomainName suffix = null; if( host.endsWith(".uk")) { ImmutableList<String> parts = domainName.parts(); if( parts.size() >= 2 ) { suffix = InternetDomainName.from(parts.get(parts.size() - 2) + "." + parts.get(parts.size() - 1)); } } else { suffix = domainName.publicSuffix(); } // Return a value: if( suffix == null ) return null; return suffix.toString(); } public static String extractPrivateSuffix( String url ) { String host; try { host = new URI(url).getHost(); } catch (URISyntaxException e) { return null; } return extractPrivateSuffixFromHost(host); } public static String extractPrivateSuffixFromHost( String host ) { if( host == null ) return null; // Parse out the public suffix: InternetDomainName domainName; try { domainName = InternetDomainName.from(host); } catch( Exception e ) { return null; } InternetDomainName suffix = null; if( host.endsWith(".uk")) { ImmutableList<String> parts = domainName.parts(); if( parts.size() >= 3 ) { suffix = InternetDomainName.from(parts.get(parts.size() - 3) + "." + parts.get(parts.size() - 2) + "." + parts.get(parts.size() - 1)); } } else { if( domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix() ) { suffix = domainName.topPrivateDomain(); } else { suffix = domainName; } } // Return a value: if( suffix == null ) return null; return suffix.toString(); } public static void main( String[] args ) { System.out.println("TEST: "+extractPublicSuffix("http://www.google.com/test.html")); System.out.println("TEST: "+extractPublicSuffix("http://www.google.co.uk/test.html")); System.out.println("TEST: "+extractPublicSuffix("http://www.google.sch.uk/test.html")); System.out.println("TEST: "+extractPublicSuffix("http://www.google.nhs.uk/test.html")); System.out.println("TEST: "+extractPublicSuffix("http://www.nationalarchives.gov.uk/test.html")); System.out.println("TEST: "+extractPublicSuffix("http://www.bl.uk/test.html")); } }