package at.ac.univie.mminf.qskos4j.issues.outlinks; import at.ac.univie.mminf.qskos4j.issues.Issue; import at.ac.univie.mminf.qskos4j.progress.MonitoredIterator; import at.ac.univie.mminf.qskos4j.result.ExtrapolatedCollectionResult; import at.ac.univie.mminf.qskos4j.util.RandomSubSet; import at.ac.univie.mminf.qskos4j.util.url.NoContentTypeProvidedException; import at.ac.univie.mminf.qskos4j.util.url.UrlDereferencer; import at.ac.univie.mminf.qskos4j.util.url.UrlNotDereferencableException; import org.openrdf.OpenRDFException; import org.openrdf.model.impl.URIImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.util.*; /** * Created by christian * Date: 26.01.13 * Time: 16:29 * * Finds <a href="https://github.com/cmader/qSKOS/wiki/Quality-Issues#wiki-Broken_Links">Broken Links</a>. */ public class BrokenLinks extends Issue<ExtrapolatedCollectionResult<URL>> { private final Logger logger = LoggerFactory.getLogger(BrokenLinks.class); private final static String NO_CONTENT_TYPE = "n/a"; private Map<URL, String> urlAvailability = new HashMap<>(); private Set<String> invalidResources = new HashSet<>(); private HttpURIs httpURIs; private Integer extAccessDelayMillis; private Float randomSubsetSize_percent; public BrokenLinks(HttpURIs httpURIs) { super(httpURIs, "bl", "Broken Links", "Checks dereferencability of all links", IssueType.ANALYTICAL, new URIImpl("https://github.com/cmader/qSKOS/wiki/Quality-Issues#broken-links") ); this.httpURIs = httpURIs; } @Override protected ExtrapolatedCollectionResult<URL> invoke() throws OpenRDFException { dereferenceURIs(); return new ExtrapolatedCollectionResult<>(collectUnavailableURLs(), randomSubsetSize_percent); } private void dereferenceURIs() throws OpenRDFException { Collection<URI> urisToBeDereferenced = collectUrisToBeDereferenced(); Iterator<URI> it = new MonitoredIterator<>(urisToBeDereferenced, progressMonitor); int i = 1; while (it.hasNext()) { URI uri = it.next(); logger.debug("processing link " +i+ " of "+urisToBeDereferenced.size()); i++; // delay to avoid flooding the vocabulary host try { Thread.sleep(extAccessDelayMillis); } catch (InterruptedException e) { // ignore this exception } addToResults(uri); } } private Collection<URI> collectUrisToBeDereferenced() throws OpenRDFException { if (randomSubsetSize_percent == null) { return httpURIs.getResult().getData(); } RandomSubSet<URI> urisToBeDereferenced = new RandomSubSet<>( httpURIs.getResult().getData(), randomSubsetSize_percent); logger.info("Using subset of " +urisToBeDereferenced.size()+ " URIs for broken link checking"); return urisToBeDereferenced; } private void addToResults(URI uri) { try { addToAvailabilityMap(uri.toURL()); } catch (MalformedURLException e) { invalidResources.add(uri.toString()); } } private void addToAvailabilityMap(URL url) { UrlDereferencer dereferencer = new UrlDereferencer(); String contentType; try { contentType = dereferencer.getContentType(url); } catch (UrlNotDereferencableException e) { contentType = null; logger.debug("url not dereferencable: " +url.toString()); } catch (NoContentTypeProvidedException e) { contentType = NO_CONTENT_TYPE; logger.debug("no content type in response header for " +url.toString()); } urlAvailability.put(url, contentType); } private Collection<URL> collectUnavailableURLs() { Collection<URL> unavailableURLs = new ArrayList<>(); for (URL url : urlAvailability.keySet()) { if (urlAvailability.get(url) == null) { unavailableURLs.add(url); } } return unavailableURLs; } public void setExtAccessDelayMillis(int delayMillis) { extAccessDelayMillis = delayMillis; } public void setSubsetSize(Float subsetSizePercent) { randomSubsetSize_percent = subsetSizePercent; } }