/*******************************************************************************
* Copyright (c) 2011 Subgraph.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Subgraph - initial API and implementation
******************************************************************************/
package com.subgraph.vega.internal.analysis.urls;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.http.HttpEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.subgraph.vega.api.html.IHTMLParseResult;
import com.subgraph.vega.api.http.requests.IHttpResponse;
public class HtmlUrlExtractor {
List<URI> findHtmlUrls(IHttpResponse response) {
final IHTMLParseResult htmlParseResult = response.getParsedHTML();
if(htmlParseResult != null) {
return extractUrlsFromDocument(htmlParseResult.getJsoupDocument());
} else {
return Collections.emptyList();
}
}
List<URI> findHtmlUrls(HttpEntity entity, URI basePath) throws IOException {
final String htmlString = inputStreamToString(entity.getContent());
final Document document = Jsoup.parse(htmlString, basePath.toString());
return extractUrlsFromDocument(document);
}
private List<URI> extractUrlsFromDocument(Document document) {
final ArrayList<URI> uris = new ArrayList<URI>();
uris.addAll(extractURIs(document, "a[href]", "abs:href"));
uris.addAll(extractURIs(document, "[src]", "abs:src"));
uris.addAll(extractURIs(document, "link[href]", "abs:href"));
return uris;
}
private String inputStreamToString(InputStream in) throws IOException {
final Reader r = new InputStreamReader(in, "UTF-8");
final StringWriter w = new StringWriter();
final char[] buffer = new char[8192];
while(true) {
int n = r.read(buffer, 0, buffer.length);
if(n <= 0)
return w.toString();
w.write(buffer, 0, n);
}
}
private List<URI> extractURIs(Document document, String query, String attribute) {
final ArrayList<URI> uris = new ArrayList<URI>();
for(Element e: document.select(query)) {
String link = e.attr(attribute);
URI uri = createURI(link);
if(uri != null)
uris.add(uri);
}
return uris;
}
private URI createURI(String link) {
try {
if(link.isEmpty())
return null;
return new URI(link);
} catch (URISyntaxException ex) {
return null;
}
}
}