package mobac.mapsources; import java.io.IOException; import java.net.HttpURLConnection; import java.net.URL; import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import mobac.exceptions.MapSourceInitializationException; import mobac.utilities.Utilities; import mobac.utilities.writer.NullPrintWriter; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.w3c.tidy.Tidy; public class MapSourceUrlUpdater { public static final String ACCEPT = " text/*, text/html, text/html;level=1"; /** * Loads the web page specified by <code>url</code>, parses it into DOM and extracts the <code>src</code> attribute * of all <code><img></code> entities. * * @param url * http or https url * @param regex * @return * @throws IOException */ public static List<String> extractImgSrcList(String url, String regex) throws IOException { LinkedList<String> list = new LinkedList<String>(); URL u = new URL(url); HttpURLConnection conn = (HttpURLConnection) u.openConnection(); conn.addRequestProperty("Accept", ACCEPT); if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) { Utilities.getInputBytes(conn.getInputStream()); throw new IOException("Invalid HTTP response code: " + conn.getResponseCode()); } Tidy tidy = new Tidy(); tidy.setErrout(new NullPrintWriter()); // Suppress error messages Document doc = tidy.parseDOM(conn.getInputStream(), null); XPathFactory factory = XPathFactory.newInstance(); XPath xpath = factory.newXPath(); XPathExpression expr; NodeList nodes; try { expr = xpath.compile("//img[@src]"); nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET); } catch (XPathExpressionException e) { throw new RuntimeException(e); } Pattern p = null; if (regex != null) p = Pattern.compile(regex); for (int i = 0; i < nodes.getLength(); i++) { String imgUrl = nodes.item(i).getAttributes().getNamedItem("src").getNodeValue(); if (imgUrl != null && imgUrl.length() > 0) { if (p != null) { if (!p.matcher(imgUrl).matches()) continue; } list.add(imgUrl); } } return list; } /** * Retrieves the text or HTML document on the specified <code>url</code>, interprets the retrieved data as * {@link String} of {@link Charset} <code>charset</code> and returns this {@link String}. * * @param url * @param charset * @return * @throws IOException */ public static String loadDocument(String url, Charset charset) throws IOException { URL u = new URL(url); HttpURLConnection conn = (HttpURLConnection) u.openConnection(); conn.addRequestProperty("Accept", ACCEPT); byte[] data = Utilities.getInputBytes(conn.getInputStream()); if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) { throw new IOException("Invalid HTTP response code: " + conn.getResponseCode()); } return new String(data, charset); } /** * * @param url * @param charset * @param regex * regex defining one group with will be returned * @return * @throws MapSourceInitializationException */ public static String loadDocumentAndExtractGroup(String url, Charset charset, String regex) throws MapSourceInitializationException { String document; try { document = loadDocument(url, charset); } catch (IOException e) { throw new MapSourceInitializationException("Faile dto retrieve initialization document from url: " + url + "\nError: " + e.getMessage(), e); } Matcher m = Pattern.compile(regex).matcher(document); if (!m.find()) throw new MapSourceInitializationException("pattern not found: " + regex); return m.group(1); } public static void main(String[] args) { try { List<String> imgUrls = extractImgSrcList("http://maps.google.com/?ie=UTF8&ll=0,0&spn=0,0&z=2", "^http://mt\\d\\.google\\.com/.*"); for (String s : imgUrls) System.out.println(s); } catch (Exception e) { e.printStackTrace(); } } }