package com.netifera.platform.net.http.service.html;

import java.net.URI;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.netifera.platform.util.patternmatching.EmailCollector;

/**
 * A fetched web page: the URL it was retrieved from together with its
 * textual content. Offers simple regex-based extraction of links and
 * e-mail addresses from that content.
 */
public class WebPage {
	/** Extensions longer than this are not reported by {@link #fileType()}. */
	private static final int MAX_EXTENSION_LENGTH = 4;

	private static final String PROTOCOL_PATTERN = "https?://";
	private static final String HOST_PATTERN = "[-a-z0-9]+(\\.[-a-z0-9]+)*";
	private static final String PATH_PATTERN = "[-a-z0-9_:\\@&?=+,.!/~*%\\$]*";
	private static final String LINK_PATTERN =
			"(" + PROTOCOL_PATTERN + ")?(" + HOST_PATTERN + ")?" + PATH_PATTERN;

	/**
	 * Matches the value of an href/src/action attribute, optionally wrapped
	 * in quotes or backslashes. Group 2 holds the link text.
	 */
	private static final Pattern ATTRIBUTE_LINKS = Pattern.compile(
			"(href|src|action)=[\"'\\\\]*(" + LINK_PATTERN + ")[\"'\\\\]*",
			Pattern.CASE_INSENSITIVE);
	private static final int ATTRIBUTE_LINK_GROUP = 2;

	/**
	 * Matches any http/https URL appearing anywhere in the content.
	 * Group 1 holds the whole URL.
	 */
	private static final Pattern BARE_URLS = Pattern.compile(
			"(" + PROTOCOL_PATTERN + "(" + HOST_PATTERN + ")?" + PATH_PATTERN + ")",
			Pattern.CASE_INSENSITIVE);
	private static final int BARE_URL_GROUP = 1;

	private final URI url;
	private final String content;

	/**
	 * @param uri     location of the page; relative links are resolved against it
	 * @param content textual content of the page
	 */
	public WebPage(URI uri, String content) {
		this.url = uri;
		this.content = content;
	}

	/**
	 * @return the URL this page was created with
	 */
	public URI url() {
		return url;
	}

	/**
	 * Returns the lower-cased file extension of the last path segment of the
	 * page URL.
	 *
	 * @return the extension without the leading dot, or <code>null</code> when
	 *         the URL has no path, the last segment has no (non-empty)
	 *         extension, or the extension is longer than
	 *         {@value #MAX_EXTENSION_LENGTH} characters
	 */
	public String fileType() {
		// getPath() is null for opaque URIs such as "mailto:someone@example.com".
		String path = url.getPath();
		if (path == null) {
			return null;
		}

		// Only the last segment can carry the file extension; a dot inside a
		// directory name must not be mistaken for one.
		String fileName = path.substring(path.lastIndexOf('/') + 1);
		int dot = fileName.lastIndexOf('.');
		if (dot < 0 || dot == fileName.length() - 1) {
			return null;
		}

		// Fixed locale so the result does not depend on the JVM default
		// (e.g. "GIF" must not become "g\u0131f" under a Turkish locale).
		String extension = fileName.substring(dot + 1).toLowerCase(Locale.ENGLISH);
		return extension.length() > MAX_EXTENSION_LENGTH ? null : extension;
	}

	/**
	 * Extracts the links found in the page content: the values of
	 * href/src/action attributes and any bare http/https URLs. Every
	 * candidate is resolved against the page URL; candidates that are not
	 * valid URIs are skipped.
	 *
	 * @return a new set with the links found; empty when the content is
	 *         <code>null</code> or holds no links
	 */
	public Set<WebLink> links() {
		Set<WebLink> answer = new HashSet<WebLink>();
		if (content == null) {
			return answer;
		}
		collectLinks(ATTRIBUTE_LINKS, ATTRIBUTE_LINK_GROUP, answer);
		collectLinks(BARE_URLS, BARE_URL_GROUP, answer);
		return answer;
	}

	/**
	 * Adds to <code>target</code> one link per match of <code>pattern</code>
	 * in the content, built from capture group <code>group</code> resolved
	 * against the page URL.
	 */
	private void collectLinks(Pattern pattern, int group, Set<WebLink> target) {
		Matcher matcher = pattern.matcher(content);
		while (matcher.find()) {
			try {
				target.add(new WebLink(url.resolve(matcher.group(group))));
			} catch (IllegalArgumentException ignored) {
				// Best effort: the regex is looser than URI syntax, so a match
				// that cannot be parsed as a URI is simply not a usable link.
			}
		}
	}

	/**
	 * Runs an {@link EmailCollector} over the page content with
	 * {@link EmailCollector#PARSE_ALL}.
	 *
	 * @return the results reported by the collector
	 */
	public Set<String> emails() {
		EmailCollector collector = new EmailCollector();
		collector.parse(content, EmailCollector.PARSE_ALL);
		return collector.results();
	}
}