/**
 * Copyright (C) 2011-2012 trivago GmbH <mario.mueller@trivago.com>, <christian.krause@trivago.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.trivago.mail.pigeon.web.data.process;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

/**
 * Collects the {@code href} values of all {@code <a>} elements found in a document
 * parsed with the NekoHTML {@link DOMParser}.
 *
 * <p>The class keeps no state between calls: every invocation of
 * {@link #parse(String)} works on its own parser and its own result set.
 */
public class LinkParser
{
	/**
	 * Parses the given input and returns the distinct {@code href} attribute values of
	 * every element whose node name equals {@code "a"} (case-insensitive).
	 *
	 * <p>NOTE(review): {@code content} is passed unchanged to {@code DOMParser.parse(String)}.
	 * In the Xerces parser API that overload appears to expect a system identifier (URI)
	 * rather than raw markup - confirm what callers actually pass before relying on this.
	 *
	 * @param content the value handed to {@code DOMParser.parse(String)}
	 * @return a mutable set with the collected link targets (possibly empty), or
	 *         {@code null} if parsing failed with a {@link SAXException} or
	 *         {@link IOException}; in that case the stack trace is printed to stderr
	 */
	public static Set<String> parse(final String content)
	{
		final DOMParser parser = new DOMParser();
		try
		{
			parser.parse(content);

			// Local result set: a shared static field would let overlapping calls
			// overwrite or mix each other's results.
			final Set<String> links = new HashSet<>();

			final org.w3c.dom.Document document = parser.getDocument();
			if (document != null)
			{
				collectLinks(document, links);
			}
			return links;
		}
		catch (SAXException | IOException e)
		{
			// Failure contract kept for existing callers: report and return null.
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Visits {@code start} and all of its descendants and adds the {@code href} value of
	 * every {@code a} element to {@code links}.
	 *
	 * <p>The traversal uses an explicit stack instead of recursion, so its depth does not
	 * depend on how many siblings or nesting levels the document has.
	 *
	 * @param start node at which the traversal begins
	 * @param links set that receives the collected {@code href} values
	 */
	private static void collectLinks(final Node start, final Set<String> links)
	{
		final Deque<Node> pending = new ArrayDeque<>();
		pending.push(start);

		while (!pending.isEmpty())
		{
			final Node node = pending.pop();

			if ("a".equalsIgnoreCase(node.getNodeName()) && node.hasAttributes())
			{
				final Node href = node.getAttributes().getNamedItem("href");
				if (href != null)
				{
					links.add(href.getNodeValue());
				}
			}

			// Queue every direct child; visiting order is irrelevant because the
			// result is an unordered set.
			for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
			{
				pending.push(child);
			}
		}
	}
}