/**
 * Copyright (c) 2009 - 2010 AppWork UG(haftungsbeschränkt) <e-mail@appwork.org>
 * 
 * This file is part of org.appwork.utils.parser
 * 
 * This software is licensed under the Artistic License 2.0,
 * see the LICENSE file or http://www.opensource.org/licenses/artistic-license-2.0.php
 * for details
 */
package org.appwork.utils.parser;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;

import org.appwork.utils.Regex;
import org.appwork.utils.logging.Log;

/**
 * Static helpers for extracting ftp/http/https links from text.
 * 
 * @author coalado
 * 
 */
public class HTMLParser {

    /**
     * Collects every distinct ftp://, http:// or https:// link found in the
     * given text, in order of first appearance.
     * <p>
     * Each candidate is validated with {@link URL#URL(String)}; candidates
     * that cannot be parsed are skipped. Any other exception raised while
     * scanning is logged through {@link Log#exception(Throwable)} and the
     * links collected so far are returned.
     * 
     * @param source
     *            text to scan for links
     * @return list of links without duplicates; never <code>null</code>
     */
    public static ArrayList<String> findUrls(final String source) {
        /* TODO: better parsing */
        /* remove tags!! */
        final ArrayList<String> ret = new ArrayList<String>();
        try {
            for (String link : new Regex(source, "\\(?\\b(ftp://|https?://)[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]").getColumn(-1)) {
                if (link == null) {
                    continue;
                }
                /*
                 * The pattern starts with an optional "(", so a link written
                 * as "(http://host/path)" is matched including the
                 * parentheses (assuming getColumn(-1) yields the whole match -
                 * TODO confirm). java.net.URL rejects such a string, which
                 * would silently drop the link, so the parentheses are removed
                 * before validation.
                 */
                link = HTMLParser.stripEnclosingParens(link.trim());
                try {
                    new URL(link);
                } catch (final MalformedURLException ignored) {
                    /* not a parsable url: skip this candidate on purpose */
                    continue;
                }
                if (!ret.contains(link)) {
                    ret.add(link);
                }
            }
        } catch (final Exception e) {
            Log.exception(e);
        }
        return HTMLParser.removeDuplicates(ret);
    }

    /**
     * Removes a leading "(" from a matched link and, if the remainder then
     * contains more ")" than "(", also the trailing ")". Parentheses that are
     * balanced inside the link itself (e.g. ".../Foo_(bar)") are kept.
     * 
     * @param link
     *            matched link, must not be <code>null</code>
     * @return the link without the enclosing parentheses, or the unchanged
     *         link if it does not start with "("
     */
    private static String stripEnclosingParens(final String link) {
        if (!link.startsWith("(")) {
            return link;
        }
        String stripped = link.substring(1);
        if (stripped.endsWith(")")) {
            int balance = 0;
            for (int i = 0; i < stripped.length(); i++) {
                final char c = stripped.charAt(i);
                if (c == '(') {
                    balance++;
                } else if (c == ')') {
                    balance--;
                }
            }
            /* negative balance: the last ")" closes the "(" removed above */
            if (balance < 0) {
                stripped = stripped.substring(0, stripped.length() - 1);
            }
        }
        return stripped;
    }

    /**
     * Returns a new list with duplicate links removed, keeping the order of
     * first appearance.
     * <p>
     * A link containing "..." is treated as abbreviated: it is replaced by the
     * first link in the list that starts with the text in front of the "..."
     * and does not itself contain "...". If no such link exists, the
     * abbreviated link is kept as it is. <code>null</code> elements are
     * skipped.
     * 
     * @param links
     *            links to clean up, may be <code>null</code> or empty
     * @return new list without duplicates; never <code>null</code>
     */
    public static ArrayList<String> removeDuplicates(final ArrayList<String> links) {
        final ArrayList<String> tmplinks = new ArrayList<String>();
        if ((links == null) || links.isEmpty()) {
            return tmplinks;
        }
        for (final String link : links) {
            if (link == null) {
                continue;
            }
            String found = link;
            final int ellipsis = link.indexOf("...");
            if (ellipsis >= 0) {
                /* abbreviated link: look for a full link with the same prefix */
                final String check = link.substring(0, ellipsis);
                for (final String link2 : links) {
                    if ((link2 != null) && link2.startsWith(check) && !link2.contains("...")) {
                        found = link2;
                        break;
                    }
                }
            }
            /*
             * Checked on both paths: a full link may already be present,
             * either as an exact duplicate or because an abbreviated entry
             * resolved to it earlier.
             */
            if (!tmplinks.contains(found)) {
                tmplinks.add(found);
            }
        }
        return tmplinks;
    }
}