/** * EasySOA Registry * Copyright 2011 Open Wide * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact : easysoa-dev@googlegroups.com */ package org.easysoa.registry.dbb.strategies; import java.net.MalformedURLException; import java.net.URL; import java.util.LinkedList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.easysoa.registry.dbb.ServiceFinderStrategy; import org.easysoa.registry.dbb.BrowsingContext; import org.easysoa.registry.dbb.FoundService; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; /** * * Service scraper based on parsing all hypertext links from a web page to find WSDLs. * * @author mkalam-alami * */ public class ScrapingStrategy extends DefaultAbstractStrategy implements ServiceFinderStrategy { private static final Log log = LogFactory.getLog(ScrapingStrategy.class); @Override public List<FoundService> findFromContext(BrowsingContext context) throws Exception { List<FoundService> foundServices = new LinkedList<FoundService>(); if (context.getData() != null) { URL url = context.getURL(); // Web page parsing HtmlCleaner cleaner = new HtmlCleaner(); TagNode cleanHtml = null; try { cleanHtml = cleaner.clean(context.getData()); } catch (StackOverflowError e) { log.warn("HtmlCleaner stack overflow while parsing " + url + ", aborting strategy"); return foundServices; } // Find app name String applicationName = guessApplicationName(context); // Find links List<String> foundServicesNames = new LinkedList<String>(); Object[] links = cleanHtml.evaluateXPath("//a"); changeToAbsolutePath(links, "href", url); for (Object o : links) { TagNode link = (TagNode) o; try { String linkHref = link.getAttributeByName("href"); if (linkHref == null) { // NB. happens in some bad html continue ; } String ref = new URL(url, linkHref).toString(); String name = (link.getText() != null) ? link.getText() .toString() : ref; // TODO else title attr // Truncate if name is an URL (serviceName cannot contain slashes) if (name.contains("/")) { String[] nameParts = name.split("/}"); name = nameParts[nameParts.length - 1].replaceAll( "(\\?|\\.|\\?wsdl)", ""); // AND NOT 'wsdl' only (see below) } // Append digits to the link name if it already exists int i = 1; if (ref != null && ref.toLowerCase().endsWith("?wsdl")) { // AND NOT "wsdl" only (see below) while (foundServicesNames.contains(name)) { name = (i == 1 ? name + i++ : name.substring(0, name.length() - 1)) + i++; } name = name.replaceAll("[\n\r]", "").trim(); String nameWithoutWsdl = name.replaceAll("([ ]*\\?WSDL|[ ]*\\?wsdl)", "").trim(); // and not only replace "wsdl" else link to http://www.w3.org/TR/wsdl titled idem would fail ///name = name.replaceAll("([\n\r]|[ ]*WSDL|[ ]*wsdl)", "").trim(); if (!nameWithoutWsdl.isEmpty()) { // NOT REQUIRED ANYMORE name = nameWithoutWsdl; } foundServices.add(new FoundService(name, ref, applicationName)); foundServicesNames.add(name); } } catch (Exception e) { // Nothing (link parsing failure: MalformedURLException or anything else...) } } } return foundServices; } private static void changeToAbsolutePath(Object[] tagNodes, String attribute, URL context) { for (Object o : tagNodes) { TagNode tag = (TagNode) o; String attrValue = tag.getAttributeByName(attribute); if ((attrValue != null) && (!attrValue.startsWith("http://"))) { try { tag.setAttribute(attribute, new URL(context, attrValue).toString()); } catch (Exception e) { // Nothing (Could not set attrValue to absolute path) } } } } }