/** * Copyright (c) 2008-2011 Sonatype, Inc. * All rights reserved. Includes the third-party code listed at http://www.sonatype.com/products/nexus/attributions. * * This program is free software: you can redistribute it and/or modify it only under the terms of the GNU Affero General * Public License Version 3 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License Version 3 * for more details. * * You should have received a copy of the GNU Affero General Public License Version 3 along with this program. If not, see * http://www.gnu.org/licenses. * * Sonatype Nexus (TM) Open Source Version is available from Sonatype, Inc. Sonatype and Sonatype Nexus are trademarks of * Sonatype, Inc. Apache Maven is a trademark of the Apache Foundation. M2Eclipse is a trademark of the Eclipse Foundation. * All other trademarks are the property of their respective owners. */ package org.sonatype.nexus.plugins.rrb.parsers; import java.util.ArrayList; import org.codehaus.plexus.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.sonatype.nexus.plugins.rrb.RepositoryDirectory; public class HtmlRemoteRepositoryParser implements RemoteRepositoryParser { private static final String[] EXCLUDES = { ">Skip to content<", ">Log in<", ">Products<", "Parent Directory", "?", ">../", ">..<", ">._.<", "-logo.png", ">Community<", ">Support<", ">Resources<", ">About us<", ">Downloads<", ">Documentation<", ">Resources<", ">About This Site<", ">Contact Us<", ">Legal Terms and Privacy Policy<", ">Log out<", ">IONA Technologies<", ">Site Index<", ">Skip to content<", ">Log In<" }; private final Logger logger = LoggerFactory.getLogger( HtmlRemoteRepositoryParser.class ); protected String localUrl; protected String remotePath; protected String linkStart = "<a "; protected String linkEnd = "/a>"; protected String href = "href=\""; protected String id; protected String baseUrl; public HtmlRemoteRepositoryParser( String remotePath, String localUrl, String id, String baseUrl ) { this.remotePath = remotePath; this.localUrl = localUrl; this.id = id; this.baseUrl = baseUrl; } /** * Extracts the links and sets the data in the RepositoryDirectory object. * * @param indata * @return a list of RepositoryDirectory objects */ public ArrayList<RepositoryDirectory> extractLinks( StringBuilder indata ) { ArrayList<RepositoryDirectory> result = new ArrayList<RepositoryDirectory>(); if ( indata.indexOf( linkStart.toUpperCase() ) != -1 ) { linkStart = linkStart.toUpperCase(); linkEnd = linkEnd.toUpperCase(); href = href.toUpperCase(); } int start = 0; int end = 0; if ( !remotePath.endsWith( "/" ) ) { remotePath += "/"; } if ( remotePath.equals( "/" ) ) { remotePath = ""; } if ( !localUrl.endsWith( "/" ) ) { localUrl += "/"; } do { RepositoryDirectory rp = new RepositoryDirectory(); StringBuilder temp = new StringBuilder(); start = indata.indexOf( linkStart, start ); if ( start < 0 ) { break; } end = indata.indexOf( linkEnd, start ) + linkEnd.length(); temp.append( indata.subSequence( start, end ) ); if ( !exclude( temp ) ) { if ( !getLinkName( temp ).trim().endsWith( "/" ) ) { rp.setLeaf( true ); } rp.setText( getLinkName( temp ).replace( "/", "" ).trim() ); String uri = getLinkUrl( temp ).replace( baseUrl, localUrl ); uri = uri.startsWith( localUrl ) ? uri : localUrl + remotePath + uri; rp.setResourceURI( uri ); rp.setRelativePath( uri.replace( localUrl, "" ) ); if ( !rp.getRelativePath().startsWith( "/" ) ) { rp.setRelativePath( "/" + rp.getRelativePath() ); } if ( StringUtils.isNotEmpty( rp.getText() ) ) { result.add( rp ); } logger.debug( "adding {} to result", rp.toString() ); } start = end + 1; } while ( start > 0 ); return result; } /** * Extracts the link name. */ protected String getLinkName( StringBuilder temp ) { int start = temp.indexOf( ">" ) + 1; int end = temp.indexOf( "</" ); return cleanup( temp.substring( start, end ) ); } protected String cleanup( String value ) { int start = value.indexOf( '<' ); int end = value.indexOf( '>' ); if ( start != -1 && start < end ) { CharSequence seq = value.substring( start, end + 1 ); value = value.replace( seq, "" ); cleanup( value ); } return value.trim(); } /** * Extracts the link url. */ protected String getLinkUrl( StringBuilder temp ) { int start = temp.indexOf( href ) + href.length(); int end = temp.indexOf( "\"", start + 1 ); return temp.substring( start, end ); } /** * Excludes links that are not relevant for the listing. */ boolean exclude( StringBuilder value ) { for ( String s : EXCLUDES ) { if ( value.indexOf( s ) > 0 ) { logger.debug( "{} is in EXCLUDES array", value ); return true; } } return false; } }