// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.2 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.util.Collections; import java.util.List; import java.util.ArrayList; final class URIAttributes { private static final String[] uriAttributeNames=new String[] {"action","background","cite","href","longdesc","src","usemap"}; private static final String[] objectURIAttributeNames=new String[] {"classid","codebase","data"}; public static List<Attribute> getList(final Segment segment) { if (segment==null || segment.getFirstStartTag()==null) return Collections.emptyList(); List<Attribute> attributes=new ArrayList<Attribute>(); for (String uriAttributeName : uriAttributeNames) { for (StartTag startTag : segment.getAllStartTags(uriAttributeName,null)) { Attribute attribute=startTag.getAttributes().get(uriAttributeName); attribute.startTag=startTag; attributes.add(attribute); } } for (StartTag startTag : segment.getAllStartTags(HTMLElementName.OBJECT)) { for (String uriAttributeName : objectURIAttributeNames) { Attribute attribute=startTag.getAttributes().get(uriAttributeName); if (attribute==null) continue; attribute.startTag=startTag; attributes.add(attribute); } } Collections.sort(attributes); return attributes; } public static List<Segment> getStyleURISegments(final Segment segment) { if (segment==null || segment.length()==0) return Collections.emptyList(); if (segment.getFirstStartTag()==null) { // no start tags in this segment, assume the segment is a style attribute value int urlDelimiterStartPos=segment.getSource().getParseText().indexOf("url(",segment.getBegin(),segment.getEnd()); if (urlDelimiterStartPos==-1) return Collections.emptyList(); return addURLSegmentsFromCSS(new ArrayList<Segment>(),new Segment(segment.getSource(),urlDelimiterStartPos,segment.getEnd())); } List<Segment> uriSegments=new ArrayList<Segment>(); for (StartTag startTag : segment.getAllStartTags("style",null)) { addURLSegmentsFromCSS(uriSegments,startTag.getAttributes().get("style").getValueSegment()); } return uriSegments; } private static List<Segment> addURLSegmentsFromCSS(final List<Segment> uriSegments, final Segment cssSegment) { final Source source=cssSegment.getSource(); final ParseText parseText=source.getParseText(); final int breakAtIndex=cssSegment.getEnd(); for (int pos=cssSegment.getBegin(); (pos=parseText.indexOf("url(",pos,breakAtIndex))!=-1;) { pos+=4; while (pos<breakAtIndex && Segment.isWhiteSpace(parseText.charAt(pos))) pos++; if (pos>=breakAtIndex) break; if (isQuote(parseText.charAt(pos))) { pos++; if (pos>=breakAtIndex) break; } final int uriBegin=pos; final int closingBracketPos=parseText.indexOf(')',uriBegin,breakAtIndex); if (closingBracketPos==-1) break; pos=closingBracketPos; while (Segment.isWhiteSpace(parseText.charAt(pos-1))) pos--; if (isQuote(parseText.charAt(pos-1))) pos--; final int uriEnd=pos; if (uriEnd<=uriBegin) break; uriSegments.add(new Segment(source,uriBegin,uriEnd)); pos=closingBracketPos; } return uriSegments; } private static boolean isQuote(final char ch) { return ch=='"' || ch=='\''; } }