/**
 * Copyright (C) 2012-2014 Gist Labs, LLC. (http://gistlabs.com)
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package com.gistlabs.mechanize.document.html;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helpers for reading a charset out of a Content-Type header and for
 * locating jsoup elements by tag name.
 *
 * This file inspired by org.jsoup.helper.DataUtil.getCharsetFromContentType(String) method.
 *
 * @author John Heintz <john@gistlabs.com>
 */
public class JsoupDataUtil {

    /**
     * Case-insensitive match of {@code charset=<name>}; the name may be preceded by
     * whitespace and an optional opening double quote, and ends at whitespace,
     * a semicolon or a double quote. Group 1 is the charset name.
     */
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");

    /** Charset used if none is found in the header or in a meta charset declaration. */
    static final String defaultCharset = "UTF-8";

    /**
     * Parse out a charset from a content type header.
     *
     * @param header e.g. a header whose value is "text/html; charset=EUC-JP"; may be null
     * @return "EUC-JP", or null if the header is null, has no value, or carries no
     *         (or an empty) charset. The charset is trimmed and uppercased.
     */
    public static String getCharsetFromContentType(Header header) {
        if (header == null) {
            return null;
        }

        String value = header.getValue();
        if (value == null || value.isEmpty()) {
            return null;
        }

        Matcher m = charsetPattern.matcher(value);
        if (!m.find()) {
            return null;
        }

        String charset = m.group(1).trim();
        if (charset.isEmpty()) {
            // "charset=" with nothing after it: treat as "not found" instead of handing
            // callers an empty charset name.
            return null;
        }

        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. under a Turkish locale 'i' would otherwise uppercase to a dotted capital I).
        return charset.toUpperCase(Locale.ROOT);
    }

    /**
     * Returns all elements matching any of the given tags (case-insensitive).
     * The given element itself is tested first, then its children recursively,
     * so each matching parent precedes its matching descendants in the result.
     *
     * @param element root of the subtree to search
     * @param tags tag names to look for
     * @return the matching elements; empty if nothing matches
     */
    public static Elements findElementsByTag(Element element, String ... tags) {
        Set<String> tagSet = new HashSet<String>();
        for (String tag : tags) {
            tagSet.add(tag.toLowerCase(Locale.ROOT));
        }

        List<Element> results = new ArrayList<Element>();
        filterElementsByTag(results, element, tagSet);
        return new Elements(results);
    }

    /**
     * Appends the element to results when its lower-cased tag name is in tagSet,
     * then recurses into each child.
     */
    private static void filterElementsByTag(List<Element> results, Element element, Set<String> tagSet) {
        if (tagSet.contains(element.tag().getName().toLowerCase(Locale.ROOT))) {
            results.add(element);
        }
        for (Element child : element.children()) {
            filterElementsByTag(results, child, tagSet);
        }
    }

    /**
     * Returns the first element found with the given tag (or tag sequence separated by '/') or null.
     *
     * @param element element to start searching from
     * @param tag a single tag name, or several joined by '/' (e.g. "table/tr")
     * @return the first element matching the whole sequence, or null if there is none
     */
    public static Element findFirstByTag(Element element, String tag) {
        return findFirstByTag(element, tag.split("/"), 0);
    }

    /**
     * Resolves tags[index..] starting at current: each candidate for tags[index] is
     * tried in the order getElementsByTag returns it, and the search backtracks to
     * the next candidate when the remaining tags cannot be matched beneath it.
     * NOTE(review): jsoup documents getElementsByTag as including the element it is
     * called on, so a step may match the current element itself - confirm this is intended.
     */
    private static Element findFirstByTag(Element current, String [] tags, int index) {
        if (index >= tags.length) {
            // Every tag in the sequence has been matched; current is the answer.
            return current;
        }

        Elements candidates = current.getElementsByTag(tags[index]);
        for (Element candidate : candidates) {
            Element result = findFirstByTag(candidate, tags, index + 1);
            if (result != null) {
                return result;
            }
        }
        return null;
    }
}