/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.util; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.lang.StringUtils; import org.w3c.dom.DocumentFragment; // Utility class used to extract refresh tags from HTML pages public abstract class RefreshTag { private static XPathExpression expression; private static Matcher matcher = Pattern.compile("^.*;\\s*URL=(.+)$", Pattern.CASE_INSENSITIVE).matcher(""); static { XPath xpath = XPathFactory.newInstance().newXPath(); try { expression = xpath .compile("//META[@http-equiv=\"refresh\"]/@content"); } catch (XPathExpressionException e) { throw new RuntimeException(e); } } public static String extractRefreshURL(DocumentFragment doc) { String value; try { value = (String) expression.evaluate(doc, XPathConstants.STRING); } catch (XPathExpressionException e) { return null; } if (StringUtils.isBlank(value)) return null; // 0;URL=http://www.apollocolors.com/site try { if (matcher.reset(value).matches()) { return matcher.group(1); } } catch (Exception e) { } return null; } }