/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.util.htmllex; import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; import org.archive.url.UsableURI; import org.archive.url.UsableURIFactory; import org.archive.wayback.core.CaptureSearchResult; /** * Class which tracks the context and state involved with parsing an HTML * document via SAX events. * * Also holds some page URL information, and provides some URL resolving * functionality. * * Lastly, this class exposes a general purpose HashMap<String,String> for use * by specific applications. * * @author brad * @version $Date$, $Revision$ */ public class ParseContext { private static final Logger LOGGER = Logger.getLogger( ParseContext.class.getName()); protected UsableURI baseUrl = null; private boolean inHTML = false; private boolean inCSS = false; private boolean inJS = false; private boolean inScriptText = false; private HashMap<String,String> data = null; /** * constructor */ public ParseContext() { data = new HashMap<String, String>(); } /** * Stores arbitrary key value pairs in this ParseContext * @param key for storage * @param value for storage */ public void putData(String key, String value) { data.put(key, value); } /** * Retrieves previously stored data for key key from this ParseContext * @param key under which value was stored * @return previously stored value for key or null, if nothing was stored */ public String getData(String key) { return data.get(key); } /** * @return the full Map of String to String for this parsing context. */ public Map<String,String> getMap() { return data; } /** * @param baseURL an base URL for relative URLs */ public void setBaseUrl(String baseURL) { try { baseUrl = UsableURIFactory.getInstance(baseURL); } catch (URIException ex) { // XXX ex.printStackTrace(); } } /** * @param url against which relative URLs should be resolved for this parse */ public void setBaseUrl(URL url) { setBaseUrl(url.toExternalForm()); } /** * Resolve possibly-relative {@code url} with {@code baseUrl} set to * this object. * <p>Caveat: this method no longer unescape HTML entities in {@code url}. * HTML entities must be all unescaped before calling method.</p> * @param url which should be resolved * @return absolute URL. * @throws URISyntaxException if the input URL is malformed */ public String resolve(String url) throws URISyntaxException { int hashIdx = url.indexOf('#'); String frag = ""; if (hashIdx != -1) { frag = url.substring(hashIdx); url = url.substring(0, hashIdx); } if (baseUrl == null) { // TODO: log ? return url + frag; } try { url = UsableURIFactory.getInstance(baseUrl, url).toString() + frag; } catch (URIException e) { LOGGER.warning("FAILED RESOLVE: base(" + baseUrl + ") frag(" + url + ") error(" + e.getMessage() + ")"); url = url + frag; } return url; } /** * @param url which should be resolved. * @return absolute form of input url, or url itself if javascript: */ public String contextualizeUrl(String url) { if(url.startsWith("javascript:") || url.startsWith("#")) { return url; } try { return resolve(url); } catch (URISyntaxException e) { e.printStackTrace(); return url; } } /** * set to {@code true} when any HTML open tag * is found. * <p>used for checking if the content really * looks like an HTML document.</p> * @param inHTML */ public void setInHTML(boolean inHTML) { this.inHTML = inHTML; } public boolean isInHTML() { return inHTML; } /** * @return the inCSS */ public boolean isInCSS() { return inCSS; } /** * @param inCSS the inCSS to set */ public void setInCSS(boolean inCSS) { this.inCSS = inCSS; } /** * @return the inJS */ public boolean isInJS() { return inJS; } /** * @param inJS the inJS to set */ public void setInJS(boolean inJS) { this.inJS = inJS; } /** * @return the inScriptText */ public boolean isInScriptText() { return inScriptText; } /** * @param inScriptText the inScriptText to set */ public void setInScriptText(boolean inScriptText) { this.inScriptText = inScriptText; } public String getOraclePolicy() { return getData(CaptureSearchResult.CAPTURE_ORACLE_POLICY); } public void setOraclePolicy(String policy) { putData(CaptureSearchResult.CAPTURE_ORACLE_POLICY, policy); } }