/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.resourcestore.indexer;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.archive.wayback.util.htmllex.ParseContext;
import org.archive.wayback.util.htmllex.ParseEventDelegator;
import org.archive.wayback.util.url.UrlOperations;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

/**
 * Fills in the MIME type and robot flags of a {@link CaptureSearchResult}
 * from the HTTP headers of a captured record and, when the record looks like
 * HTML, from the robot meta information found while lexing its content.
 *
 * <p>NOTE(review): every call to
 * {@link #annotateHTTPContent(CaptureSearchResult, InputStream, Header[], String)}
 * resets and reuses the single {@code RobotMetaFlags} instance held by this
 * object, so an instance is presumably not meant to be shared between
 * threads -- confirm against callers.
 */
public class HTTPRecordAnnotater {

    private static final Logger LOGGER =
        Logger.getLogger(HTTPRecordAnnotater.class.getName());

    /** Name of the HTTP header carrying the MIME type. */
    private static final String CONTENT_TYPE_HEADER = "content-type";

    /** Lower-case fragments that mark a MIME type as HTML. */
    private final static String[] mimes = { "html" };

    private final RobotMetaRule rule;
    private final ParseEventDelegator rules;
    private final RobotMetaFlags robotFlags;

    /**
     * Creates the parse-event delegator and registers a {@link RobotMetaRule}
     * on it; the rule reports into the flags object owned by this instance.
     */
    public HTTPRecordAnnotater() {
        rules = new ParseEventDelegator();
        rules.init();
        rule = new RobotMetaRule();
        robotFlags = new RobotMetaFlags();
        rule.setRobotFlags(robotFlags);
        rule.visit(rules);
    }

    /**
     * Tells whether a MIME type denotes HTML, i.e. whether it contains
     * {@code "html"} ignoring case.
     *
     * @param mimeType MIME type to test, may be {@code null}
     * @return {@code true} if {@code mimeType} contains an HTML marker;
     *         {@code false} otherwise, including for {@code null}
     */
    public boolean isHTML(String mimeType) {
        if (mimeType == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the JVM locale.
        String mimeLower = mimeType.toLowerCase(Locale.ROOT);
        for (String mime : mimes) {
            if (mimeLower.contains(mime)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every space in {@code input} with {@code %20}.
     */
    private String escapeSpaces(final String input) {
        if (input.contains(" ")) {
            return input.replace(" ", "%20");
        }
        return input;
    }

    /**
     * Tells whether a MIME value carries no information at all.
     */
    private static boolean isMissing(String value) {
        return value == null || value.length() == 0;
    }

    /**
     * Normalizes a raw Content-Type style value: everything from the first
     * {@code ';'} on is dropped (unless the value starts with the
     * {@code ';'}), surrounding whitespace is trimmed and remaining spaces
     * are replaced with {@code %20}.
     *
     * @param input raw header value, may be {@code null}
     * @return the normalized value, or {@code null} if {@code input} is
     *         {@code null}
     */
    public String transformHTTPMime(String input) {
        if (input == null) {
            return null;
        }
        int semiIdx = input.indexOf(";");
        if (semiIdx > 0) {
            return escapeSpaces(input.substring(0, semiIdx).trim());
        }
        return escapeSpaces(input.trim());
    }

    /**
     * Annotates {@code result} with a MIME type and robot flags.
     *
     * <p>The MIME type is taken from the Content-Type header; when that is
     * absent or empty, from {@code mimeGuess}; when that is absent or empty
     * too, it is set to {@code "unknown"}. Robot flags are parsed from the
     * header named by {@code WaybackConstants.X_ROBOTS_HTTP_HEADER} and, if
     * the MIME type looks like HTML, from the content read from {@code is}.
     * Header names are matched ignoring case.
     *
     * @param result capture to annotate
     * @param is stream positioned at the record content; it is only read
     *        when the MIME type looks like HTML and is not closed here
     * @param headers HTTP headers of the record, may be {@code null}
     * @param mimeGuess fallback MIME type, may be {@code null}
     */
    public void annotateHTTPContent(CaptureSearchResult result,
            InputStream is, Header[] headers, String mimeGuess) {

        robotFlags.reset();
        String mimeType = null;

        if (headers != null) {
            for (Header httpHeader : headers) {
                // Constant-first equalsIgnoreCase is null-safe for the name
                // and does not depend on the default locale.
                String name = httpHeader.getName();
                if (WaybackConstants.LOCATION_HTTP_HEADER
                        .equalsIgnoreCase(name)) {
                    // Old Comment: "Location" is supposed to be absolute:
                    // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
                    // (section 14.30) but Content-Location can be
                    // relative.
                    // is it correct to resolve a relative Location, as
                    // we are?
                    // it's also possible to have both in the HTTP
                    // headers...
                    // should we prefer one over the other?
                    // right now, we're ignoring "Content-Location"
                    //
                    // NOTE: FILLING THE REDIRECT FIELD IN CDX IS DISABLED!
                    // If we want to support redirect in cdx as long as the
                    // url is valid, uncomment the following lines:
                    // String locationStr = httpHeader.getValue();
                    // result.setRedirectUrl(
                    //     UrlOperations.resolveUrl(result.getOriginalUrl(),
                    //         locationStr, "-"));
                } else if (CONTENT_TYPE_HEADER.equalsIgnoreCase(name)) {
                    mimeType = transformHTTPMime(httpHeader.getValue());
                } else if (WaybackConstants.X_ROBOTS_HTTP_HEADER
                        .equalsIgnoreCase(name)) {
                    robotFlags.parse(httpHeader.getValue());
                }
            }
        }

        // TODO: get the encoding:
        String encoding = "utf-8";

        // An empty Content-Type value is as useless as a missing one, so it
        // must not block the fallbacks below.
        if (isMissing(mimeType)) {
            // nothing usable in the HTTP headers.. Use the WARC field:
            mimeType = transformHTTPMime(mimeGuess);
        }
        if (isMissing(mimeType)) {
            mimeType = "unknown";
        }
        result.setMimeType(mimeType);

        // Now the sticky part: If it looks like an HTML document, look for
        // robot meta tags:
        if (isHTML(mimeType)) {
            String fileContext = result.getFile() + ":" + result.getOffset();
            annotateHTMLContent(is, encoding, fileContext, result);
        }
        robotFlags.apply(result);
    }

    /**
     * Lexes {@code is} as HTML and feeds every node to the registered parse
     * rules, then signals parse completion. Parse and I/O failures are logged
     * as warnings prefixed with {@code fileContext} and are not propagated.
     *
     * @param is stream holding the HTML content; not closed here
     * @param charSet character set name used to decode {@code is}
     * @param fileContext label identifying the record in log messages
     * @param result capture being annotated; not touched by this method
     *        itself
     */
    public void annotateHTMLContent(InputStream is, String charSet,
            String fileContext, CaptureSearchResult result) {

        ParseContext context = new ParseContext();
        Node node;
        try {
            ContextAwareLexer lex = new ContextAwareLexer(
                    new Lexer(new Page(is, charSet)), context);
            while ((node = lex.nextNode()) != null) {
                rules.handleNode(context, node);
            }
            rules.handleParseComplete(context);
        } catch (ParserException e) {
            // Hand the throwable to the logger instead of printStackTrace()
            // so the stack trace ends up with the rest of the log output.
            LOGGER.log(Level.WARNING,
                    fileContext + " " + e.getLocalizedMessage(), e);
        } catch (UnsupportedEncodingException e) {
            LOGGER.log(Level.WARNING,
                    fileContext + " " + e.getLocalizedMessage(), e);
        } catch (IOException e) {
            LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
        }
    }
}