/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.util.htmllex;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;

import org.apache.commons.lang.StringEscapeUtils;
import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.archivalurl.FastArchivalUrlReplayParseEventHandler;
import org.archive.wayback.replay.html.ContextResultURIConverterFactory;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.Translate;

import junit.framework.TestCase;

/**
 * Tests {@link ContextAwareLexer} by lexing small HTML fragments, feeding the
 * nodes to a {@link FastArchivalUrlReplayParseEventHandler}, and recording
 * every URL the handler asks to have converted.
 *
 * @author brad
 */
public class ContextAwareLexerTest extends TestCase {

    /**
     * Test method for {@link org.archive.wayback.util.htmllex.ContextAwareLexer#nextNode()}.
     *
     * @throws ParserException if the lexer fails on a fragment
     * @throws IOException if the base URL cannot be parsed
     */
    public void testNextNode() throws ParserException, IOException {
        String url = "http://foo.com/";
        String date = "2001";

        // plain anchor: empty context flag, datespec passed through untouched
        ConvertOperation op = assertSingleRewrite(url, date,
                "<a href=\"/boo\">boo</a>", "http://foo.com/boo");
        assertEquals("wrong flag", "", op.context);
        assertEquals("wrong date", date, op.datespec);

        // image source: converted with the "im_" context flag
        op = assertSingleRewrite(url, date,
                "<img src=\"/boo\"></img>", "http://foo.com/boo");
        assertEquals("wrong flag", "im_", op.context);

        // leading whitespace inside the attribute value is not part of the URL
        assertSingleRewrite(url, date,
                "<a href=\" /boo\">boo</a>", "http://foo.com/boo");

        assertSingleRewrite(url, date,
                "<a href=\" /boo?foo=bar\">boo</a>",
                "http://foo.com/boo?foo=bar");

        assertSingleRewrite(url, date,
                "<a href=\" /boo?foo=bar&baz=snazz\">boo</a>",
                "http://foo.com/boo?foo=bar&baz=snazz");

        // BUGBUG: org.htmlparser.util.Translate.decode() seems broken...
        // NOTE(review): presumably "&lang" is at risk of being decoded as an
        // HTML entity here; see compareDecodes() to inspect - confirm.
        assertSingleRewrite(url, date,
                "<a href=\" /boo?foo=bar&lang=gang\">boo</a>",
                "http://foo.com/boo?foo=bar&lang=gang");

        // percent-escapes must reach the converter unchanged
        assertSingleRewrite(url, date,
                "<a href=\" /p/s-w-%E2%80%9Ctext%E2%80%9D\">boo</a>",
                "http://foo.com/p/s-w-%E2%80%9Ctext%E2%80%9D");

        // path relative doesn't get contextualized:
        String url2 = "http://foo.com/bar/baz.html";
        assertNoRewrite(url2, date, "<a href=\"kay\">key</a>");

        // server relative jumps to root directory:
        assertSingleRewrite(url2, date,
                "<a href=\"/kay\">key</a>", "http://foo.com/kay");

        // real world example:
        //   "http://www.tn.gov/comaging/"
        //   "documents/Tennessee State Plan 2009 - 2013 signed.pdf"
        // no contextualize for path-relative
        String url3 = "http://foo.com/bar/";
        assertNoRewrite(url3, date, "<a href=\"doc/foo bar.pdf\">key</a>");
    }

    /**
     * Processes {@code html} and asserts that exactly one conversion was
     * requested and that it was for {@code expectedUrl}.
     *
     * @param base URL of the page the fragment is resolved against
     * @param date datespec handed to the parse context
     * @param html HTML fragment to lex
     * @param expectedUrl URL the single conversion must have been asked for
     * @return the recorded conversion, for further checks by the caller
     */
    private ConvertOperation assertSingleRewrite(String base, String date,
            String html, String expectedUrl)
            throws IOException, ParserException {
        ConvertAccumulator ca = accumulate(base, date, html);
        assertEquals("wrong number of rewrites", 1, ca.ops.size());
        ConvertOperation op = ca.ops.get(0);
        assertEquals("wrong URL", expectedUrl, op.url);
        return op;
    }

    /**
     * Processes {@code html} and asserts that no conversion was requested.
     *
     * @param base URL of the page the fragment is resolved against
     * @param date datespec handed to the parse context
     * @param html HTML fragment to lex
     */
    private void assertNoRewrite(String base, String date, String html)
            throws IOException, ParserException {
        ConvertAccumulator ca = accumulate(base, date, html);
        assertEquals("wrong number of rewrites", 0, ca.ops.size());
    }

    /**
     * Diagnostic helper (not called by any test): prints {@code orig} next to
     * its decoding by htmlparser's {@link Translate#decode(String)} and by
     * commons-lang's {@link StringEscapeUtils#unescapeHtml(String)} so the two
     * can be compared by eye.
     *
     * @param orig text to decode with both libraries
     */
    private void compareDecodes(String orig) {
        String htmlparserDecoded = Translate.decode(orig);
        String apacheDecoded = StringEscapeUtils.unescapeHtml(orig);
        System.out.format("ORIGINAL:(%s)\n", orig);
        System.out.format("htmlparser:(%s)\n", htmlparserDecoded);
        System.out.format("apache:(%s)\n", apacheDecoded);
    }

    /**
     * Lexes {@code html} with a {@link ContextAwareLexer}, passes every node
     * to a {@link FastArchivalUrlReplayParseEventHandler}, and returns the
     * accumulator holding each conversion the handler requested.
     *
     * @param base URL of the page the fragment is resolved against
     * @param datespec datespec handed to the parse context
     * @param html HTML fragment to lex
     * @return accumulator with one entry per {@code makeReplayURI} call
     * @throws IOException if {@code base} is not a valid URL
     * @throws ParserException if lexing fails
     */
    private ConvertAccumulator accumulate(String base, String datespec,
            String html) throws IOException, ParserException {
        ConvertAccumulator ca = new ConvertAccumulator();
        ReplayParseContext rpc = new ReplayParseContext(
                new TestContextURICFactory(ca), new URL(base), datespec);
        ContextAwareLexer caLex = new ContextAwareLexer(new Lexer(html), rpc);

        FastArchivalUrlReplayParseEventHandler handler =
                new FastArchivalUrlReplayParseEventHandler();
        // no JSP resources are configured for the test handler
        handler.setCommentJsp(null);
        handler.setJspInsertPath(null);
        handler.init();

        // nextNode() returns null once the input is exhausted
        for (Node next = caLex.nextNode(); next != null;
                next = caLex.nextNode()) {
            handler.handleNode(rpc, next);
        }
        return ca;
    }

    /** One recorded call to {@code makeReplayURI}: context flag, datespec, URL. */
    public class ConvertOperation {
        String context;
        String datespec;
        String url;

        /**
         * @param c context flag of the converter that was invoked
         * @param d datespec passed to the converter
         * @param u URL passed to the converter
         */
        public ConvertOperation(String c, String d, String u) {
            context = c;
            datespec = d;
            url = u;
        }

        @Override
        public String toString() {
            return "ConvertOp:c("+context+") d("+datespec+") u("+url+")";
        }
    }

    /** Collects {@link ConvertOperation}s in the order they were requested. */
    public class ConvertAccumulator {
        ArrayList<ConvertOperation> ops;

        public ConvertAccumulator() {
            ops = new ArrayList<ContextAwareLexerTest.ConvertOperation>();
        }

        /**
         * Appends a new operation built from the given values.
         *
         * @param c context flag
         * @param d datespec
         * @param u URL
         */
        public void accumulate(String c, String d, String u) {
            ops.add(new ConvertOperation(c, d, u));
        }
    }

    /**
     * Factory handing out {@link TestContextURIC} converters that all report
     * to the same {@link ConvertAccumulator}.
     */
    public class TestContextURICFactory implements ContextResultURIConverterFactory {
        ConvertAccumulator ca;

        /** @param ca accumulator every produced converter records into */
        public TestContextURICFactory(ConvertAccumulator ca) {
            this.ca = ca;
        }

        /**
         * @param flags context flags the converter is requested for
         * @return a recording converter tagged with {@code flags}
         */
        public ResultURIConverter getContextConverter(String flags) {
            return new TestContextURIC(ca, flags);
        }
    }

    /**
     * Converter that records each request in a {@link ConvertAccumulator} and
     * returns the URL unchanged.
     */
    public class TestContextURIC implements ResultURIConverter {
        String context;
        ConvertAccumulator ca;

        /**
         * @param ca accumulator to record into
         * @param context context flag stored with every recorded operation
         */
        public TestContextURIC(ConvertAccumulator ca, String context) {
            this.context = context;
            this.ca = ca;
        }

        /**
         * Records the request and echoes {@code url} back.
         *
         * @param datespec datespec of the request
         * @param url URL to convert
         * @return {@code url}, unmodified
         */
        public String makeReplayURI(String datespec, String url) {
            ca.accumulate(context, datespec, url);
            return url;
        }
    }
}