package org.archive.resource.html;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import junit.framework.TestCase;
public class ExtractingParseObserverTest extends TestCase {
private static final Logger LOG =
Logger.getLogger(ExtractingParseObserverTest.class.getName());
public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
"url()",
"url () ",
"url ('')",
"url (' ')",
"url('\")",
"url(')",
"url('\"')",
"url('\\\"\"')",
"url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
for(String css : tests) {
try {
TextNode tn = new TextNode(css);
epo.handleStyleNode(tn);
} catch(Exception e) {
System.err.format("And the winner is....(%s)\n", css);
e.printStackTrace();
except = true;
throw e;
}
assertFalse(except);
}
}
public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
{"url(foo.gif)","foo.gif"},
{"url('foo.gif')","foo.gif"},
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},
{"url(''foo.gif'')","foo.gif"},
{"url( foo.gif )","foo.gif"},
{"url('''')"},
{"url('foo.gif'')","foo.gif"},
};
for(String[] testa : tests) {
checkExtract(testa);
}
}
/**
* Test whether the pattern matcher does extract nothing and also does not
* not hang-up if an overlong CSS link is truncated.
*/
public void testHandleStyleNodeNoHangupTruncated() throws Exception {
StringBuilder sb = new StringBuilder();
sb.append("url(");
for (int i = 0; i < 500000; i++)
sb.append('\'');
sb.append("foo.gif");
for (int i = 0; i < 499000; i++)
sb.append('\'');
String[] test = new String[1];
test[0] = sb.toString();
checkExtract(test);
}
private void checkExtract(String[] data) throws JSONException {
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
String css = data[0];
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
try {
TextNode tn = new TextNode(css);
epo.handleStyleNode(tn);
} catch(Exception e) {
fail("Exception with CSS:" + css);
}
JSONArray a = md.optJSONArray("Links");
if(data.length > 1) {
assertNotNull(a);
assertEquals(data.length-1,a.length());
for(int i = 1; i < data.length; i++) {
Object o = a.optJSONObject(i-1);
assertTrue(o instanceof JSONObject);
JSONObject jo = (JSONObject) o;
assertEquals("CSS link extraction failed for <" + css + ">",
data[i], jo.getString("href"));
}
} else {
assertNull("Expected no extracted link for <" + css + ">", a);
}
}
private void checkLink(Multimap<String,String> links, String url, String path) {
assertTrue("Link with URL " + url + " not found", links.containsKey(url));
assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
}
private void checkLinks(Resource resource, String[][] expectedLinks) {
assertNotNull(resource);
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
MetaData md = resource.getMetaData();
LOG.info(md.toString());
Multimap<String, String> links = ArrayListMultimap.create();
JSONObject head = md.optJSONObject("Head");
if (head != null) {
// <base href="http://www.example.com/" />
String baseUrl = (String) head.opt("Base");
if (baseUrl != null) {
links.put(baseUrl, "__base__");
}
// <meta http-equiv="Refresh" content="5; URL=http://www.example.com/redirected.html" />
JSONArray metas = head.optJSONArray("Metas");
if (metas != null) {
for (int i = 0; i < metas.length(); i++) {
JSONObject o = (JSONObject) metas.optJSONObject(i);
String httpEquiv = o.optString("http-equiv");
if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) {
String metaRefreshTarget = o.optString("content");
if (metaRefreshTarget != null) {
metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
links.put(metaRefreshTarget, "__meta_refresh__");
}
}
}
}
}
// extract outlinks
List<JSONArray> linkArrays = new ArrayList<JSONArray>();
if (md.optJSONArray("Links") != null) {
linkArrays.add(md.optJSONArray("Links"));
}
try {
if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) {
linkArrays.add(md.getJSONObject("Head").getJSONArray("Link"));
}
} catch (JSONException e1) {
}
for (JSONArray ldata : linkArrays) {
for (int i = 0; i < ldata.length(); i++) {
JSONObject o = (JSONObject) ldata.optJSONObject(i);
try {
String url = o.getString("url");
links.put(url, o.getString("path"));
LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
} catch (JSONException e) {
fail("Failed to extract URL from link: " + e.getMessage());
}
}
}
assertEquals("Unexpected number of links", expectedLinks.length, links.size());
for (String[] l : expectedLinks) {
checkLink(links, l[0], l[1]);
}
}
public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
ExtractingResourceProducer extractor =
new ExtractingResourceProducer(producer, mapper);
extractor.getNext(); // skip warcinfo record
String[][] html4links = {
{"http://www.example.com/", "__base__"},
{"http://www.example.com/redirected.html", "__meta_refresh__"},
{"background.jpg", "BODY@/background"},
{"http://www.example.com/a-href.html", "A@/href"},
{"#anchor", "A@/href"},
{"image.png", "IMG@/src"},
{"image.gif", "IMG@/src"},
{"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
{"helloworld.swf", "OBJECT@/data"},
{"http://www.example.com/shakespeare.html", "Q@/cite"},
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
};
checkLinks(extractor.getNext(), html4links);
String[][] html5links = {
{"http:///www.example.com/video.html", "LINK@/href", "canonical"},
{"video.rss", "LINK@/href", "alternate"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
};
checkLinks(extractor.getNext(), html5links);
String[][] html5links2 = {
{"http://www.example.com/", "A@/href"},
};
checkLinks(extractor.getNext(), html5links2);
String[][] fbVideoLinks = {
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbVideoLinks);
String[][] dataHrefLinks = {
{"standard.css", "LINK@/href", "stylesheet"},
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
{"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
{"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
{"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
{"/content-page", "ARTICLE@/data-href"},
{"/content-page", "A@/href"},
{"/tags/content","A@/href"},
{"/tags/headlines", "A@/href"},
{"http://grabaperch.com", "DIV@/data-href"},
{"green.css", "LINK@/data-href"},
{"blue.css", "LINK@/data-href"},
{"http://codecanyon.net/user/CodingJack", "A@/data-href"},
{"jackbox/img/thumbs/4.jpg", "IMG@/src"},
{"//venobox-destination", "A@/data-href"},
{"#", "A@/href"},
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
{"#", "A@/href"},
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
};
checkLinks(extractor.getNext(), dataHrefLinks);
String[][] fbSocialLinks = {
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
{"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
{"https://www.facebook.com/zuck", "DIV@/data-href"},
{"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
{"https://www.facebook.com/facebook", "DIV@/data-href"},
{"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook", "A@/href"},
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
}
}