package org.commoncrawl.mapred.pipelineV3.crawllistgen;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.MimeTypeFilter;
import org.commoncrawl.util.URLUtils;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.collect.ImmutableList;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;
/**
 * Decides whether a URL should be placed on a crawl list, based on the merged crawl-db record
 * for that URL.
 *
 * <p>The decision in {@link #isURLCrawlable(GoogleURL, JsonObject)} is made from three sources,
 * consulted in this order:
 * <ol>
 *   <li>details of previous crawl attempts (HTTP result code and returned mime type),</li>
 *   <li>the extension found at the end of the URL path,</li>
 *   <li>the element type / rel strings recorded for inbound links.</li>
 * </ol>
 *
 * <p>The class also carries its own JUnit self-test ({@link #validateFilter()}).
 */
public class URLFilter {

  /**
   * Optional run of whitespace and ':' characters that separates the parts of a type-and-rel
   * string. Shared by the patterns below so the character class is spelled only once.
   */
  private static final String SEPARATORS = "[ \\t\\n\\x0B\\f\\r:]*";

  /**
   * Matches an html anchor type; group 1 is whatever follows the separators (the rel text).
   *
   * <p>NOTE(review): because the separator run is optional, this matches every value that merely
   * starts with "html:a" (for example "html:application/pdf", which is used in the self-test).
   * Confirm whether that is intended before tightening it.
   */
  static final Pattern isAnchor = Pattern.compile("^html:a" + SEPARATORS + "(.*)$");

  /**
   * Finds a nofollow / no-follow rel. Used with {@code find()}, so it is a substring match.
   */
  static final Pattern hasNoFollow =
      Pattern.compile(SEPARATORS + "(nofollow|no-follow)" + SEPARATORS);

  /** Matches an html image type. Not referenced by the code in this class. */
  static final Pattern isImage = Pattern.compile("^html:img" + SEPARATORS + "(.*)$");

  /** Finds a trackback / pingback rel. Used with {@code find()}, so it is a substring match. */
  static final Pattern hasTrackback =
      Pattern.compile(SEPARATORS + "(trackback|pingback)" + SEPARATORS);

  /**
   * Finds a "tag" rel.
   *
   * <p>NOTE(review): used with {@code find()} and with optional separators, so any rel text that
   * contains the letters "tag" anywhere is rejected - verify this is acceptable.
   */
  static final Pattern hasTag = Pattern.compile(SEPARATORS + "(tag)" + SEPARATORS);

  /** Finds a multimedia rel (image, video, audio, img). Substring match, as above. */
  static final Pattern isMultimedia =
      Pattern.compile(SEPARATORS + "(image|video|audio|img)" + SEPARATORS);

  /** Captures a trailing extension in group 1. Not referenced by the code in this class. */
  static final Pattern urlWithExtension = Pattern.compile(".*[/]*[^.]+[.]([^.]+)$");

  /** Captures the two halves of a {@code major/minor} mime type in groups 1 and 2. */
  static final Pattern mimeTypeExtractor =
      Pattern.compile(SEPARATORS + "([^/]*)/([^/]*)" + SEPARATORS);

  /** Creates a filter. The filter holds no per-instance state. */
  public URLFilter() {
  }

  /**
   * Returns whether the given URL should be crawled.
   *
   * <p>Rules, applied in order (the first rule that decides wins):
   * <ol>
   *   <li>Reject if any recorded crawl detail has an HTTP result of 404 or 301, or a mime type
   *       that {@code MimeTypeFilter.isTextType} does not accept.</li>
   *   <li>If the path ends in a non-empty extension: reject when
   *       {@code MimeTypeFilter.invalidExtensionMatcher} matches it, accept when
   *       {@code MimeTypeFilter.isTextType} accepts it, otherwise keep going.</li>
   *   <li>If link type-and-rel strings are present, walk them in order. The first string that
   *       matches {@link #isAnchor} decides the result: rejected when its rel text contains a
   *       nofollow / trackback / tag / multimedia marker, accepted otherwise. A non-anchor string
   *       that carries a text mime type accepts the URL. If the list is non-empty and nothing
   *       accepted the URL, it is rejected.</li>
   *   <li>Otherwise accept.</li>
   * </ol>
   *
   * @param urlObject the parsed URL; only its path is inspected
   * @param mergeRecord the merged crawl-db JSON record for the URL
   * @return {@code true} if the URL passes the filter, {@code false} if it should be skipped
   */
  public boolean isURLCrawlable(GoogleURL urlObject, JsonObject mergeRecord) {
    if (hasDisqualifyingCrawlHistory(mergeRecord)) {
      return false;
    }

    // match invalid extensions ...
    String path = urlObject.getPath();
    int lastIndexOfDot = path.lastIndexOf('.');
    // a dot that is the last character of the path carries no extension
    if (lastIndexOfDot != -1 && lastIndexOfDot + 1 != path.length()) {
      String extension = path.substring(lastIndexOfDot + 1);
      if (MimeTypeFilter.invalidExtensionMatcher.matches(extension)) {
        return false;
      }
      // NOTE(review): the bare extension is handed to the same isTextType check that is used for
      // mime types elsewhere in this method - presumably it understands both; confirm.
      if (MimeTypeFilter.isTextType(extension)) {
        return true;
      }
    }

    if (!mergeRecord.has(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY)) {
      return true;
    }
    JsonObject linkRecord = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY);
    // without anchor information there is nothing left to reject on
    if (!linkRecord.has(CrawlDBCommon.LINKSTATUS_TYPEANDRELS_PROPERTY)) {
      return true;
    }

    JsonArray typeAndRels = linkRecord.getAsJsonArray(CrawlDBCommon.LINKSTATUS_TYPEANDRELS_PROPERTY);
    for (JsonElement typeAndRel : typeAndRels) {
      String typeAndRelText = typeAndRel.getAsString();

      // only accept html anchor tags ...
      //TODO: OTHER TYPES ???
      Matcher anchorMatcher = isAnchor.matcher(typeAndRelText);
      if (anchorMatcher.matches()) {
        // the first anchor entry decides; later entries are never looked at
        return !hasRejectedRel(anchorMatcher.group(1));
      }

      // if not anchor tag, see if a mime type is available in rel context...
      // let text related mime types through ...
      Matcher mimeTypeMatcher = mimeTypeExtractor.matcher(typeAndRelText);
      if (mimeTypeMatcher.find()
          && MimeTypeFilter.isTextType(mimeTypeMatcher.group(1) + "/" + mimeTypeMatcher.group(2))) {
        return true;
      }
    }

    // if we had tag type and rel info and we did not pass this url, reject it here ...
    return typeAndRels.size() == 0;
  }

  /**
   * Checks the summary record's crawl-detail array for a result that rules the URL out.
   *
   * @param mergeRecord the merged crawl-db JSON record for the URL
   * @return {@code true} if any crawl detail reports HTTP 404 or 301, or a mime type that
   *     {@code MimeTypeFilter.isTextType} rejects; {@code false} if there is no summary record,
   *     no crawl-detail array, or no such entry
   */
  private static boolean hasDisqualifyingCrawlHistory(JsonObject mergeRecord) {
    if (!mergeRecord.has(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY)) {
      return false;
    }
    JsonObject summaryRecord =
        mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);
    if (!summaryRecord.has(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
      return false;
    }

    JsonArray crawlStatusArray =
        summaryRecord.getAsJsonArray(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
    for (JsonElement crawlStatus : crawlStatusArray) {
      JsonObject crawlStatusObj = crawlStatus.getAsJsonObject();

      if (crawlStatusObj.has(CrawlDBCommon.CRAWLDETAIL_HTTPRESULT_PROPERTY)) {
        int httpResult = crawlStatusObj.get(CrawlDBCommon.CRAWLDETAIL_HTTPRESULT_PROPERTY).getAsInt();
        // reject 404s or permanent 301s
        if (httpResult == 404 || httpResult == 301) {
          return true;
        }
      }

      if (crawlStatusObj.has(CrawlDBCommon.CRAWLDETAIL_MIMETYPE_PROPERTY)) {
        String mimeType = crawlStatusObj.get(CrawlDBCommon.CRAWLDETAIL_MIMETYPE_PROPERTY).getAsString();
        // reject if returned mime type was not text from previous crawl...
        if (!MimeTypeFilter.isTextType(mimeType)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Tells whether an anchor's rel text contains one of the markers this filter rejects.
   *
   * @param anchorRelText the text captured after the "html:a" prefix and separators
   * @return {@code true} if the text contains a nofollow, trackback/pingback, tag or multimedia
   *     marker
   */
  private static boolean hasRejectedRel(String anchorRelText) {
    // reject rels of nofollow, tag, trackback, and multimedia etc.
    return hasNoFollow.matcher(anchorRelText).find()
        || hasTrackback.matcher(anchorRelText).find()
        || hasTag.matcher(anchorRelText).find()
        || isMultimedia.matcher(anchorRelText).find();
  }

  /**
   * Test helper: builds a merge record shaped like the input of
   * {@link #isURLCrawlable(GoogleURL, JsonObject)}.
   *
   * <p>The record always contains a link-status object. A type-and-rels array is attached to it
   * only when {@code typeAndRels} is non-empty. A summary record holding a single crawl detail is
   * attached only when at least one of {@code optionalHttpResult} / {@code optionalMimeType} is
   * given.
   *
   * @param optionalHttpResult HTTP result to record for the crawl detail, or {@code null}
   * @param optionalMimeType mime type to record for the crawl detail, or {@code null}
   * @param typeAndRels link type-and-rel strings, or {@code null} / empty for none
   * @return the assembled merge record
   */
  static JsonObject buildTestLinkAndRel(Integer optionalHttpResult, String optionalMimeType,
      ImmutableList<String> typeAndRels) {
    JsonObject objectOut = new JsonObject();

    JsonObject linkStatusObj = new JsonObject();
    objectOut.add(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY, linkStatusObj);
    if (typeAndRels != null && typeAndRels.size() != 0) {
      JsonArray array = new JsonArray();
      for (String typeAndRel : typeAndRels) {
        array.add(new JsonPrimitive(typeAndRel));
      }
      linkStatusObj.add(CrawlDBCommon.LINKSTATUS_TYPEANDRELS_PROPERTY, array);
    }

    if (optionalHttpResult != null || optionalMimeType != null) {
      JsonObject crawlDetail = new JsonObject();
      if (optionalHttpResult != null) {
        crawlDetail.add(CrawlDBCommon.CRAWLDETAIL_HTTPRESULT_PROPERTY,
            new JsonPrimitive(optionalHttpResult));
      }
      if (optionalMimeType != null) {
        crawlDetail.add(CrawlDBCommon.CRAWLDETAIL_MIMETYPE_PROPERTY,
            new JsonPrimitive(optionalMimeType));
      }

      JsonArray crawlDetails = new JsonArray();
      crawlDetails.add(crawlDetail);

      JsonObject crawlSummary = new JsonObject();
      crawlSummary.add(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY, crawlDetails);
      objectOut.add(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY, crawlSummary);
    }
    return objectOut;
  }

  /**
   * Test helper: asserts that the URL parses as valid and that the filter returns
   * {@code expectedResult} for the URL combined with the given metadata.
   *
   * @param url the URL to test
   * @param optionalHttpResult HTTP result of a previous crawl, or {@code null}
   * @param optionalMimeType mime type of a previous crawl, or {@code null}
   * @param typeAndRels link type-and-rel strings, or {@code null}
   * @param expectedResult the value {@code isURLCrawlable} is expected to return
   */
  void validateURLAndMetadata(String url, Integer optionalHttpResult, String optionalMimeType,
      ImmutableList<String> typeAndRels, boolean expectedResult) {
    GoogleURL urlObject = new GoogleURL(url);
    Assert.assertTrue("URL did not parse as valid: " + url, urlObject.isValid());

    boolean actualResult = isURLCrawlable(urlObject,
        buildTestLinkAndRel(optionalHttpResult, optionalMimeType, typeAndRels));
    // JUnit expects (message, expected, actual); keeping that order makes failure reports truthful
    Assert.assertEquals(
        "isURLCrawlable(" + url + ", http=" + optionalHttpResult + ", mime=" + optionalMimeType
            + ", typeAndRels=" + typeAndRels + ")",
        expectedResult, actualResult);
  }

  /**
   * Self-test covering decisions made from the path extension, from link type-and-rel strings,
   * and from previous crawl results.
   */
  @Test
  public void validateFilter() throws Exception {
    // decisions made from the path extension alone
    validateURLAndMetadata("http://test.com/test.jpg",null,null, null, false);
    validateURLAndMetadata("http://test.com/test.png",null,null, null, false);
    validateURLAndMetadata("http://test.com/test.pdf",null,null, null, true);
    validateURLAndMetadata("http://test.com/test.ext",null,null, ImmutableList.of("html:application/pdf"), true);
    validateURLAndMetadata("http://test.com/test.txt",null,null, null, true);
    validateURLAndMetadata("http://test.com/test.html",null,null, null, true);
    validateURLAndMetadata("http://test.com/test.pdf",null,null, null, true);
    // an accepted extension is expected to win over the link type
    validateURLAndMetadata("http://test.com/test.pdf",null,null, ImmutableList.of("html:img"), true);
    validateURLAndMetadata("http://www.buffalo.edu/postcards/index.html?.CGI%3A%3AObjects.p.image.ab3e0c4078fb824e9f43cd07dd44834c\u003dnorth_campus_p.jpg",
        null,null,null,false);

    // decisions made from link type-and-rel strings (the .ext extension decides nothing)
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:img"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:img:a"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:img:"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a"), true);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:"), true);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:bar foo"), true);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:nofollow"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:me nofollow"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:tag"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:tag foo"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:foo tag"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:image"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:audio"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:video"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:audio"), false);
    validateURLAndMetadata("http://test.com/test.ext", null,null,ImmutableList.of("html:a:video"), false);

    // decisions made from previous crawl results
    validateURLAndMetadata("http://test.com/test.ext", 301,null,null, false);
    validateURLAndMetadata("http://test.com/test.ext", 404,null,null, false);
    validateURLAndMetadata("http://test.com/test.ext", 200,"image/png",null, false);
    validateURLAndMetadata("http://test.com/test.ext", 200,"image/jpeg",null, false);
    validateURLAndMetadata("http://test.com/test.ext", 200,"text/pdf",null, true);
    validateURLAndMetadata("http://test.com/test.ext", 200,"text/plain",null, true);

    // NOTE(review): these two lines only print and assert nothing - presumably leftover manual
    // checks of host canonicalization; kept because they still exercise URLUtils.
    System.out.println(URLUtils.getURLFPV2FromHost("mayerm%c3%bcllerschulze.de"));
    System.out.println(URLUtils.canonicalizeURL("http://" + "mayerm%c3%bcllerschulze.de", false));
  }
}