/*
* Copyright 1996-2002 by Andruid Kerne. All rights reserved. CONFIDENTIAL. Use is subject to
* license terms.
*/
package ecologylab.bigsemantics.model.text.utils;
import ecologylab.generic.Debug;
import ecologylab.net.ParsedURL;
import java.util.Locale;
/**
* Filters out <code>URL</code>s that seem to be from ad servers.
*/
public class Filter extends Debug
{
    /** When {@code false}, {@link #matchLc(String)} reports "no match" for every input. */
    boolean active = true;

    /**
     * Substring patterns that mark a URL as ad-like. The two flags are read in
     * {@link #matchLc(String)} as {@code nonAlphaBefore} / {@code nonAlphaAfter}; the constructor
     * argument order is presumably (pattern, nonAlphaBefore, nonAlphaAfter) -- confirm against
     * FilterElement.
     * <p>
     * Order matters: "shopping" MUST BE LAST, because {@link #shoppingOK()} disables it by
     * shrinking {@link #count} by one.
     */
    static FilterElement[] patterns =
    {
        // new FilterElement("advertiser", false, false),
        new FilterElement("ads", true, false),
        new FilterElement("adv", true, false),
        new FilterElement("ad", true, true),
        new FilterElement("adx", true, true),
        new FilterElement("doubleclick", true, true),
        // new FilterElement("help", false, false),
        new FilterElement("banner", false, false),
        new FilterElement("wp-srv", false, false),
        new FilterElement("creditcard", false, false),
        new FilterElement("promos", true, false),
        new FilterElement("click", false, false),
        new FilterElement("shopping", false, false), // MUST BE LAST
    };

    /**
     * Number of leading entries of {@link #patterns} that are consulted when matching. This is
     * static, so changing it affects every {@code Filter} instance.
     */
    public static int count = patterns.length;

    /**
     * Stops filtering on the last pattern ("shopping") by excluding it from the consulted range.
     * Calling this more than once has no further effect.
     */
    public static void shoppingOK()
    {
        if (count == patterns.length)
        {
            count--;
        }
    }

    /*
     * Earlier URL-based overload, kept for reference:
     * public boolean match(URL url) { return match(StringTools.noAnchorNoQueryPageString(url)); }
     */

    /**
     * Tests a parsed URL against the filter patterns. (The parameter type was changed from URL
     * to ParsedURL.)
     *
     * @param parsedURL the URL to test; {@code null} is treated as "no match"
     * @return {@code true} if the lower-case form supplied by {@code parsedURL.lc()} matches
     */
    public boolean match(ParsedURL parsedURL)
    {
        if (parsedURL == null)
        {
            return false;
        }
        /* lc() method returns lower case url string from the ParsedURL */
        return matchLc(parsedURL.lc());
    }

    /**
     * Tests an arbitrary-case string against the filter patterns.
     *
     * @param s the string to test; {@code null} is treated as "no match"
     * @return {@code true} if the lower-cased string matches an active pattern
     */
    public boolean match(String s)
    {
        if (s == null)
        {
            return false;
        }
        // Locale.ROOT keeps the mapping locale-independent: with a Turkish default locale,
        // 'I' would lower-case to a dotless i and ASCII patterns such as "click" would be missed.
        return matchLc(s.toLowerCase(Locale.ROOT));
    }

    /**
     * Tests an already lower-cased string against the first {@link #count} patterns.
     *
     * @param s the lower-case string to test; {@code null} is treated as "no match"
     * @return {@code true} if the filter is active and some consulted pattern occurs in
     *         {@code s} with its boundary conditions satisfied
     */
    public boolean matchLc(String s)
    {
        if (!active || s == null)
        {
            return false;
        }
        // count is public and mutable; clamp it so a bad value cannot index outside the table.
        int numPatterns = Math.max(0, Math.min(count, patterns.length));
        for (int i = 0; i < numPatterns; i++)
        {
            if (occursWithBoundaries(s, patterns[i]))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Reports whether some occurrence of the element's pattern in {@code s} satisfies the
     * element's boundary flags.
     * <p>
     * A boundary flag requires that the neighbouring character, if there is one, is not accepted
     * by {@link Character#isJavaIdentifierStart(char)}. That predicate accepts letters as well as
     * '_' and '$' but not digits, so digits and most punctuation count as boundaries.
     */
    private static boolean occursWithBoundaries(String s, FilterElement element)
    {
        String pattern = element.pattern;
        int patternLength = pattern.length();
        int sLength = s.length();

        // Scan the original string with a moving start offset rather than chopping it with
        // substring(): chopping loses the character in front of the next occurrence, which let
        // e.g. the second "ad" in "adad" skip its nonAlphaBefore check.
        for (int index = s.indexOf(pattern); index != -1; index = s.indexOf(pattern, index + 1))
        {
            int before = index - 1;
            int after = index + patternLength;

            boolean beforeOk = !element.nonAlphaBefore
                    || before < 0
                    || !Character.isJavaIdentifierStart(s.charAt(before));
            boolean afterOk = !element.nonAlphaAfter
                    || after >= sLength
                    || !Character.isJavaIdentifierStart(s.charAt(after));

            if (beforeOk && afterOk)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Command-line check: prints the match result followed by the argument, for each argument.
     *
     * @param args strings to test against the filter
     */
    public static void main(String[] args)
    {
        Filter f = new Filter();
        for (String x : args)
        {
            println(f.match(x) + " " + x);
        }
    }
}