/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.selftest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
/**
* An example analysis module that prioritizes outlinks of URIs that contain
* a certain keyword over the outlinks of URIs that do not.
*
* <p>This is just a proof-of-concept; it isn't appropriate for actual
* production crawls, and so it lives with the test code. This module has
* the following limitations:
*
* <ol>
* <li>It doesn't parse HTML content, so a keyword of "body" would match the
* markup (the body tag) rather than only the page text.</li>
* <li>It doesn't do any language analysis (e.g., it will not match
* "political" if "politics" is the specified keyword).</li>
* <li>It can't match more than one keyword.</li>
* <li>It doesn't consider the number of times the keyword appears.</li>
* </ol>
*
* And so on. However, this module does provide a simple example of how to
* modify precedence values of a URI's links based on that URI's content.
*
* NOTE: This processor may open a ReplayCharSequence from the
* CrawlURI's Recorder, without closing that ReplayCharSequence, to allow
* reuse by later processors in sequence. In the usual (Heritrix) case, a
* call after all processing to the Recorder's endReplays() method ensures
* timely close of any reused ReplayCharSequences. Reuse of this processor
* elsewhere should ensure a similar cleanup call to Recorder.endReplays()
* occurs.
*
* @author pjack
*/
public class KeyWordProcessor extends Processor {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger LOGGER =
        Logger.getLogger(KeyWordProcessor.class.getName());

    /**
     * Key in the via URI's data map under which the cached result of the
     * pattern match is stored, so the via content is scanned at most once.
     */
    private static final String KEYWORD_HIT_KEY = "keywordHit";

    /** Content type the via URI must report for this processor to run. */
    private static final String HTML_CONTENT_TYPE = "text/html";

    /**
     * Regular expression used to detect the presence of a keyword.
     */
    Pattern pattern = Pattern.compile("\\bkeyword\\b");

    /**
     * @return the regular expression used to detect the keyword
     */
    public Pattern getPattern() {
        return this.pattern;
    }

    /**
     * @param pattern the regular expression used to detect the keyword
     */
    public void setPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Precedence value to assign to discovered links of URIs that match
     * the pattern.
     */
    int foundPrecedence = 1;

    /**
     * @return the precedence assigned when the via URI's content matches
     */
    public int getFoundPrecedence() {
        return this.foundPrecedence;
    }

    /**
     * @param prec the precedence to assign when the via URI's content matches
     */
    public void setFoundPrecedence(int prec) {
        this.foundPrecedence = prec;
    }

    /**
     * Precedence value to assign to discovered links of URIs that do not
     * match the pattern.
     */
    int notFoundPrecedence = 10;

    /**
     * @return the precedence assigned when the via URI's content does not
     *         match
     */
    public int getNotFoundPrecedence() {
        return this.notFoundPrecedence;
    }

    /**
     * @param prec the precedence to assign when the via URI's content does
     *        not match
     */
    public void setNotFoundPrecedence(int prec) {
        this.notFoundPrecedence = prec;
    }

    /**
     * Sets the precedence of {@code curi} according to whether the content
     * of its via URI matches the configured pattern.
     *
     * <p>The match result is cached in the via URI's data map, so the via
     * content is only searched the first time one of its outlinks is seen.
     * The ReplayCharSequence obtained from the via URI's recorder is not
     * closed here.
     *
     * <p>Failures are logged and otherwise ignored, leaving the precedence
     * of {@code curi} untouched; only an interruption is propagated.
     *
     * @param curi the URI whose precedence is to be set
     * @throws InterruptedException if the processing thread was interrupted
     */
    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        try {
            CrawlURI viaUri = curi.getFullVia();
            if (!viaUri.getData().containsKey(KEYWORD_HIT_KEY)) {
                ReplayCharSequence seq =
                    viaUri.getRecorder().getContentReplayCharSequence();
                viaUri.getData().put(
                    KEYWORD_HIT_KEY, getPattern().matcher(seq).find());
            }
            boolean keywordHit = (Boolean) viaUri.getData().get(KEYWORD_HIT_KEY);
            int precedence =
                keywordHit ? getFoundPrecedence() : getNotFoundPrecedence();
            curi.setPrecedence(precedence);
        } catch (Exception e) {
            // An interruption must not be swallowed by the best-effort catch.
            if (e instanceof InterruptedException) {
                throw (InterruptedException) e;
            }
            // Best effort: a failure here only means the precedence is left
            // unchanged, so report it and carry on.
            LOGGER.log(Level.WARNING,
                "Could not assign keyword precedence for " + curi, e);
        }
    }

    /**
     * Accepts only URIs that have a via URI whose content type is exactly
     * {@code text/html}.
     *
     * @param uri the candidate URI
     * @return {@code true} if {@code uri} has a via URI with content type
     *         {@code text/html}; {@code false} otherwise, including when
     *         there is no via URI or it reports no content type
     */
    @Override
    protected boolean shouldProcess(CrawlURI uri) {
        CrawlURI viaUri = uri.getFullVia();
        if (viaUri == null) {
            // No via URI to inspect, so nothing to base a precedence on.
            return false;
        }
        // Constant-first equals keeps a null content type from throwing.
        return HTML_CONTENT_TYPE.equals(viaUri.getContentType());
    }
}