/******************************************************************************
* Copyright (c) 2010 Basis Technology Corp.
*
* Basis Technology Corp. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.basistech.readability;
import java.util.regex.Pattern;
/**
 * Precompiled regular expressions, plus two small matching helpers.
 *
 * <p>Every pattern built through {@link #ciPattern(String)} is case-insensitive; the ones
 * built directly with {@link Pattern#compile(String)} are case-sensitive. The class only
 * holds constants and static helpers, so it cannot be instantiated.
 */
final class Patterns {

    /** Optional {@code _}/{@code -} and a word starting with {@code p}, or a bare separator, then 1-2 trailing digits. */
    static final Pattern PAGE_NUMBER_LIKE = ciPattern("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$");

    /** A page-like word ({@code p}, {@code pg}, {@code page}, {@code paging}, ...) followed by {@code =} or {@code /} and 1-2 digits. */
    static final Pattern PAGE_AND_NUMBER = ciPattern("p(a|g|ag)?(e|ing|ination)?(=|/)[0-9]{1,2}");

    /** The literal words {@code page} or {@code paging}. */
    static final Pattern PAGE_OR_PAGING = ciPattern("(page|paging)");

    /** Keywords such as print, archive, comment, share or login. */
    static final Pattern EXTRANEOUS = ciPattern("print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single");

    /**
     * Matches: next, weiter, continue, {@code >}, {@code >>}, {@code »} -- but not {@code >|} or
     * {@code »|}, as those usually mean "last".
     */
    static final Pattern NEXT_LINK = ciPattern("(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))");

    /** The stems {@code page}, {@code paging} or {@code paginat}. */
    static final Pattern PAGINATION = ciPattern("pag(e|ing|inat)");

    /** The literal words {@code first} or {@code last}. */
    static final Pattern FIRST_OR_LAST = ciPattern("(first|last)");

    /** Keywords such as comment, footer, sidebar, sponsor or widget. */
    static final Pattern NEGATIVE = ciPattern("(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget)");

    /** Matches: prev, earl, old, new, {@code <}, {@code «}. */
    static final Pattern PREV_LINK = ciPattern("(prev|earl|old|new|<|«)");

    /** Keywords such as article, body, content, main, post or story. */
    static final Pattern POSITIVE = ciPattern("(article|body|content|entry|hentry|main|page|pagination|post|text|blog|story)");

    /**
     * Two or more consecutive {@code <br>} tags, each optionally followed by spaces, tabs or
     * line breaks.
     */
    // History: the plain greedy form "(<br[^>]*>[ \n\r\t]*){2,}" was reported to crash with a
    // stack overflow on some pages. It was then replaced by "(...)\1+", but inside a Java string
    // literal "\1" is an octal escape for the character U+0001, not a regex backreference, so
    // that pattern only matched a single <br> followed by literal U+0001 characters.
    // The possessive quantifier "{2,}+" restores the "two or more" meaning while telling the
    // engine not to keep backtracking state for each repetition. The capturing group is kept so
    // that the group count stays the same for callers.
    static final Pattern REPLACE_BRS = ciPattern("(<br[^>]*>[ \n\r\t]*){2,}+");

    /** Keywords such as comment, disqus, header, menu, sidebar, popup or twitter. */
    static final Pattern UNLIKELY_CANDIDATES = ciPattern("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter");

    /** Keywords: and, article, body, column, main, shadow. */
    // Note: additionally accepting "subheader" works better with ESPN "recap" pages, but it is
    // unclear whether that is a good enough reason to change behavior, so it is not included.
    static final Pattern OK_MAYBE_ITS_A_CANDIDATE = ciPattern("and|article|body|column|main|shadow");

    /** A period followed by a space or by the end of the input (case-sensitive). */
    static final Pattern ENDS_WITH_DOT = Pattern.compile("\\.( |$)");

    /** A single digit. */
    static final Pattern DIGIT = Pattern.compile("\\d");

    /** A {@code |} or {@code -} surrounded by single spaces. */
    static final Pattern BAR_DASH = Pattern.compile(" [\\|\\-] ");

    private Patterns() {
        // Static holder; not meant to be instantiated.
    }

    /**
     * Tests whether the <em>entire</em> string matches the pattern.
     *
     * @param pattern the compiled pattern to apply
     * @param string the text to test
     * @return {@code true} if the whole of {@code string} matches {@code pattern}
     */
    static boolean match(Pattern pattern, String string) {
        return pattern.matcher(string).matches();
    }

    /**
     * Tests whether the pattern occurs <em>anywhere</em> in the string.
     *
     * @param pattern the compiled pattern to apply
     * @param string the text to search
     * @return {@code true} if some subsequence of {@code string} matches {@code pattern}
     */
    static boolean exists(Pattern pattern, String string) {
        return pattern.matcher(string).find();
    }

    /**
     * Compiles a pattern with {@link Pattern#CASE_INSENSITIVE}.
     *
     * @param patternString the regular expression source
     * @return the compiled, case-insensitive pattern
     */
    private static Pattern ciPattern(String patternString) {
        return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
    }
}