/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.extractor; import java.util.Collection; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import org.apache.commons.httpclient.URIException; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.exception.NestableRuntimeException; import org.archive.modules.CrawlURI; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.TextUtils; /** * A subclass of {@link ExtractorJS} that has some customized behavior for * specific kinds of web pages. As of April 2015, the one special behavior it * has is for drupal generated pages. See https://webarchive.jira.com/browse/ARI-4190 */ public class KnowledgableExtractorJS extends ExtractorJS { private static Logger LOGGER = Logger.getLogger(KnowledgableExtractorJS.class.getName()); /** * Wraps a {@link CrawlURI}, allowing baseURI to be overridden, without * changing the underlying CrawlURI. The only methods implemented are the * ones necessary for {@link ExtractorJS} to work properly. */ protected static class CustomizedCrawlURIFacade extends CrawlURI { private static final long serialVersionUID = 1l; protected CrawlURI wrapped; protected UURI baseURI; public CustomizedCrawlURIFacade(CrawlURI wrapped, UURI baseURI) { super(wrapped.getUURI(), wrapped.getPathFromSeed(), wrapped.getVia(), wrapped.getViaContext()); this.wrapped = wrapped; this.baseURI = baseURI; } /** * @return value set in {@link #KnowledgableExtractorJS(CrawlURI, UURI)} */ @Override public UURI getBaseURI() { return baseURI; } /** Delegates to wrapped CrawlURI */ @Override public CrawlURI createCrawlURI(UURI destination, LinkContext context, Hop hop) throws URIException { return wrapped.createCrawlURI(destination, context, hop); } /** Delegates to wrapped CrawlURI */ @Override public Collection<CrawlURI> getOutLinks() { return wrapped.getOutLinks(); } /** Delegates to wrapped CrawlURI */ @Override public void incrementDiscardedOutLinks() { wrapped.incrementDiscardedOutLinks(); } } public long considerStrings(Extractor ext, CrawlURI curi, CharSequence cs, boolean handlingJSFile) { CrawlURI baseUri = curi; Matcher m = TextUtils.getMatcher("jQuery\\.extend\\(Drupal\\.settings,[^'\"]*['\"]basePath['\"]:[^'\"]*['\"]([^'\"]+)['\"]", cs); if (m.find()) { String basePath = m.group(1); try { basePath = StringEscapeUtils.unescapeJavaScript(basePath); } catch (NestableRuntimeException e) { LOGGER.log(Level.WARNING, "problem unescaping purported drupal basePath '" + basePath + "'", e); } try { UURI baseUURI = UURIFactory.getInstance(curi.getUURI(), basePath); baseUri = new CustomizedCrawlURIFacade(curi, baseUURI); } catch (URIException e) { LOGGER.log(Level.WARNING, "problem creating UURI from drupal basePath '" + basePath + "'", e); } } TextUtils.recycleMatcher(m); // extract youtube videoid from youtube javascript embed and create link // for watch page m = TextUtils.getMatcher("new[\\s]+YT\\.Player\\(['\"][^'\"]+['\"],[\\s]+\\{[\\n\\s\\w:'\",]+videoId:[\\s]+['\"]([\\w-]+)['\"],", cs); if (m.find()) { String videoId = m.group(1); String newUri = "https://www.youtube.com/watch?v=" + videoId; try { addRelativeToBase(curi, ext.getExtractorParameters().getMaxOutlinks(), newUri, LinkContext.INFERRED_MISC, Hop.INFERRED); } catch (URIException e) { // no way this should happen throw new IllegalStateException(newUri, e); } } TextUtils.recycleMatcher(m); return super.considerStrings(ext, baseUri, cs, handlingJSFile); } }