/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.extractor; import java.io.IOException; import java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.URIException; import org.archive.modules.CrawlURI; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.PdfReaderContentParser; import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; import com.itextpdf.text.pdf.parser.TextExtractionStrategy; /** * PDF Content Extractor. This will parse the text content of a PDF and apply a * regex to search for links within the body of the text. * * Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar * * @contributor adam */ public class ExtractorPDFContent extends ContentExtractor { @SuppressWarnings("unused") private static final long serialVersionUID = 3L; private static final Logger LOGGER = Logger.getLogger(ExtractorPDFContent.class.getName()); public static final Pattern URLPattern = Pattern.compile( "(?i)\\(?(https?):\\/\\/"+ // protocol "(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+ // username "(:([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+)?"+ // password "@)?(?"+ // auth requires @ ")((([a-z0-9]\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\.)*"+ // domain segments AND "[a-z][a-z0-9-]*[a-z0-9]"+ // top level domain OR "|((\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])\\.){3}"+ "(\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])"+ // IP address ")(:\\d+)?)"+ // port "(((\\/+([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // path "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?)?)?"+ // query string "(\\n(?!http://)"+ // possible newline (seems to happen in pdfs) "((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // continue possible path "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+ // or possible query ")?"); /** * The maximum size of PDF files to consider. PDFs larger than this * maximum will not be searched for links. */ { setMaxSizeToParse(10*1024*1024L); // 10MB } public long getMaxSizeToParse() { return (Long) kp.get("maxSizeToParse"); } public void setMaxSizeToParse(long threshold) { kp.put("maxSizeToParse",threshold); } public ExtractorPDFContent() { } protected boolean innerExtract(CrawlURI curi){ PdfReader documentReader; ArrayList<String> uris = new ArrayList<String>(); try { documentReader = new PdfReader(curi.getRecorder().getContentReplayInputStream()); for(int i=1; i<= documentReader.getNumberOfPages(); i++) { //Page numbers start at 1 String pageParseText = extractPageText(documentReader,i); Matcher matcher = URLPattern.matcher(pageParseText); while(matcher.find()) { String prospectiveURL = pageParseText.substring(matcher.start(),matcher.end()).trim(); //handle URLs wrapped in parentheses if(prospectiveURL.startsWith("(")) { prospectiveURL=prospectiveURL.substring(1,prospectiveURL.length()); if(prospectiveURL.endsWith(")")) prospectiveURL=prospectiveURL.substring(0,prospectiveURL.length()-1); } uris.add(prospectiveURL); //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.' if(prospectiveURL.endsWith(".") && prospectiveURL.length()>2) uris.add(prospectiveURL.substring(0, prospectiveURL.length()-1)); //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong if(matcher.group(19)!=null) { String alternateURL = matcher.group(1)+"://"+(matcher.group(2)!=null?matcher.group(2):"")+matcher.group(6)+matcher.group(13); //Again, handle URLs wrapped in parentheses if(prospectiveURL.startsWith("(") && alternateURL.endsWith(")")) alternateURL=alternateURL.substring(0,alternateURL.length()-1); uris.add(alternateURL); } } } } catch (IOException e) { curi.getNonFatalFailures().add(e); return false; } catch (RuntimeException e) { curi.getNonFatalFailures().add(e); return false; } if (uris.size()<1) { return true; } for (String uri: uris) { try { LinkContext lc = LinkContext.NAVLINK_MISC; Hop hop = Hop.NAVLINK; CrawlURI out = curi.createCrawlURI(uri, lc, hop); curi.getOutLinks().add(out); } catch (URIException e1) { logUriError(e1, curi.getUURI(), uri); } } numberOfLinksExtracted.addAndGet(uris.size()); LOGGER.fine(curi+" has "+uris.size()+" links."); // Set flag to indicate that link extraction is completed. return true; } public String extractPageText(PdfReader documentReader, int pageNum){ String content =""; PdfReaderContentParser parser = new PdfReaderContentParser(documentReader); TextExtractionStrategy strat; try { strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy()); content = strat.getResultantText(); } catch (IOException e) { LOGGER.log(Level.WARNING, "Failed to parse pdf text in " + Thread.currentThread().getName(), e); } return content; } @Override protected boolean shouldExtract(CrawlURI uri) { long max = getMaxSizeToParse(); if (uri.getRecorder().getRecordedInput().getSize() > max) { return false; } String ct = uri.getContentType(); return (ct != null) && (ct.startsWith("application/pdf")); } }