package uk.bl.wa.indexer;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.ProtocolException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.tika.mime.MediaType;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.util.ArchiveUtils;
import org.archive.util.SurtPrefixSet;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;
import uk.bl.wa.analyser.payload.WARCPayloadAnalysers;
import uk.bl.wa.analyser.text.TextAnalysers;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.extract.LinkExtractor;
import uk.bl.wa.parsers.HtmlFeatureParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.HashedCachedInputStream;
import uk.bl.wa.util.Instrument;
/**
*
* Core indexer class that takes a web archive record and generates a Solr record.
*
* TODO Currently a rather crude, monolithic code structure. Should pull the different metadata generation logic out into separate classes or at least methods.
*
* @author Andrew Jackson <Andrew.Jackson@bl.uk>
*
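* <p>
* A minimal usage sketch (illustrative only: the ArchiveReader plumbing, file
* name and error handling are assumptions, not part of this class):
* </p>
* <pre>{@code
* WARCIndexer indexer = new WARCIndexer(ConfigFactory.load());
* ArchiveReader reader = ArchiveReaderFactory.get(new File("example.warc.gz"));
* for (ArchiveRecord record : reader) {
*     SolrRecord doc = indexer.extract("example.warc.gz", record);
*     if (doc != null) {
*         // ...send doc on to Solr...
*     }
* }
* reader.close();
* }</pre>
*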
*/
public class WARCIndexer {
private static Log log = LogFactory.getLog( WARCIndexer.class );
private List<String> url_excludes;
private List<String> protocol_includes;
private List<String> response_includes;
private List<String> record_type_includes;
private MessageDigest md5 = null;
private AggressiveUrlCanonicalizer canon = new AggressiveUrlCanonicalizer();
/** Text extraction and hash-based ID options: */
private boolean extractText;
private boolean storeText;
private boolean hashUrlId;
/** Wayback-style URI filtering: */
private StaticMapExclusionFilterFactory smef = null;
/** Hook to the solr server: */
private boolean checkSolrForDuplicates = false;
private SolrWebServer solrServer = null;
/** Payload Analysers */
private long inMemoryThreshold;
private long onDiskThreshold;
private WARCPayloadAnalysers wpa;
/** Text Analysers */
private TextAnalysers txa;
/** Annotations */
private Annotator ant = null;
// Paired with the HtmlFeatureParser link extractor
private final boolean addNormalisedURL;
private final AggressiveUrlCanonicalizer urlNormaliser = new AggressiveUrlCanonicalizer();
// Also canonicalise the HOST field (e.g. drop "www.")
public static final boolean CANONICALISE_HOST = true;
/* ------------------------------------------------------------ */
/**
* Default constructor, using the default configuration loaded from the execution environment.
*/
public WARCIndexer() throws NoSuchAlgorithmException {
this( ConfigFactory.parseString( ConfigFactory.load().root().render( ConfigRenderOptions.concise() ) ) );
}
/**
* Preferred constructor, allows passing in configuration from execution environment.
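*
* <p>
* A sketch of the main configuration keys read here (the values shown are
* illustrative, not authoritative defaults):
* </p>
* <pre>
* warc.index.extract.content.text = true
* warc.index.extract.content.text_stored = true
* warc.solr.use_hash_url_id = false
* warc.solr.check_solr_for_duplicates = false
* warc.index.extract.url_exclude = []
* warc.index.extract.protocol_include = [ "http", "https" ]
* warc.index.extract.response_include = [ "2" ]
* warc.index.extract.record_type_include = [ "response", "revisit" ]
* warc.index.exclusions.enabled = false
* warc.index.extract.inMemoryThreshold = 20MB
* warc.index.extract.onDiskThreshold = 200MB
* </pre>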
*/
public WARCIndexer( Config conf ) throws NoSuchAlgorithmException {
log.info("Initialising WARCIndexer...");
try {
Properties props = new Properties();
props.load(getClass().getResourceAsStream("/log4j-override.properties"));
PropertyConfigurator.configure(props);
} catch (IOException e1) {
log.error("Failed to load log4j config from properties file.");
}
// Optional configurations:
this.extractText = conf.getBoolean( "warc.index.extract.content.text" );
log.info("Extract text = " + extractText);
this.storeText = conf
.getBoolean("warc.index.extract.content.text_stored");
log.info("Store text = " + storeText);
this.hashUrlId = conf.getBoolean( "warc.solr.use_hash_url_id" );
log.info("hashUrlId = " + hashUrlId);
addNormalisedURL = conf.hasPath(HtmlFeatureParser.CONF_LINKS_NORMALISE) ?
conf.getBoolean(HtmlFeatureParser.CONF_LINKS_NORMALISE) :
HtmlFeatureParser.DEFAULT_LINKS_NORMALISE;
this.checkSolrForDuplicates = conf.getBoolean("warc.solr.check_solr_for_duplicates");
if( !this.hashUrlId && this.checkSolrForDuplicates ) {
log.warn("Checking Solr for duplicates may not work as expected when using the timestamp+md5(URL) key.");
log.warn("You need to use the payload-hash+md5(URL) key option to resolve revisit records.");
}
// URLs to exclude:
this.url_excludes = conf.getStringList( "warc.index.extract.url_exclude" );
// Protocols to include:
this.protocol_includes = conf.getStringList( "warc.index.extract.protocol_include" );
// Response codes to include:
this.response_includes = conf.getStringList( "warc.index.extract.response_include" );
// Record types to include:
this.record_type_includes = conf.getStringList( "warc.index.extract.record_type_include" );
// URL Filtering options:
if( conf.getBoolean( "warc.index.exclusions.enabled" ) ) {
smef = new StaticMapExclusionFilterFactory();
smef.setFile( conf.getString( "warc.index.exclusions.file" ) );
smef.setCheckInterval( conf.getInt( "warc.index.exclusions.check_interval" ) );
try {
smef.init();
} catch( IOException e ) {
log.error( "Failed to load exclusions file." );
throw new RuntimeException( "StaticMapExclusionFilterFactory failed with IOException when loading " + smef.getFile() );
}
}
// Instantiate required helpers:
md5 = MessageDigest.getInstance( "MD5" );
// Also hook up to Solr server for queries:
if( this.checkSolrForDuplicates ) {
log.info("Initialisating connection to Solr...");
solrServer = new SolrWebServer(conf);
}
// Set up hash-cache properties:
this.inMemoryThreshold = conf.getBytes( "warc.index.extract.inMemoryThreshold" );
this.onDiskThreshold = conf.getBytes( "warc.index.extract.onDiskThreshold" );
log.info("Hashing & Caching thresholds are: < "+this.inMemoryThreshold+" in memory, < "+this.onDiskThreshold+" on disk.");
// Set up analysers
log.info("Setting up analysers...");
this.wpa = new WARCPayloadAnalysers(conf);
this.txa = new TextAnalysers(conf);
// Log so it's clear this completed ok:
log.info("Initialisation of WARCIndexer complete.");
}
/**
* Set the annotations to apply to each record, along with the SURT prefixes
* that define open-access content.
*
* @param ann the annotations to apply
* @param openAccessSurts SURT prefixes defining open-access content
*/
public void setAnnotations(Annotations ann, SurtPrefixSet openAccessSurts) {
this.ant = new Annotator(ann, openAccessSurts);
}
/**
* @return the checkSolrForDuplicates
*/
public boolean isCheckSolrForDuplicates() {
return checkSolrForDuplicates;
}
/**
* @param checkSolrForDuplicates the checkSolrForDuplicates to set
*/
public void setCheckSolrForDuplicates(boolean checkSolrForDuplicates) {
this.checkSolrForDuplicates = checkSolrForDuplicates;
}
/**
* This extracts metadata and text from the ArchiveRecord and creates a suitable SolrRecord.
*
* @param archiveName name of the source archive file
* @param record the archive record to process
* @return the populated SolrRecord, or null if the record was filtered out
* @throws IOException
*/
public SolrRecord extract( String archiveName, ArchiveRecord record ) throws IOException {
return this.extract( archiveName, record, this.extractText );
}
/**
* This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord.
* Removes the text field if the flag is not set.
*
* @param archiveName name of the source archive file
* @param record the archive record to process
* @param isTextIncluded whether the extracted text should be kept in the record
* @return the populated SolrRecord, or null if the record was filtered out
* @throws IOException
*/
public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isTextIncluded ) throws IOException {
final long start = System.nanoTime();
ArchiveRecordHeader header = record.getHeader();
SolrRecord solr = new SolrRecord(archiveName, header);
if( !header.getHeaderFields().isEmpty() ) {
if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) ) {
log.debug("Looking at "
+ header.getHeaderValue(HEADER_KEY_TYPE));
if( !checkRecordType( ( String ) header.getHeaderValue( HEADER_KEY_TYPE ) ) ) {
return null;
}
// Store WARC record type:
solr.setField(SolrFields.SOLR_RECORD_TYPE,
(String) header.getHeaderValue(HEADER_KEY_TYPE));
} else {
// else we're processing ARCs so nothing to filter and no
// revisits
solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc");
}
if( header.getUrl() == null )
return null;
String fullUrl = header.getUrl();
log.debug("Current heap usage: "
+ FileUtils.byteCountToDisplaySize(Runtime.getRuntime()
.totalMemory()));
log.debug("Processing " + fullUrl + " from " + archiveName);
// Check the filters:
if( !this.checkProtocol( fullUrl ) )
return null;
if( !this.checkUrl( fullUrl ) )
return null;
if( !this.checkExclusionFilter( fullUrl ) )
return null;
// --- Basic headers ---
// Basic metadata:
solr.setField(SolrFields.SOURCE_FILE, archiveName);
solr.setField(SolrFields.SOURCE_FILE_OFFSET,
"" + header.getOffset());
byte[] url_md5digest = md5.digest(fullUrl.getBytes("UTF-8"));
// String url_base64 =
// Base64.encodeBase64String(fullUrl.getBytes("UTF-8"));
// N.B. despite the name, this is the Base64 encoding of the MD5 digest, not a hex string:
String url_md5hex = Base64.encodeBase64String(url_md5digest);
solr.setField( SolrFields.SOLR_URL, fullUrl );
if (addNormalisedURL) {
solr.setField( SolrFields.SOLR_URL_NORMALISED, urlNormaliser.canonicalize(fullUrl) );
}
// Get the record length; beware that for response records this includes the HTTP headers as well as the payload:
long content_length = header.getLength();
// Also pull out the file extension, if any:
String resourceName = parseResourceName(fullUrl);
solr.addField(SolrFields.RESOURCE_NAME, resourceName);
solr.addField(SolrFields.CONTENT_TYPE_EXT,
parseExtension(resourceName));
// Strip down very long URLs to avoid "org.apache.commons.httpclient.URIException: Created (escaped) uuri > 2083"
// Trac #2271: replace string-splitting with URI-based methods.
URL url = null;
if( fullUrl.length() > 2000 )
fullUrl = fullUrl.substring( 0, 2000 );
try {
url = new URL(fullUrl);
} catch (MalformedURLException e) {
// Some URIs cause problems, so fall back to the canonicalizer's
// key form of the URL.
log.error(e.getMessage());
try {
url = new URL("http://" + canon.urlStringToKey(fullUrl));
} catch (Exception e2) {
// If this fails, abandon all hope.
log.error(e2.getMessage());
return null;
}
}
// Spot 'slash pages':
if (url.getPath().equals("/") || url.getPath().equals("")
|| url.getPath().matches("/index\\.[a-z]+$")) {
solr.setField( SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE );
// Spot 'robots.txt':
} else if (url.getPath().equalsIgnoreCase("/robots.txt")) {
solr.setField( SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT );
} else {
solr.setField(SolrFields.SOLR_URL_TYPE,
SolrFields.SOLR_URL_TYPE_NORMAL);
}
// Record the host (canonicalised if configured), the private-suffix domain
// and the public suffix:
String host = url.getHost();
if (CANONICALISE_HOST)
host = canon.urlStringToKey(host).replace("/", "");
solr.setField( SolrFields.SOLR_HOST, host );
solr.setField( SolrFields.DOMAIN, LinkExtractor.extractPrivateSuffixFromHost( host ) );
solr.setField( SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost( host ) );
Instrument.timeRel("WARCIndexer.extract#total",
"WARCIndexer.extract#archeaders", start);
InputStream tikainput = null;
// Only parse HTTP headers for HTTP URIs
if( fullUrl.startsWith( "http" ) ) {
// Parse HTTP headers:
String statusCode = null;
if( record instanceof WARCRecord ) {
// There are not always headers! The code should check first.
String statusLine = HttpParser.readLine( record, "UTF-8" );
if( statusLine != null && statusLine.startsWith( "HTTP" ) ) {
String firstLine[] = statusLine.split( " " );
if( firstLine.length > 1 ) {
statusCode = firstLine[ 1 ].trim();
try {
this.processHeaders( solr, statusCode, HttpParser.parseHeaders( record, "UTF-8" ) );
} catch( ProtocolException p ) {
log.error( "ProtocolException [" + statusCode + "]: " + header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME ) + "@" + header.getHeaderValue( WARCConstants.ABSOLUTE_OFFSET_KEY ), p );
}
} else {
log.warn( "Could not parse status line: " + statusLine );
}
} else {
log.warn( "Invalid status line: " + header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME ) + "@" + header.getHeaderValue( WARCConstants.ABSOLUTE_OFFSET_KEY ) );
}
// No need for this, as the headers have already been read from the InputStream (above):
// WARCRecordUtils.getPayload(record);
tikainput = record;
} else if( record instanceof ARCRecord ) {
ARCRecord arcr = ( ARCRecord ) record;
statusCode = "" + arcr.getStatusCode();
this.processHeaders( solr, statusCode, arcr.getHttpHeaders() );
arcr.skipHttpHeader();
tikainput = arcr;
} else {
log.error( "FAIL! Unsupported archive record type." );
return solr;
}
// Skip records whose status code is not in the configured include list (typically 2xx responses only):
if( !this.checkResponseCode( statusCode ) ) {
log.debug( "Skipping this record based on status code " + statusCode + ": " + header.getUrl() );
return null;
}
}
// Update the content_length based on what's available:
content_length = tikainput.available();
// Record the length:
solr.setField(SolrFields.CONTENT_LENGTH, ""+content_length);
// -----------------------------------------------------
// Headers have been processed, payload ready to cache:
// -----------------------------------------------------
// Create an appropriately cached version of the payload, to allow analysis.
final long hashStreamStart = System.nanoTime();
HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length );
tikainput = hcis.getInputStream();
String hash = hcis.getHash();
Instrument.timeRel("WARCIndexer.extract#total",
"WARCIndexer.extract#hashstreamwrap", hashStreamStart);
// Prepare crawl date information:
String waybackDate = ( header.getDate().replaceAll( "[^0-9]", "" ) );
Date crawlDate = getWaybackDate( waybackDate );
String crawlDateString = parseCrawlDate(waybackDate);
// Optionally use a hash-based ID to store only one version of a URL:
String id = null;
if( hashUrlId ) {
id = hash + "/" + url_md5hex;
} else {
id = url_md5hex + "/" + waybackDate;
}
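// Illustrative ID shapes (example values are hypothetical):
//   hashUrlId=true  -> "<payload-hash>/<base64(md5(url))>"
//   hashUrlId=false -> "<base64(md5(url))>/<14-digit waybackDate>"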
// Set these last:
solr.setField( SolrFields.ID, id );
solr.setField( SolrFields.HASH, hash );
// -----------------------------------------------------
// Payload has been cached, ready to check crawl dates:
// -----------------------------------------------------
HashSet<Date> currentCrawlDates = new HashSet<Date>();
// If we are collapsing records based on hash:
if (hashUrlId) {
// Query for currently known crawl dates:
if (this.checkSolrForDuplicates && solrServer != null) {
SolrQuery q = new SolrQuery("id:\"" + id + "\"");
q.addField(SolrFields.CRAWL_DATES);
try {
QueryResponse results = solrServer.query(q);
if (results.getResults().size() > 0) {
SolrDocument fr = results.getResults().get(0);
if (fr.containsKey(SolrFields.CRAWL_DATES)) {
for (Object cds : fr
.getFieldValues(SolrFields.CRAWL_DATES)) {
currentCrawlDates.add((Date) cds);
}
}
} else {
log.debug("No matching entries found.");
}
} catch (SolrServerException e) {
e.printStackTrace();
// FIXME retry?
}
}
// Is the current date unknown? (inc. no-solr-check case):
if (!currentCrawlDates.contains(crawlDate)) {
// Dates to be merged under the CRAWL_DATES field:
solr.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
solr.mergeField(SolrFields.CRAWL_YEARS,
extractYear(header.getDate()));
} else {
// Otherwise, ensure that all the known dates (i.e. including
// this one) are copied over:
for (Date ccd : currentCrawlDates) {
solr.addField(SolrFields.CRAWL_DATES,
formatter.format(ccd));
solr.addField(SolrFields.CRAWL_YEARS,
getYearFromDate(ccd));
}
// TODO This could optionally skip re-submission instead?
}
}
// Sort the dates and find the earliest:
List<Date> dateList = new ArrayList<Date>(currentCrawlDates);
dateList.add(crawlDate);
Collections.sort(dateList);
Date firstDate = dateList.get(0);
solr.setField(SolrFields.CRAWL_DATE,
formatter.format(firstDate));
solr.setField( SolrFields.CRAWL_YEAR, getYearFromDate(firstDate) );
// Use the current value as the waybackDate:
solr.setField( SolrFields.WAYBACK_DATE, waybackDate );
// If this is a revisit record, we should just return an update to the crawl_dates:
if (WARCConstants.WARCRecordType.revisit.name().equalsIgnoreCase(
(String) header.getHeaderValue(HEADER_KEY_TYPE))) {
if( currentCrawlDates.contains(crawlDate) ) {
return null;
}
SolrRecord revisited = new SolrRecord();
revisited.setField( SolrFields.ID, id );
// Store crawl-date appropriately depending on whether records
// are collapsing on (hash+url) or not:
if (hashUrlId) {
revisited.mergeField(SolrFields.CRAWL_DATES,
crawlDateString);
revisited.mergeField(SolrFields.CRAWL_YEARS,
extractYear(header.getDate()));
} else {
revisited.setField(SolrFields.CRAWL_DATE, crawlDateString);
revisited.setField(SolrFields.CRAWL_YEAR,
extractYear(header.getDate()));
}
revisited.setField(SolrFields.SOLR_URL, fullUrl);
revisited.setField(SolrFields.WAYBACK_DATE, waybackDate);
String payloadDigest = (String) header
.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST);
revisited.setField(SolrFields.HASH, payloadDigest);
revisited.setField(SolrFields.SOLR_RECORD_TYPE,
(String) header.getHeaderValue(HEADER_KEY_TYPE));
return revisited;
}
// -----------------------------------------------------
// Apply any annotations:
// -----------------------------------------------------
if (ant != null) {
try {
ant.applyAnnotations(url.toURI(), solr.getSolrDocument());
} catch (URISyntaxException e) {
e.printStackTrace();
log.error("Failed to annotate " + url + " : " + e);
}
}
// -----------------------------------------------------
// Payload duplication has been checked, ready to parse:
// -----------------------------------------------------
final long analyzeStart = System.nanoTime();
// Mark the start of the payload.
tikainput.mark( ( int ) content_length );
// Pass on to other extractors as required, resetting the stream before each:
this.wpa.analyse(header, tikainput, solr);
Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart);
// Clear up the caching of the payload:
hcis.cleanup();
// Derive normalised/simplified content type:
processContentType(solr, header, content_length);
// -----------------------------------------------------
// Payload analysis complete, now performing text analysis:
// -----------------------------------------------------
this.txa.analyse(solr);
// Remove the Text Field if required
if( !isTextIncluded ) {
solr.removeField( SolrFields.SOLR_EXTRACTED_TEXT );
} else {
// Otherwise, decide whether to index only, or to both store and
// index the text:
if (!storeText) {
// Copy the text into the indexed (but not stored) field:
solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED,
(String) solr.getField(
SolrFields.SOLR_EXTRACTED_TEXT)
.getFirstValue());
// Take the text out of the original (stored) field.
solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
}
}
}
Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation",
"WARCIndexer.extract#total", start);
return solr;
}
/**
* @param date the date to extract the year from
* @return the four-digit year as a String
*/
private synchronized String getYearFromDate(Date date) {
calendar.setTime(date);
return Integer.toString(calendar.get(Calendar.YEAR));
}
private final Calendar calendar = Calendar.getInstance();
/* ----------------------------------- */
private void processHeaders( SolrRecord solr, String statusCode, Header[] httpHeaders ) {
try {
// This is a simple test that the status code setting worked:
int statusCodeInt = Integer.parseInt( statusCode );
if( statusCodeInt < 0 || statusCodeInt > 1000 )
throw new Exception( "Status code out of range: " + statusCodeInt );
// Get the other headers:
for( Header h : httpHeaders ) {
// Get the type from the server
if (h.getName().equalsIgnoreCase(HttpHeaders.CONTENT_TYPE)
&& solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) {
String servedType = h.getValue();
if (servedType.length() > 200)
servedType = servedType.substring(0, 200);
solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType);
}
// Also, grab the X-Powered-By or Server headers if present:
if (h.getName().equalsIgnoreCase("X-Powered-By"))
solr.addField( SolrFields.SERVER, h.getValue() );
if (h.getName().equalsIgnoreCase(HttpHeaders.SERVER))
solr.addField( SolrFields.SERVER, h.getValue() );
}
} catch( NumberFormatException e ) {
log.error( "Exception when parsing status code: " + statusCode + ": " + e );
solr.addParseException("when parsing statusCode", e);
} catch( Exception e ) {
log.error( "Exception when parsing headers: " + e );
solr.addParseException("when parsing headers", e);
}
}
/**
* Extracts the resource (file) name from the path component of a URL,
* stripping any query string, e.g. "http://example.org/docs/page.html?x=1"
* yields "page.html".
*
* @param fullUrl the full URL to parse
* @return the resource name, or null if the URL contains no "/"
*/
protected static String parseResourceName(String fullUrl) {
if( fullUrl.lastIndexOf( "/" ) != -1 ) {
String path = fullUrl.substring(fullUrl.lastIndexOf("/") + 1);
if( path.indexOf( "?" ) != -1 ) {
path = path.substring( 0, path.indexOf( "?" ) );
}
if( path.indexOf( "&" ) != -1 ) {
path = path.substring( 0, path.indexOf( "&" ) );
}
return path;
}
return null;
}
/**
* Extracts a lower-cased, alphanumeric-only file extension from a resource
* name, e.g. "page.html" yields "html".
*
* @param path the resource name to parse
* @return the extension, or null if there is none
*/
protected static String parseExtension(String path) {
if (path != null && path.indexOf(".") != -1) {
String ext = path.substring(path.lastIndexOf("."));
ext = ext.toLowerCase();
// Avoid odd/malformed extensions:
// if( ext.contains("%") )
// ext = ext.substring(0, path.indexOf("%"));
ext = ext.replaceAll("[^0-9a-z]", "");
return ext;
}
return null;
}
/**
* Timestamp formatting for the crawl date, as UTC ISO-8601
* ("yyyy-MM-dd'T'HH:mm:ss'Z'"). Note that SimpleDateFormat is not
* thread-safe, so concurrent use of this shared instance should be
* synchronised externally.
*/
public static SimpleDateFormat formatter = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss'Z'");
static {
formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
}
/**
* Returns a Java Date object representing the crawled date, parsed from a
* 12-, 14-, 16- or 17+-digit Wayback-style timestamp, e.g. "20140402120000".
* Falls back to the current date if the timestamp cannot be parsed.
*
* @param timestamp the Wayback-style timestamp
* @return the parsed Date, or the current date on failure
*/
public static Date getWaybackDate( String timestamp ) {
Date date = new Date();
try {
if( timestamp.length() == 12 ) {
date = ArchiveUtils.parse12DigitDate( timestamp );
} else if( timestamp.length() == 14 ) {
date = ArchiveUtils.parse14DigitDate( timestamp );
} else if( timestamp.length() == 16 ) {
date = ArchiveUtils.parse17DigitDate( timestamp + "0" );
} else if( timestamp.length() >= 17 ) {
date = ArchiveUtils.parse17DigitDate( timestamp.substring( 0, 17 ) );
}
} catch( ParseException p ) {
p.printStackTrace();
}
return date;
}
/**
* Returns a formatted String representing the crawled date, e.g.
* "20140402120000" becomes "2014-04-02T12:00:00Z".
*
* @param waybackDate the Wayback-style timestamp
* @return the crawl date as a UTC ISO-8601 string
*/
protected static String parseCrawlDate( String waybackDate ) {
DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis()
.withZone(DateTimeZone.UTC);
return iso_df.print(new org.joda.time.DateTime(
getWaybackDate(waybackDate)));
}
/**
* Extracts the four-digit year from a timestamp, returning "unknown" for
* missing or invalid values.
*
* @param timestamp the timestamp to extract the year from
* @return the year as a String, or "unknown"
*/
public static String extractYear( String timestamp ) {
// Default to 'unknown':
String waybackYear = "unknown";
String waybackDate = timestamp.replaceAll( "[^0-9]", "" );
// replaceAll never returns null, so guard on length instead to avoid a
// StringIndexOutOfBoundsException on short values:
if( waybackDate.length() >= 4 )
waybackYear = waybackDate.substring( 0, 4 );
// Reject bad values by resetting to 'unknown':
if( "0000".equals( waybackYear ) )
waybackYear = "unknown";
// Return
return waybackYear;
}
/**
*
* @param solr
* @param header
* @param content_length
*/
private void processContentType(SolrRecord solr,
ArchiveRecordHeader header, long content_length) {
// Get the current content-type:
String contentType = ( String ) solr.getFieldValue( SolrFields.SOLR_CONTENT_TYPE );
// Store the raw content type from Tika:
solr.setField( SolrFields.CONTENT_TYPE_TIKA, contentType );
// Also get the other content types:
MediaType mt_tika = MediaType.parse( contentType );
if( solr.getField( SolrFields.CONTENT_TYPE_DROID ) != null ) {
MediaType mt_droid = MediaType.parse( ( String ) solr.getField( SolrFields.CONTENT_TYPE_DROID ).getFirstValue() );
if( mt_tika == null || mt_tika.equals( MediaType.OCTET_STREAM ) ) {
contentType = mt_droid.toString();
} else if( mt_droid.getBaseType().equals( mt_tika.getBaseType() ) && mt_droid.getParameters().get( "version" ) != null ) {
// Union of results:
mt_tika = new MediaType( mt_tika, mt_droid.getParameters() );
contentType = mt_tika.toString();
}
if( mt_droid.getParameters().get( "version" ) != null ) {
solr.addField( SolrFields.CONTENT_VERSION, mt_droid.getParameters().get( "version" ) );
}
}
// Fall back to the record header MIME type if nothing was identified above:
if( contentType == null || contentType.isEmpty() ) {
if( header.getHeaderFieldKeys().contains( "WARC-Identified-Payload-Type" ) ) {
contentType = ( ( String ) header.getHeaderFields().get( "WARC-Identified-Payload-Type" ) );
} else {
contentType = header.getMimetype();
}
}
// Determine content type:
if( contentType != null )
solr.setField( SolrFields.FULL_CONTENT_TYPE, contentType );
// If zero-length, then change to application/x-empty for the 'content_type' field.
if (content_length == 0)
contentType = "application/x-empty";
// Content-Type can still be null
if( contentType != null ) {
// Strip parameters out of main type field:
solr.setField( SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll( ";.*$", "" ) );
// Also add a more general, simplified type, as appropriate:
if( contentType.matches( "^image/.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image" );
} else if( contentType.matches( "^audio/.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio" );
} else if( contentType.matches( "^video/.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video" );
} else if( contentType.matches( "^text/htm.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html" );
} else if( contentType.matches( "^application/pdf.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf" );
} else if( contentType.matches( "^.*word$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word" );
} else if( contentType.matches( "^.*excel$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "excel" );
} else if( contentType.matches( "^.*powerpoint$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "powerpoint" );
} else if( contentType.matches( "^text/plain.*$" ) ) {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "text" );
} else {
solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "other" );
}
// Remove text from JavaScript, CSS, ...
if( contentType.startsWith( "application/javascript" ) || contentType.startsWith( "text/javascript" ) || contentType.startsWith( "text/css" ) ) {
solr.removeField( SolrFields.SOLR_EXTRACTED_TEXT );
}
}
}
private boolean checkUrl( String url ) {
for( String exclude : url_excludes ) {
if (!"".equalsIgnoreCase(exclude)
&& url.matches(".*" + exclude + ".*")) {
return false;
}
}
return true;
}
private boolean checkProtocol( String url ) {
for( String include : protocol_includes ) {
if ("".equalsIgnoreCase(include) || url.startsWith(include)) {
return true;
}
}
return false;
}
private boolean checkResponseCode( String statusCode ) {
if( statusCode == null )
return false;
// Check for match:
for( String include : response_includes ) {
if ("".equalsIgnoreCase(include) || statusCode.startsWith(include)) {
return true;
}
}
// Exclude
return false;
}
private boolean checkRecordType( String type ) {
if (record_type_includes.contains(type)) {
return true;
}
log.debug("Skipping record of type " + type);
return false;
}
private boolean checkExclusionFilter( String uri ) {
// Default to no exclusions:
if( smef == null )
return true;
// Otherwise:
ExclusionFilter ef = smef.get();
CaptureSearchResult r = new CaptureSearchResult();
// r.setOriginalUrl(uri);
r.setUrlKey( uri );
try {
if( ef.filterObject( r ) == ExclusionFilter.FILTER_INCLUDE ) {
return true;
}
} catch( Exception e ) {
log.error( "Exclusion filtering failed with exception: " + e );
e.printStackTrace();
}
log.debug( "EXCLUDING this URL due to filter: " + uri );
// Exclude:
return false;
}
}