package uk.bl.documents;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import models.Alert;
import models.Document;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import play.Logger;
import play.libs.F.Function0;
import com.avaje.ebean.Ebean;
import controllers.WaybackController;
import eu.scape_project.bitwiser.utils.FuzzyHash;
import eu.scape_project.bitwiser.utils.SSDeep;
/**
*
* Cleaned up and extended document analyser.
*
* Relies on content being present in Wayback for analysis.
*
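* Typical usage, as a minimal sketch (assumes the document's documentUrl and
* waybackTimestamp fields are already populated):
*
* <pre>
* DocumentAnalyser analyser = new DocumentAnalyser();
* analyser.extractMetadata(document);
* </pre>
*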
* @author andy
*
*/
public class DocumentAnalyser {
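/** Fallback title, used when neither Tika nor the filename yields one. */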
public static final String DEFAULT_TITLE = "[untitled]";
public DocumentAnalyser() {
}
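/**
* Computes a SHA-256 hash over the archived binary, then runs Tika to
* extract text and metadata (title, creation date, first author), and
* finally derives an ssdeep fuzzy hash from the extracted text.
*
* @throws Exception if the SHA-256 hash cannot be generated, since the
* document cannot be submitted without one.
*/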
public void extractMetadata(Document document) throws Exception {
// Get the binary hash, closing the Wayback stream once it has been consumed:
try (InputStream in = getWaybackInputStream(document.documentUrl, document.waybackTimestamp)) {
byte[] digest = DigestUtils.sha256(in);
document.sha256Hash = String.format("%064x", new java.math.BigInteger(1, digest));
Logger.info("Recorded sha256Hash "+document.sha256Hash+" for "+document.documentUrl);
} catch (Exception e) {
Logger.error("Failure while SHA256 hashing "+document.documentUrl, e);
// This is a critical error - we can't submit without a hash.
throw new Exception("Could not generate SHA256 hash for "+document.documentUrl, e);
}
// Extended metadata and text:
String text = null;
Logger.info("Running document parser process...");
try {
// Use Tika on it:
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
// NB the no-arg BodyContentHandler caps extracted text at 100,000 characters:
BodyContentHandler handler = new BodyContentHandler();
try (InputStream in = getWaybackInputStream(document.documentUrl, document.waybackTimestamp)) {
parser.parse(in, handler, metadata);
} catch (Exception e) {
Logger.error("Exception while running Tika on "+document.documentUrl, e);
}
// Pull in the text:
text = handler.toString();
// Use anything we find:
if( (StringUtils.isBlank(document.title) || DEFAULT_TITLE.equals(document.title)) &&
StringUtils.isNotBlank(metadata.get(TikaCoreProperties.TITLE)) ) {
document.title = metadata.get(TikaCoreProperties.TITLE);
// Strip out any NULL characters:
document.title = document.title.replaceAll("\0+", "");
}
// Otherwise, use the filename:
if( StringUtils.isBlank(document.title)) {
document.title = document.filename;
}
// If that didn't work, set to [untitled] (rather than doing so in the upstream extractor)
if( StringUtils.isBlank(document.title)) {
document.title = DEFAULT_TITLE;
}
// getDate() returns null if the value is absent or cannot be parsed as a date:
Date created = metadata.getDate(TikaCoreProperties.CREATED);
if( created != null ) {
if( document.publicationDate == null ) {
document.publicationDate = created;
}
if( document.publicationYear == null ) {
SimpleDateFormat df = new SimpleDateFormat("yyyy");
document.publicationYear = Integer.parseInt(df.format(created));
}
}
if( StringUtils.isBlank(document.author1Fn) &&
StringUtils.isNotBlank(metadata.get(DublinCore.CREATOR)) ) {
String[] authsplit = metadata.get(DublinCore.CREATOR).trim().split("\\s+", 2);
document.author1Fn = authsplit[0];
if( authsplit.length > 1 ) {
document.author1Ln = authsplit[1];
}
}
// Output all for debugging:
for( String k : metadata.names()) {
Logger.debug("Found "+k+" -> "+metadata.get(k));
}
} catch (Exception e) {
Logger.error("Failure while parsing "+document.documentUrl, e);
}
// Use the text from Tika to make a fuzzy hash:
Logger.info("Attempting ssdeep hash generation...");
if( StringUtils.isNotBlank(text)) {
SSDeep ssd = new SSDeep();
FuzzyHash fh = ssd.fuzzyHashBuf(text.getBytes(StandardCharsets.UTF_8));
document.ctpHash = fh.toString();
Logger.info("Recorded ctpHash "+document.ctpHash+" for "+document.documentUrl);
} else {
Logger.warn("No text extracted, so no ctpHash recorded for "+document.documentUrl);
}
}
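/**
* Opens a stream on the Wayback replay of the given URL at the given
* timestamp. The caller is responsible for closing the stream.
*/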
private InputStream getWaybackInputStream(String url, String timestamp) throws IOException {
String wbu = waybackReplayUrl(url, timestamp);
URL wburl = new URL(wbu);
HttpURLConnection conn = (HttpURLConnection) wburl.openConnection();
// Do NOT follow redirects, as we want precisely the right timestamp.
// Use the per-instance setter here: the static setFollowRedirects()
// would change the JVM-wide default rather than just this connection.
conn.setInstanceFollowRedirects(false);
// Get the input stream:
return conn.getInputStream();
}
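/**
* Builds the Wayback replay URL for the given original URL and timestamp,
* with the original URL encoded as a query parameter.
*/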
private String waybackReplayUrl(String url, String timestamp) throws UnsupportedEncodingException {
return WaybackController.getWaybackEndpoint() + "replay?url=" + URLEncoder.encode(url, "UTF-8") + "&date=" + timestamp;
}
/**
* Compares the ssdeep hash of each supplied document against those of the
* other documents under the same WatchedTarget, raising an Alert for the
* target's author whenever two documents look like probable duplicates.
*
* @author andy
*
*/
public static class SimilarityFunction implements Function0<Boolean> {
public List<Document> documents;
public SimilarityFunction(List<Document> documents) {
this.documents = documents;
}
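/**
* Runs the all-pairs comparison; a similarity of 90% or more raises an
* Alert for the owning target's author.
*/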
@Override
public Boolean apply() {
Logger.info("Checking similarity against all documents for each WatchedTarget...");
for (Document doc1 : documents) {
for (Document doc2 : doc1.watchedTarget.documents ) {
// Don't compare one with itself:
if( ! doc1.documentUrl.equals(doc2.documentUrl) ) {
// Documents that yielded no text have no ctpHash, so skip them:
if( doc1.ctpHash == null || doc2.ctpHash == null ) continue;
double similarity = FuzzyHash.compare(doc1.ctpHash, doc2.ctpHash);
if( similarity >= 90 ) {
Alert alert = new Alert();
alert.user = doc1.watchedTarget.target.authorUser;
alert.text = "possible duplicate found: " + Alert.link(doc1) + " matches " +
Alert.link(doc2) + " with " + similarity + "% " +
"(" + Alert.compareLink(doc1, doc2) + ")";
Ebean.save(alert);
}
}
}
}
return true;
}
}
}