package org.opensextant.xtext.converters;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.io.TikaInputStream;
import org.opensextant.xtext.Content;
import org.opensextant.xtext.ConvertedDocument;
public class WebArchiveConverter extends MessageConverter {

    /** Separator line written after the text of each converted child item. */
    private static final String ITEM_SEPARATOR = "\n==================\n";

    /** Separator placed between the parent document text and the collected child text. */
    private static final String SECTION_SEPARATOR = "\n\n==================\n\n";

    /** Maximum number of leading characters that {@link #isWebScript(String)} inspects. */
    private static final int SCRIPT_SNIFF_LENGTH = 4000;

    /**
     * Convert MHT or .webarchive file to pure text.
     * Alternatively, export "archive" exploded on disk and then convert all children items.
     * See MessageConverter base and ArchiveNavigator solutions for that.
     *
     * <p>The base-class conversion runs first. If it reports raw children, each child is
     * handled according to its MIME type:
     * <ul>
     * <li>{@code application/octet-stream} (case-insensitive): converted with a
     * {@code DefaultConverter};</li>
     * <li>{@code text/html...}: converted with a {@code TikaHTMLConverter} (no scrubbing);</li>
     * <li>{@code image...}: represented by a one-line {@code [Image: ...]} placeholder;</li>
     * <li>children with no MIME type, or any other type, are skipped.</li>
     * </ul>
     * The collected child text is appended to the document text (or becomes the document text
     * if the base conversion produced none).
     *
     * @param in stream
     * @param doc original file
     * @return the converted document, flagged as a web archive
     * @throws IOException if the base conversion, a child conversion, or closing a child
     *         stream fails
     */
    @Override
    protected ConvertedDocument conversionImplementation(InputStream in, File doc)
            throws IOException {
        TikaHTMLConverter htmlParser = new TikaHTMLConverter(false /* no scrub */);
        DefaultConverter objectParser = new DefaultConverter();

        ConvertedDocument d = super.conversionImplementation(in, doc);
        d.is_webArchive = true;
        if (!d.hasRawChildren()) {
            return d;
        }

        StringBuilder buf = new StringBuilder();
        for (Content binary : d.getRawChildren()) {
            logger.info("{} {} {}", d.id, binary.id, binary.mimeType);
            if (binary.mimeType == null) {
                continue;
            }

            if ("application/octet-stream".equalsIgnoreCase(binary.mimeType)) {
                // try-with-resources guarantees the child stream is released even when the
                // converter throws; a second close() by the converter would be harmless.
                try (TikaInputStream child = TikaInputStream.get(binary.content)) {
                    appendReadableText(buf, objectParser.convert(child));
                }
            } else if (binary.mimeType.startsWith("text/html")) {
                try (TikaInputStream child = TikaInputStream.get(binary.content)) {
                    appendReadableText(buf, htmlParser.convert(child));
                }
            } else if (binary.mimeType.startsWith("image")) {
                buf.append(String.format("\n[Image: %s type='%s'] ", binary.id, binary.mimeType));
            }
        }

        if (d.hasText()) {
            d.setText(d.getText() + SECTION_SEPARATOR + buf.toString());
        } else {
            d.setText(buf.toString());
        }
        return d;
    }

    /**
     * Append the text of a converted child, followed by an item separator, unless the child
     * is missing, has no text, or looks like a web script.
     *
     * <p>The script check filters out comments, JavaScript and similar noise that shows up
     * as child items (often typed as octet-stream) in these archives.
     *
     * @param buf accumulator for child text
     * @param converted result of converting one child; may be null
     */
    private static void appendReadableText(StringBuilder buf, ConvertedDocument converted) {
        if (converted == null || !converted.hasText()) {
            return;
        }
        String text = converted.getText();
        if (isWebScript(text)) {
            return;
        }
        buf.append(text);
        buf.append(ITEM_SEPARATOR);
    }

    /**
     * JavaScript or any script detection.
     *
     * <p>Only the first {@value #SCRIPT_SNIFF_LENGTH} characters are inspected,
     * case-insensitively. Blank or null input also yields {@code true}: it is not really a
     * script, but the result lets callers skip it the same way.
     *
     * @param data data
     * @return if data is a script (or is blank)
     */
    public static boolean isWebScript(final String data) {
        if (StringUtils.isBlank(data)) {
            return true; /* not really */
        }

        // substring's end index is exclusive, so this keeps exactly the first `sub` characters.
        int sub = Math.min(SCRIPT_SNIFF_LENGTH, data.length());
        // Locale.ROOT keeps the lowercase mapping stable (e.g. 'I' -> 'i') in every default locale.
        String test = data.substring(0, sub).toLowerCase(Locale.ROOT).trim();

        // Typically the term 'script' does not actually appear in these octet-streams.
        if (test.contains("javascript") || test.contains("document.write(") || test.contains("xmlhttp")) {
            return true;
        }

        // Less obvious clues. Must contain all:
        return test.contains("function") && test.contains("{") && test.contains("var ")
                && test.contains("=");
    }
}