/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extracts text from AbstractFile content with supported HTML-family MIME
 * types. Extracted text is divided into chunks and indexed with Solr.
 */
class HtmlTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static Ingester ingester;
    static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //headroom to finish the current word at a chunk boundary
    private static final int MAX_SIZE = 50000000; //maximum file size, in bytes, accepted by isSupported()
    //private static final String UTF16BOM = "\uFEFF"; //disabled prepending of BOM
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private AbstractFile sourceFile;
    private int numChunks = 0;

    static final List<String> WEB_MIME_TYPES = Arrays.asList(
            "application/javascript", //NON-NLS
            "application/xhtml+xml", //NON-NLS
            "application/json", //NON-NLS
            "text/css", //NON-NLS
            "text/html", //NON-NLS
            "text/javascript" //NON-NLS
            //"application/xml",
            //"application/xml-dtd",
    );

    HtmlTextExtractor() {
        ingester = Server.getIngester();
    }

    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            // Parse the stream with Jericho
            JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
            jpw.parse();
            reader = jpw.getReader();

            // In case there is an exception or parse() isn't called
            if (reader == null) {
                logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
                return false;
            }

            success = true;
            long readSize;
            long totalRead = 0;
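            // Chunking strategy (descriptive summary of the loop below): fill
            // textChunkBuf with up to MAX_EXTR_TEXT_CHARS characters per chunk,
            // reading SINGLE_READ_CHARS at a time, then continue one character at
            // a time into the EXTRA_CHARS headroom until whitespace is seen, so
            // that words are not split across chunk boundaries.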
            boolean eof = false;
            //read at most SINGLE_READ_CHARS (1024) chars at a time; this seems to
            //be the most this Reader returns per call
            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                totalRead += readSize;

                //consume more chars to fill the entire chunk (leave EXTRA_CHARS to end the word)
                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //try to read up to the next whitespace so words are not broken across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());

                //encode to bytes to index as a byte stream
                //set initial size to chars read + BOM to try to prevent resizing
                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                //injecting a BOM here would save a byte buffer realloc later; it would be
                //converted to the specific encoding's BOM automatically
                //sb.append(UTF16BOM); //BOM disabled; not needed since we bypass Tika
                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                    sb.append(textChunkBuf, 0, (int) totalRead);
                } else {
                    sb.append(textChunkBuf);
                }

                //reset for the next chunk
                totalRead = 0;
                String extracted = sb.toString();

                //converts BOM automatically to the charset encoding
                byte[] encodedBytes = extracted.getBytes(outCharset);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": "
                    + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": "
                    + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file itself without content, and store numChunks
        ingester.ingest(this);

        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        //accept only known web MIME types under the size cap
        return detectedFormat != null
                && WEB_MIME_TYPES.contains(detectedFormat)
                && file.getSize() <= MAX_SIZE;
    }
}
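/*
 * Usage sketch (hypothetical, not part of this class): in Autopsy the keyword
 * search ingest module selects and drives extractors, so the `file`, `mimeType`,
 * and `context` values below are assumptions for illustration only.
 *
 *     AbstractFile file = ...;                  // file under ingest
 *     String mimeType = "text/html";            // from MIME type detection
 *     IngestJobContext context = ...;           // supplied by the ingest framework
 *     HtmlTextExtractor extractor = new HtmlTextExtractor();
 *     if (extractor.isSupported(file, mimeType)) {
 *         boolean indexed = extractor.index(file, context); // may throw IngesterException
 *         int chunks = extractor.getNumChunks();            // chunks sent to Solr
 *     }
 */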