TextExtractor.java example

Explorer
autopsy-master
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content from a file and
 * divide it into chunks.
 */
interface TextExtractor {

    /**
     * Common options that can be used by some extractors. Option values are
     * passed as strings; see {@link #setOptions(Map)}.
     */
    enum ExtractOptions {

        /**
         * Extract UTF16 text. Possible values: Boolean.TRUE.toString(),
         * Boolean.FALSE.toString()
         */
        EXTRACT_UTF16,
        /**
         * Extract UTF8 text. Possible values: Boolean.TRUE.toString(),
         * Boolean.FALSE.toString()
         */
        EXTRACT_UTF8
    }

    /**
     * MIME types of archive and compressed formats. Generally text extractors
     * should ignore archives and let unpacking modules take care of them.
     *
     * The list is wrapped as unmodifiable because Arrays.asList() alone returns
     * a write-through list whose elements could be replaced via set(), which
     * would silently change this shared constant for every user.
     */
    List<String> ARCHIVE_MIME_TYPES
            = Collections.unmodifiableList(Arrays.asList(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress")); //NON-NLS

    /**
     * Get the number of chunks that resulted from extracting this AbstractFile
     *
     * @return the number of chunks produced
     */
    int getNumChunks();

    /**
     * Get the source file associated with this extraction
     *
     * @return the source AbstractFile
     */
    AbstractFile getSourceFile();

    /**
     * Index the AbstractFile
     *
     * @param sourceFile file to index
     * @param context    ingest job context supplied by the caller for this
     *                   indexing operation
     *
     * @return true if indexed successfully, false otherwise
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     *         if the implementation reports an ingester error
     */
    boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException;

    /**
     * Sets the scripts to use for the extraction
     *
     * @param extractScript scripts to use
     *
     * @return true if the extractor supports script-specific extraction, false
     *         otherwise
     */
    boolean setScripts(List<SCRIPT> extractScript);

    /**
     * Get the currently used scripts for extraction
     *
     * @return scripts currently used, or null if not supported
     */
    List<SCRIPT> getScripts();

    /**
     * Get current options
     *
     * @return currently used, extractor specific options, or null if not
     *         supported
     */
    Map<String, String> getOptions();

    /**
     * Set extractor specific options
     *
     * @param options options to use
     */
    void setOptions(Map<String, String> options);

    /**
     * Determines whether the extractor works only for specific content types
     * (those accepted by isSupported()) or whether it is a generic content
     * extractor (such as a string extractor)
     *
     * @return true if the extractor is content type specific, false if it is
     *         a generic content extractor
     */
    boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           to test if its content should be supported
     * @param detectedFormat mime-type with detected format (such as text/plain)
     *                       or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    boolean isSupported(AbstractFile file, String detectedFormat);
}