/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Common methods for utilities that extract text and content and divide it
 * into chunks.
 */
interface TextExtractor {

    /**
     * Common options that can be used by some extractors.
     */
    enum ExtractOptions {

        /**
         * Extract UTF16 text. Possible values are Boolean.TRUE.toString() and
         * Boolean.FALSE.toString().
         */
        EXTRACT_UTF16,
        /**
         * Extract UTF8 text. Possible values are Boolean.TRUE.toString() and
         * Boolean.FALSE.toString().
         */
        EXTRACT_UTF8
    }

    /**
     * MIME types of archives and of unstructured binary or compressed data.
     * Text extractors should generally ignore these and let the unpacking
     * modules take care of them, because string extraction or an unzipper
     * works better for such content.
     *
     * The list is wrapped as unmodifiable so that no caller can alter this
     * shared constant; attempts to modify it throw
     * UnsupportedOperationException.
     */
    static final List<String> ARCHIVE_MIME_TYPES = Collections.unmodifiableList(Arrays.asList(
            "application/x-7z-compressed", //NON-NLS
            "application/x-ace-compressed", //NON-NLS
            "application/x-alz-compressed", //NON-NLS
            "application/x-arj", //NON-NLS
            "application/vnd.ms-cab-compressed", //NON-NLS
            "application/x-cfs-compressed", //NON-NLS
            "application/x-dgc-compressed", //NON-NLS
            "application/x-apple-diskimage", //NON-NLS
            "application/x-gca-compressed", //NON-NLS
            "application/x-dar", //NON-NLS
            "application/x-lzx", //NON-NLS
            "application/x-lzh", //NON-NLS
            "application/x-rar-compressed", //NON-NLS
            "application/x-stuffit", //NON-NLS
            "application/x-stuffitx", //NON-NLS
            "application/x-gtar", //NON-NLS
            "application/x-archive", //NON-NLS
            "application/x-executable", //NON-NLS
            "application/x-gzip", //NON-NLS
            "application/zip", //NON-NLS
            "application/x-zoo", //NON-NLS
            "application/x-cpio", //NON-NLS
            "application/x-shar", //NON-NLS
            "application/x-tar", //NON-NLS
            "application/x-bzip", //NON-NLS
            "application/x-bzip2", //NON-NLS
            "application/x-lzip", //NON-NLS
            "application/x-lzma", //NON-NLS
            "application/x-lzop", //NON-NLS
            "application/x-z", //NON-NLS
            "application/x-compress")); //NON-NLS

    /**
     * Get the number of chunks that resulted from extracting this
     * AbstractFile.
     *
     * @return the number of chunks produced
     */
    int getNumChunks();

    /**
     * Get the source file associated with this extraction.
     *
     * @return the source AbstractFile
     */
    AbstractFile getSourceFile();

    /**
     * Index the AbstractFile.
     *
     * @param sourceFile file to index
     * @param context    the ingest job context supplied for this indexing
     *                   call
     *
     * @return true if indexed successfully, false otherwise
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     */
    boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException;

    /**
     * Sets the scripts to use for the extraction.
     *
     * @param extractScript scripts to use
     *
     * @return true if the extractor supports script-specific extraction,
     *         false otherwise
     */
    boolean setScripts(List<SCRIPT> extractScript);

    /**
     * Get the scripts currently used for extraction.
     *
     * @return scripts currently used, or null if not supported
     */
    List<SCRIPT> getScripts();

    /**
     * Get the current options.
     *
     * @return currently used, extractor specific options, or null if not
     *         supported
     */
    Map<String, String> getOptions();

    /**
     * Set extractor specific options.
     *
     * @param options options to use
     */
    void setOptions(Map<String, String> options);

    /**
     * Determines whether the extractor works only for specific content types
     * (those accepted by {@link #isSupported(AbstractFile, String)}) or
     * whether it is a generic content extractor (such as a string extractor).
     *
     * @return true if the extractor is content type specific, false if it is
     *         a generic content extractor
     */
    boolean isContentTypeSpecific();

    /**
     * Determines if the file content is supported by the extractor if
     * isContentTypeSpecific() returns true.
     *
     * @param file           file to test if its content should be supported
     * @param detectedFormat mime-type with detected format (such as
     *                       text/plain) or null if not detected
     *
     * @return true if the file content is supported, false otherwise
     */
    boolean isSupported(AbstractFile file, String detectedFormat);
}