/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.modules.filetypeid;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.Tika;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.services.Blackboard;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
/**
 * Detects the MIME type of a file by an inspection of its contents, using
 * custom file type definitions by users, custom file type definitions by
 * Autopsy, and Tika.
 *
 * Each instance reuses a single read buffer without any synchronization, so
 * a given instance should not be used for detection by more than one thread
 * at a time.
 */
public class FileTypeDetector {

    private static final Logger logger = Logger.getLogger(FileTypeDetector.class.getName());
    private static final Tika tika = new Tika();
    private static final int BUFFER_SIZE = 64 * 1024;
    /*
     * Scratch buffer for the leading bytes of the file handed to Tika. It is
     * overwritten on every Tika-based detection.
     */
    private final byte[] buffer = new byte[BUFFER_SIZE];
    private final List<FileType> userDefinedFileTypes;
    private final List<FileType> autopsyDefinedFileTypes;
    /*
     * Lazily built by getStandardDetectedTypes, guarded by the class lock.
     */
    private static SortedSet<String> detectedTypes; //no optional parameters

    /**
     * Constructs an object that detects the MIME type of a file by an
     * inspection of its contents, using custom file type definitions by users,
     * custom file type definitions by Autopsy, and Tika.
     *
     * @throws FileTypeDetectorInitException if an initialization error occurs,
     *                                       e.g., user-defined file type
     *                                       definitions exist but cannot be
     *                                       loaded.
     */
    public FileTypeDetector() throws FileTypeDetectorInitException {
        try {
            userDefinedFileTypes = CustomFileTypesManager.getInstance().getUserDefinedFileTypes();
            autopsyDefinedFileTypes = CustomFileTypesManager.getInstance().getAutopsyDefinedFileTypes();
        } catch (CustomFileTypesManager.CustomFileTypesException ex) {
            throw new FileTypeDetectorInitException("Error loading custom file types", ex); //NON-NLS
        }
    }

    /**
     * Gets the MIME type names of the custom file types defined by the user
     * or by Autopsy.
     *
     * @return A new list holding the MIME types of the user-defined file
     *         types followed by the MIME types of the Autopsy-defined file
     *         types.
     */
    public List<String> getUserDefinedTypes() {
        List<String> customFileTypes = new ArrayList<>(userDefinedFileTypes.size() + autopsyDefinedFileTypes.size());
        for (FileType fileType : userDefinedFileTypes) {
            customFileTypes.add(fileType.getMimeType());
        }
        for (FileType fileType : autopsyDefinedFileTypes) {
            customFileTypes.add(fileType.getMimeType());
        }
        return customFileTypes;
    }

    /**
     * Determines whether or not a given MIME type is detectable by this
     * detector, i.e., whether it is a user-defined type, an Autopsy-defined
     * type, or one of the standard types known to Tika.
     *
     * @param mimeType The MIME type name (e.g., "text/html"). May be null.
     *
     * @return True if the type is detectable, false otherwise. A null MIME
     *         type is never detectable.
     */
    public boolean isDetectable(String mimeType) {
        /*
         * Without this guard a null argument would fall through to the Tika
         * check and fail with a NullPointerException.
         */
        if (null == mimeType) {
            return false;
        }
        return isDetectableAsCustomType(userDefinedFileTypes, mimeType)
                || isDetectableAsCustomType(autopsyDefinedFileTypes, mimeType)
                || isDetectableByTika(mimeType);
    }

    /**
     * Returns an unmodifiable sorted set of standard MIME types that does not
     * contain types with optional parameters. The set has no duplicate types
     * and is in alphabetical order. The underlying set is built from the Tika
     * media type registry on the first call and reused afterwards.
     *
     * @return an unmodifiable view of a set of MIME types
     */
    public static synchronized SortedSet<String> getStandardDetectedTypes() {
        if (detectedTypes == null) {
            detectedTypes = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry().getTypes()
                    .stream()
                    .filter(mediaType -> !mediaType.hasParameters())
                    .map(mediaType -> mediaType.toString())
                    .collect(Collectors.toCollection(TreeSet::new));
        }
        return Collections.unmodifiableSortedSet(detectedTypes);
    }

    /**
     * Determines whether or not a given MIME type is the MIME type of one of
     * the given custom file types.
     *
     * @param customTypes The custom file type definitions to search.
     * @param mimeType    The MIME type name (e.g., "text/html").
     *
     * @return True if a definition with exactly this MIME type exists, false
     *         otherwise.
     */
    private boolean isDetectableAsCustomType(List<FileType> customTypes, String mimeType) {
        for (FileType fileType : customTypes) {
            if (fileType.getMimeType().equals(mimeType)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Determines whether or not a given MIME type is detectable by Tika. Any
     * optional parameter on the MIME type is ignored for the comparison.
     *
     * @param mimeType The MIME type name (e.g., "text/html"). Must not be
     *                 null.
     *
     * @return True or false.
     */
    private boolean isDetectableByTika(String mimeType) {
        return FileTypeDetector.getStandardDetectedTypes().contains(removeOptionalParameter(mimeType));
    }

    /**
     * Gets the MIME type of a file, detecting it if it is not already known. If
     * detection is necessary, the result is added to the case database.
     *
     * IMPORTANT: This method should only be called by ingest modules. All other
     * clients should call AbstractFile.getMIMEType, and may call
     * FileTypeDetector.detect, if AbstractFile.getMIMEType returns null.
     *
     * @param file The file.
     *
     * @return A MIME type name. If file type could not be detected or results
     *         were uncertain, octet-stream is returned.
     *
     * @throws TskCoreException if detection is required and there is a problem
     *                          writing the result to the case database.
     */
    public String getFileType(AbstractFile file) throws TskCoreException {
        return detect(file, true);
    }

    /**
     * Detects the MIME type of a file. The MIME type is not written to the
     * files table of the case database by this method.
     *
     * @param file The file to test.
     *
     * @return A MIME type name. If file type could not be detected or results
     *         were uncertain, octet-stream is returned.
     *
     * @throws TskCoreException If an error occurs while testing the file
     *                          against the custom file type definitions or
     *                          while creating the interesting file hit for a
     *                          matching user-defined type.
     */
    public String detect(AbstractFile file) throws TskCoreException {
        return detect(file, false);
    }

    /**
     * Detects the MIME type of a file. The result is saved to the case database
     * only if the add to case database flag is set.
     *
     * @param file        The file to test.
     * @param addToCaseDb Whether the MIME type should be added to the case
     *                    database. This flag is part of a partial workaround
     *                    for a check-then-act-race condition (see notes in
     *                    comments for details).
     *
     * @return A MIME type name. If file type could not be detected or results
     *         were uncertain, octet-stream is returned.
     *
     * @throws TskCoreException If there is a problem testing the file against
     *                          the custom file type definitions, posting an
     *                          interesting file hit, or writing the result to
     *                          the case database.
     */
    private String detect(AbstractFile file, boolean addToCaseDb) throws TskCoreException {
        /*
         * Check to see if the file has already been typed. This is the "check"
         * part of a check-then-act race condition (see note below).
         */
        String mimeType = file.getMIMEType();
        if (null != mimeType) {
            // We remove the optional parameter to allow this method to work
            // with legacy databases that may contain MIME types with the
            // optional parameter attached.
            return removeOptionalParameter(mimeType);
        }

        /*
         * Mark non-regular files (refer to TskData.TSK_FS_META_TYPE_ENUM),
         * zero-sized files, unallocated space, unused blocks, virtual
         * directories, and slack space (refer to
         * TskData.TSK_DB_FILES_TYPE_ENUM) as octet-stream.
         */
        if (!file.isFile() || file.getSize() <= 0
                || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)
                || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)
                || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.SLACK)) {
            mimeType = MimeTypes.OCTET_STREAM;
        }

        /*
         * If the file is a regular file, give precedence to user-defined custom
         * file types.
         */
        if (null == mimeType) {
            mimeType = detectUserDefinedType(file);
        }

        /*
         * If the file does not match a user-defined type, give precedence to
         * custom file types defined by Autopsy.
         */
        if (null == mimeType) {
            mimeType = detectAutopsyDefinedType(file);
        }

        /*
         * If the file does not match any custom type, send the initial bytes
         * to Tika.
         */
        if (null == mimeType) {
            mimeType = detectTikaType(file);
        }

        /*
         * If adding the result to the case database, do so now.
         *
         * NOTE: This conditional is a way to deal with the check-then-act race
         * condition created by the gap between querying the MIME type and
         * recording it. It is not really a problem for the mime_type column of
         * the tsk_files table, but it can lead to duplicate blackboard posts,
         * and the posts are required to maintain backward compatibility.
         * Various mitigation strategies were considered. It was decided to go
         * with the policy that only ingest modules are allowed to add file
         * types to the case database, at least until such time as file types
         * are no longer posted to the blackboard. Of course, this is not a
         * perfect solution. It's not really enforceable for community
         * contributed plug ins and it does not handle the unlikely but possible
         * scenario of multiple processes typing the same file for a multi-user
         * case.
         */
        if (addToCaseDb) {
            /*
             * Add the MIME type to the files table in the case database.
             */
            Case.getCurrentCase().getSleuthkitCase().setFileMIMEType(file, mimeType);
        }
        return mimeType;
    }

    /**
     * Asks Tika for the MIME type of a file, based on up to BUFFER_SIZE
     * leading bytes of its content and its name.
     *
     * @param file The file to test.
     *
     * @return The MIME type name reported by Tika, with every "tika-"
     *         substring and any optional parameter removed, or octet-stream if
     *         the file could not be read or Tika failed.
     */
    private String detectTikaType(AbstractFile file) {
        try {
            int len = file.read(buffer, 0, BUFFER_SIZE);
            if (len < 0) {
                /*
                 * Nothing could be read. Handle this explicitly instead of
                 * letting the array allocation below fail on a negative size.
                 */
                return MimeTypes.OCTET_STREAM;
            }

            /*
             * Hand Tika only the bytes that were actually read, so stale
             * content from a previous file in the shared buffer is never
             * inspected.
             */
            byte[] buf;
            if (len < BUFFER_SIZE) {
                buf = new byte[len];
                System.arraycopy(buffer, 0, buf, 0, len);
            } else {
                buf = buffer;
            }
            String tikaType = tika.detect(buf, file.getName());

            /*
             * Strip every "tika-" substring from the MIME type name, then
             * remove the optional parameter.
             */
            return removeOptionalParameter(tikaType.replace("tika-", "")); //NON-NLS
        } catch (Exception ignored) {
            /*
             * This exception is swallowed and not logged rather than
             * propagated because files in data sources are not always
             * consistent with their file system metadata, making for read
             * errors. Also, Tika can be a bit flaky at times, making this a
             * best effort endeavor. Default to octet-stream.
             */
            return MimeTypes.OCTET_STREAM;
        }
    }

    /**
     * Removes the optional parameter from a MIME type string, i.e., everything
     * from the first semicolon onwards. The remaining text is trimmed. A MIME
     * type without a semicolon is returned unchanged.
     *
     * @param mimeType The MIME type name, possibly with an optional parameter
     *                 (e.g., "text/html; charset=UTF-8"). Must not be null.
     *
     * @return MIME type without the optional parameter
     */
    private static String removeOptionalParameter(String mimeType) {
        int indexOfSemicolon = mimeType.indexOf(";");
        if (indexOfSemicolon != -1) {
            return mimeType.substring(0, indexOfSemicolon).trim();
        }
        return mimeType;
    }

    /**
     * Determines whether or not a file matches a user-defined custom file
     * type. The definitions are tried in list order and the first match wins.
     * If the matching definition asks for it, an interesting file hit is
     * posted for the file.
     *
     * @param file The file to test.
     *
     * @return The file type name string or null, if no match is detected.
     *
     * @throws TskCoreException If an error occurs while matching the file or
     *                          while creating the interesting file hit.
     */
    private String detectUserDefinedType(AbstractFile file) throws TskCoreException {
        for (FileType fileType : userDefinedFileTypes) {
            if (fileType.matches(file)) {
                if (fileType.createInterestingFileHit()) {
                    postInterestingFileHit(file, fileType);
                }
                return fileType.getMimeType();
            }
        }
        return null;
    }

    /**
     * Creates an interesting file hit artifact for a file that matched a
     * user-defined file type and submits the artifact for keyword search
     * indexing. An indexing failure is logged and reported to the user, but
     * does not abort detection.
     *
     * @param file     The file that matched.
     * @param fileType The user-defined file type the file matched.
     *
     * @throws TskCoreException If the artifact or its attributes cannot be
     *                          created.
     */
    private void postInterestingFileHit(AbstractFile file, FileType fileType) throws TskCoreException {
        BlackboardArtifact artifact = file.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_INTERESTING_FILE_HIT);
        BlackboardAttribute setNameAttribute = new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_SET_NAME, FileTypeIdModuleFactory.getModuleName(), fileType.getInterestingFilesSetName());
        artifact.addAttribute(setNameAttribute);

        /*
         * Use the MIME type as the category attribute, i.e., the rule that
         * determined this file belongs to the interesting files set.
         */
        BlackboardAttribute ruleNameAttribute = new BlackboardAttribute(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_CATEGORY, FileTypeIdModuleFactory.getModuleName(), fileType.getMimeType());
        artifact.addAttribute(ruleNameAttribute);

        /*
         * Index the artifact for keyword search.
         */
        try {
            Case.getCurrentCase().getServices().getBlackboard().indexArtifact(artifact);
        } catch (Blackboard.BlackboardException ex) {
            logger.log(Level.SEVERE, String.format("Unable to index blackboard artifact %d", artifact.getArtifactID()), ex); //NON-NLS
            MessageNotifyUtil.Notify.error(
                    NbBundle.getMessage(Blackboard.class, "Blackboard.unableToIndexArtifact.exception.msg"), artifact.getDisplayName());
        }
    }

    /**
     * Determines whether or not a file matches a custom file type defined by
     * Autopsy. The definitions are tried in list order and the first match
     * wins.
     *
     * @param file The file to test.
     *
     * @return The file type name string or null, if no match is detected.
     *
     * @throws TskCoreException If an error occurs while matching the file.
     */
    private String detectAutopsyDefinedType(AbstractFile file) throws TskCoreException {
        for (FileType fileType : autopsyDefinedFileTypes) {
            if (fileType.matches(file)) {
                return fileType.getMimeType();
            }
        }
        return null;
    }

    /**
     * Exception thrown when a file type detector experiences an error
     * condition.
     */
    public static class FileTypeDetectorInitException extends Exception {

        private static final long serialVersionUID = 1L;

        /**
         * Constructs an exception to throw when a file type detector
         * experiences an error condition.
         *
         * @param message The exception message.
         */
        FileTypeDetectorInitException(String message) {
            super(message);
        }

        /**
         * Constructs an exception to throw when a file type detector
         * experiences an error condition.
         *
         * @param message   The exception message.
         * @param throwable The underlying cause of the exception.
         */
        FileTypeDetectorInitException(String message, Throwable throwable) {
            super(message, throwable);
        }
    }

    /**
     * Gets the MIME type of a file, detecting it if it is not already known. If
     * detection is necessary, the result is added to the case database.
     *
     * @param file The file.
     *
     * @return A MIME type name.
     *
     * @throws TskCoreException if detection is required and there is a problem
     *                          writing the result to the case database.
     * @deprecated Use getFileType instead and use AbstractFile.getMIMEType
     * instead of querying the blackboard.
     */
    @Deprecated
    public String detectAndPostToBlackboard(AbstractFile file) throws TskCoreException {
        return getFileType(file);
    }
}