/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * Content type detection based on magic bytes, i.e. type-specific patterns
 * near the beginning of the document input stream.
 * <p>
 * A detector is configured with a byte pattern, an optional bit mask and
 * an inclusive range of stream offsets. Detection succeeds when, for some
 * offset in that range, the masked stream bytes starting at that offset
 * equal the (masked) pattern.
 *
 * @since Apache Tika 0.3
 */
public class MagicDetector implements Detector {

    private static final long serialVersionUID = 1001153472537110376L;

    /** Hex digits used when rendering byte arrays in {@link #toString()}. */
    private static final String HEX_DIGITS = "0123456789abcdef";

    /**
     * The matching media type. Returned by the
     * {@link #detect(InputStream, Metadata)} method if a match is found.
     */
    private final MediaType type;

    /**
     * Length of the comparison window. All the byte arrays here are this long.
     */
    private final int length;

    /**
     * The magic match pattern. If this byte pattern is equal to the
     * possibly bit-masked bytes from the input stream, then the type
     * detection succeeds and the configured {@link #type} is returned.
     * The stored pattern has already been combined with the {@link #mask}.
     */
    private final byte[] pattern;

    /**
     * Bit mask that is applied to the source bytes before pattern matching.
     */
    private final byte[] mask;

    /**
     * First offset (inclusive) of the comparison window within the
     * document input stream. Greater than or equal to zero.
     */
    private final int offsetRangeBegin;

    /**
     * Last offset (inclusive) of the comparison window within the document
     * input stream. Greater than or equal to the
     * {@link #offsetRangeBegin first offset}.
     * <p>
     * Note that this is <em>not</em> the offset of the last byte read from
     * the document stream. Instead, the last window of bytes to be compared
     * starts at this offset.
     */
    private final int offsetRangeEnd;

    /**
     * Precomputed string representation, see {@link #toString()}.
     */
    private final String asString;

    /**
     * Creates a detector for input documents that have the exact given byte
     * pattern at the beginning of the document stream.
     *
     * @param type matching media type
     * @param pattern magic match pattern
     */
    public MagicDetector(MediaType type, byte[] pattern) {
        this(type, pattern, 0);
    }

    /**
     * Creates a detector for input documents that have the exact given byte
     * pattern at the given offset of the document stream.
     *
     * @param type matching media type
     * @param pattern magic match pattern
     * @param offset offset of the pattern match
     */
    public MagicDetector(MediaType type, byte[] pattern, int offset) {
        this(type, pattern, null, offset, offset);
    }

    /**
     * Creates a detector for input documents that meet the specified
     * magic match.
     * <p>
     * The comparison window is as long as the longer of the pattern and
     * the mask. A missing or too short mask is padded with all-ones bytes
     * (no masking); a too short pattern is padded with zero bytes. Both
     * arrays are copied, so later changes to the arguments do not affect
     * this detector.
     *
     * @param type matching media type
     * @param pattern magic match pattern
     * @param mask bit mask applied to the stream bytes and the pattern,
     *             or <code>null</code> for no masking
     * @param offsetRangeBegin first offset (inclusive) at which the
     *                         pattern may start
     * @param offsetRangeEnd last offset (inclusive) at which the
     *                       pattern may start
     * @throws IllegalArgumentException if the type or the pattern is
     *         <code>null</code>, or if the offset range is negative or empty
     */
    public MagicDetector(
            MediaType type, byte[] pattern, byte[] mask,
            int offsetRangeBegin, int offsetRangeEnd) {
        if (type == null) {
            throw new IllegalArgumentException("Matching media type is null");
        } else if (pattern == null) {
            throw new IllegalArgumentException("Magic match pattern is null");
        } else if (offsetRangeBegin < 0
                || offsetRangeEnd < offsetRangeBegin) {
            throw new IllegalArgumentException(
                    "Invalid offset range: ["
                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
        }

        this.type = type;

        this.length = Math.max(pattern.length, mask != null ? mask.length : 0);

        this.mask = new byte[length];
        this.pattern = new byte[length];

        for (int i = 0; i < length; i++) {
            if (mask != null && i < mask.length) {
                this.mask[i] = mask[i];
            } else {
                // All bits set: the stream byte is compared unmodified
                this.mask[i] = -1;
            }

            if (i < pattern.length) {
                // Pre-mask the pattern so that detect() only has to mask
                // the bytes coming from the stream
                this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
            } else {
                this.pattern[i] = 0;
            }
        }

        this.offsetRangeBegin = offsetRangeBegin;
        this.offsetRangeEnd = offsetRangeEnd;

        // Build the string representation. Needs to be unique, as
        // these get compared. Compute now as may get compared a lot!
        // The arrays are rendered by content (hex) rather than by
        // concatenating them directly, which would only print their
        // identity hash codes and differ from one JVM run to the next.
        this.asString = "Magic Detection for " + type.toString() +
                " looking for " + pattern.length +
                " bytes = " + toHex(this.pattern) +
                " mask = " + toHex(this.mask) +
                " at offsets [" + offsetRangeBegin + "," + offsetRangeEnd + "]";
    }

    /**
     * Renders the given bytes as a lower-case hexadecimal string,
     * two digits per byte.
     */
    private static String toHex(byte[] bytes) {
        StringBuilder hex = new StringBuilder(2 * bytes.length);
        for (int i = 0; i < bytes.length; i++) {
            int value = bytes[i] & 0xff;
            hex.append(HEX_DIGITS.charAt(value >>> 4));
            hex.append(HEX_DIGITS.charAt(value & 0x0f));
        }
        return hex.toString();
    }

    /**
     * Checks whether the given stream contains the configured magic
     * pattern at one of the configured offsets.
     * <p>
     * The stream is marked before reading and reset afterwards, so it is
     * expected to support the mark feature.
     *
     * @param input document input stream, or <code>null</code>
     * @param metadata ignored
     * @return the configured media type if the pattern matches, or
     *         <code>application/octet-stream</code> if it does not match,
     *         the stream is too short, or the stream is <code>null</code>
     * @throws IOException if the stream can not be read or reset
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        input.mark(offsetRangeEnd + length);
        try {
            int offset = 0;

            // Skip bytes at the beginning, using skip() or read()
            while (offset < offsetRangeBegin) {
                long n = input.skip(offsetRangeBegin - offset);
                if (n > 0) {
                    offset += n;
                } else if (input.read() != -1) {
                    // skip() made no progress; fall back to a single read
                    offset += 1;
                } else {
                    return MediaType.OCTET_STREAM;
                }
            }

            // Fill in the comparison window
            byte[] buffer =
                new byte[length + (offsetRangeEnd - offsetRangeBegin)];
            int n = input.read(buffer);
            if (n > 0) {
                offset += n;
            }
            while (n != -1 && offset < offsetRangeEnd + length) {
                int bufferOffset = offset - offsetRangeBegin;
                n = input.read(
                        buffer, bufferOffset, buffer.length - bufferOffset);
                // A read may return fewer bytes than requested, so advance
                // the offset to append the next chunk instead of
                // overwriting this one
                if (n > 0) {
                    offset += n;
                }
            }

            if (offset < offsetRangeBegin + length) {
                // Not even one complete window could be read
                return MediaType.OCTET_STREAM;
            }

            // Only compare windows that are fully backed by bytes that were
            // actually read; the rest of the buffer is just zero padding
            int available = offset - offsetRangeBegin;
            int lastWindowStart = Math.min(
                    offsetRangeEnd - offsetRangeBegin, available - length);

            // Loop until we've covered the entire (available) offset range
            for (int i = 0; i <= lastWindowStart; i++) {
                boolean match = true;
                for (int j = 0; match && j < length; j++) {
                    match = (buffer[i + j] & mask[j]) == pattern[j];
                }
                if (match) {
                    return type;
                }
            }

            return MediaType.OCTET_STREAM;
        } finally {
            input.reset();
        }
    }

    /**
     * Returns a string representation of the Detection Rule.
     * Should sort nicely by type and details, as we sometimes
     * compare these.
     */
    @Override
    public String toString() {
        return asString;
    }

}