package org.icij.extract.spewer;
import org.apache.commons.io.TaggedIOException;
import org.apache.tika.metadata.*;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Stream;
/**
 * Walks over a Tika {@link Metadata} object and feeds every non-empty field to a consumer.
 *
 * <p>Raw metadata names are mapped to output names through {@link FieldNames#forMetadata}; raw names that map to
 * the same output name have their values joined into one array. Fields ending up with a single value go to a
 * {@link ValueConsumer}, fields with several values go to a {@link ValueArrayConsumer}. For single-valued fields
 * whose raw name is a known date property, an additional field carrying the ISO 8601 form of the date is emitted.
 */
public class MetadataTransformer implements Serializable {

    private static final long serialVersionUID = -6643888792096975746L;

    /** Known date properties, keyed by property name. */
    private static final Map<String, Property> dateProperties = new HashMap<>();

    /** Lowercased raw names of the multivalued properties whose duplicate values are dropped. */
    @SuppressWarnings("deprecation")
    private static final List<String> deduplicateProperties = Arrays.asList(

            // Deduplicate content types (Tika seems to add these sometimes, especially for RTF files).
            Metadata.CONTENT_TYPE.toLowerCase(Locale.ENGLISH),

            // Deduplicate titles (appear in bad HTML files).
            TikaCoreProperties.TITLE.getName().toLowerCase(Locale.ENGLISH),
            Metadata.TITLE.toLowerCase(Locale.ENGLISH),

            // Deduplicate these properties contained in some MSHTML documents.
            "originator",
            "generator",
            "progid");

    static {
        //noinspection deprecation
        final Property[] knownDates = {
                DublinCore.DATE,
                DublinCore.CREATED,
                DublinCore.MODIFIED,
                Office.CREATION_DATE,
                Office.SAVE_DATE,
                Office.PRINT_DATE,
                MSOffice.CREATION_DATE,
                MSOffice.LAST_SAVED,
                MSOffice.LAST_PRINTED,
                PDF.DOC_INFO_CREATED,
                PDF.DOC_INFO_MODIFICATION_DATE,
                TIFF.ORIGINAL_DATE,
                Metadata.DATE,
                Property.externalDate(Metadata.MODIFIED),
                HttpHeaders.LAST_MODIFIED
        };

        // Registered in declaration order, so a later property replaces an earlier one with the same name.
        for (final Property dateProperty : knownDates) {
            dateProperties.put(dateProperty.getName(), dateProperty);
        }
    }

    private final Metadata metadata;
    private final FieldNames fields;

    /** Reverse lookup from an output field name to the first raw metadata name that produced it. */
    private final Map<String, String> fieldMap = new HashMap<>();

    /**
     * @param metadata the metadata to read names and values from
     * @param fields the mapper used to derive output field names from raw metadata names
     */
    MetadataTransformer(final Metadata metadata, final FieldNames fields) {
        this.metadata = metadata;
        this.fields = fields;
    }

    /**
     * Emits every metadata field to one of the two given consumers.
     *
     * @param single receives fields that have exactly one value after grouping
     * @param multiple receives fields that have more than one value after grouping
     * @throws IOException a {@link TaggedIOException}, tagged with this object's class, wrapping any
     * {@link IOException} raised while emitting the fields
     */
    void transform(final ValueConsumer single, final ValueArrayConsumer multiple) throws IOException {
        final Map<String, String[]> grouped = new HashMap<>();

        // First pass: group the values by output name, so that names such as "GENERATOR" and "Generator" which
        // map to the same output name contribute to one array rather than one overriding the other in the
        // consumer.
        for (final String rawName : metadata.names()) {
            final String[] rawValues = metadata.getValues(rawName);
            if (rawValues.length == 0) {
                continue;
            }

            // The title field should not be considered multivalued until TIKA-2274 is resolved.
            //noinspection deprecation
            final boolean surplusTitles = rawValues.length > 1 && rawName.equals(Metadata.TITLE);
            final String[] kept = surplusTitles ? Arrays.copyOf(rawValues, 1) : rawValues;

            // Remember the first raw name seen for each output name, to enable a reverse lookup later.
            final String outputName = fields.forMetadata(rawName);
            fieldMap.putIfAbsent(outputName, rawName);

            final String[] soFar = grouped.get(outputName);
            grouped.put(outputName, null == soFar ? kept : concat(soFar, kept));
        }

        // Second pass: hand each group to the consumer matching its cardinality.
        try {
            for (final Map.Entry<String, String[]> group : grouped.entrySet()) {
                final String outputName = group.getKey();
                final String[] groupValues = group.getValue();

                if (groupValues.length <= 1) {
                    transform(outputName, groupValues[0], single);
                } else {
                    transform(outputName, groupValues, multiple);
                }
            }
        } catch (IOException e) {
            throw new TaggedIOException(e, getClass());
        }
    }

    /**
     * Joins two arrays into a new one.
     *
     * @param a the leading elements
     * @param b the trailing elements
     * @return a new array holding the elements of {@code a} followed by those of {@code b}
     */
    private String[] concat(final String[] a, final String[] b) {
        final String[] joined = Arrays.copyOf(a, a.length + b.length);

        System.arraycopy(b, 0, joined, a.length, b.length);
        return joined;
    }

    /**
     * Emits a multivalued field, after dropping null and empty values and, for the properties listed in
     * {@link #deduplicateProperties}, duplicate values. Nothing is emitted when no value survives.
     *
     * @param normalisedName the output field name
     * @param values the grouped values of the field
     * @param consumer the consumer to pass the cleaned values to
     * @throws IOException if the consumer fails
     */
    private void transform(final String normalisedName, String[] values, final ValueArrayConsumer consumer) throws
            IOException {

        // The raw name is lowercased before the lookup so that "GENERATOR" matches "Generator" (these
        // inconsistent names can come from HTML documents).
        final String rawName = fieldMap.get(normalisedName);
        final boolean dropDuplicates = values.length > 1
                && deduplicateProperties.contains(rawName.toLowerCase(Locale.ENGLISH));

        final Stream<String> nonEmpty = Arrays.stream(values)
                .filter(candidate -> !(null == candidate || candidate.isEmpty()));
        final String[] cleaned = (dropDuplicates ? nonEmpty.distinct() : nonEmpty).toArray(String[]::new);

        if (cleaned.length == 0) {
            return;
        }

        consumer.accept(normalisedName, cleaned);
    }

    /**
     * Emits a single-valued field, unless the value is null or empty. When the raw name behind the field is a
     * known date property, a separate field containing the ISO 8601 date is emitted as well.
     *
     * @param normalisedName the output field name
     * @param value the only value of the field
     * @param consumer the consumer to pass the value to
     * @throws IOException if the consumer fails or the date cannot be parsed
     */
    private void transform(final String normalisedName, final String value, final ValueConsumer consumer) throws
            IOException {
        if (null != value && !value.isEmpty()) {
            consumer.accept(normalisedName, value);

            final String rawName = fieldMap.get(normalisedName);
            if (dateProperties.containsKey(rawName)) {
                transformDate(rawName, consumer);
            }
        }
    }

    /**
     * Emits the ISO 8601 form of a date field under the name given by {@link FieldNames#forMetadataISODate}.
     *
     * @param name the raw metadata name of a known date property
     * @param consumer the consumer to pass the formatted date to
     * @throws IOException if the metadata yields no date for the property, or if the consumer fails
     */
    private void transformDate(final String name, final ValueConsumer consumer) throws IOException {
        final Property dateProperty = dateProperties.get(name);
        final Date parsed = metadata.getDate(dateProperty);

        if (null == parsed) {
            throw new IOException(String.format("Unable to parse date \"%s\" from field " +
                    "\"%s\" for ISO 8601 formatting.", metadata.get(name), name));
        }

        consumer.accept(fields.forMetadataISODate(name), parsed.toInstant().toString());
    }

    /** Receives a field that has a single value. */
    @FunctionalInterface
    interface ValueConsumer {

        /**
         * @param name the output field name
         * @param value the value of the field
         * @throws IOException if the value cannot be consumed
         */
        void accept(final String name, final String value) throws IOException;
    }

    /** Receives a field that has several values. */
    @FunctionalInterface
    interface ValueArrayConsumer {

        /**
         * @param name the output field name
         * @param values the values of the field
         * @throws IOException if the values cannot be consumed
         */
        void accept(final String name, final String[] values) throws IOException;
    }
}