package org.icij.extract.spewer;

import org.icij.task.Options;
import org.icij.task.annotation.Option;

import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Holds the names (and name prefixes/postfixes) of the fields that spewers write to, together with
 * their defaults.
 *
 * <p>Every name starts out as the matching {@code DEFAULT_*} constant and can be overridden through
 * {@link #configure(Options)}.
 *
 * @author Matthew Caruana Galizia &lt;mcaruana@icij.org&gt;
 * @since 1.0.0-beta
 */
@Option(name = "idField", description = "Index field for an automatically generated identifier. The ID " +
		"for the same file is guaranteed not to change if the path doesn't change. Defaults to \"extract_id\".",
		code = "i", parameter = "name")
@Option(name = "textField", description = "Field name for extracted text.", code = "t", parameter = "name")
@Option(name = "pathField", description = "Field name for the file path.", parameter = "name", code = "p")
@Option(name = "parentPathField", description = "Field name for the parent directory path.", parameter = "name")
@Option(name = "parentField", description = "Field name for the parent ID on child documents.", parameter = "name")
@Option(name = "levelField", description = "Field name for the hierarchy level field.", parameter = "name")
@Option(name = "baseTypeField", description = "Field name for the base content-type.", parameter = "name")
@Option(name = "versionField", description = "Index field name for the version.", parameter = "name")
@Option(name = "tagPrefix", description = "Prefix for tag fields added to the index.", parameter = "name")
@Option(name = "metadataPrefix", description = "Prefix for metadata fields added to the index.", parameter = "name")
@Option(name = "metadataISODatePostfix", description = "Postfix for 'fixed' ISO 8601 metadata fields.",
		parameter = "name")
public class FieldNames {

	/** Matches every character that is not an ASCII letter, an ASCII digit or an underscore. */
	private static final Pattern INVALID_FIELD_NAME_CHARS = Pattern.compile("[^A-Za-z0-9_]");

	public static final String DEFAULT_ID_FIELD = "extract_id";
	public static final String DEFAULT_TEXT_FIELD = "tika_content";
	public static final String DEFAULT_PATH_FIELD = "extract_paths";
	public static final String DEFAULT_BASE_TYPE_FIELD = "extract_base_type";
	public static final String DEFAULT_PARENT_PATH_FIELD = "extract_parent_paths";
	public static final String DEFAULT_PARENT_ID_FIELD = "extract_parent_id";
	public static final String DEFAULT_LEVEL_FIELD = "extract_level";
	public static final String DEFAULT_VERSION_FIELD = "_version_";
	public static final String DEFAULT_METADATA_FIELD_PREFIX = "tika_metadata_";
	public static final String DEFAULT_TAG_FIELD_PREFIX = "tag_";
	public static final String DEFAULT_METADATA_ISO_DATE_POSTFIX = "_iso8601";

	private String textField = DEFAULT_TEXT_FIELD;
	private String pathField = DEFAULT_PATH_FIELD;
	private String parentPathField = DEFAULT_PARENT_PATH_FIELD;
	private String idField = DEFAULT_ID_FIELD;
	private String baseTypeField = DEFAULT_BASE_TYPE_FIELD;
	private String versionField = DEFAULT_VERSION_FIELD;
	private String parentIdField = DEFAULT_PARENT_ID_FIELD;
	private String levelField = DEFAULT_LEVEL_FIELD;
	private String tagFieldPrefix = DEFAULT_TAG_FIELD_PREFIX;
	private String metadataFieldPrefix = DEFAULT_METADATA_FIELD_PREFIX;
	private String metadataISODatePostfix = DEFAULT_METADATA_ISO_DATE_POSTFIX;

	/**
	 * Overrides the field names with the values found in the given options. An option without a value
	 * leaves the corresponding name untouched.
	 *
	 * @param options the options to read the field names from
	 * @return this instance, to allow chaining
	 */
	public FieldNames configure(final Options<String> options) {
		options.get("textField").value().ifPresent(this::forText);
		options.get("pathField").value().ifPresent(this::forPath);
		options.get("parentPathField").value().ifPresent(this::forParentPath);
		// The option is called "parentField" but it feeds the parent ID field name.
		options.get("parentField").value().ifPresent(this::forParentId);
		options.get("levelField").value().ifPresent(this::forLevel);
		options.get("baseTypeField").value().ifPresent(this::forBaseType);
		options.get("versionField").value().ifPresent(this::forVersion);
		options.get("idField").value().ifPresent(this::forId);
		options.get("tagPrefix").value().ifPresent(this::forTagPrefix);
		options.get("metadataPrefix").value().ifPresent(this::forMetadataPrefix);
		options.get("metadataISODatePostfix").value().ifPresent(this::forMetadataISODatePostfix);

		return this;
	}

	/** Sets the name of the field for extracted text. */
	private void forText(final String textField) {
		this.textField = textField;
	}

	/** @return the name of the field for extracted text */
	String forText() {
		return textField;
	}

	/** Sets the name of the field for the file path. */
	private void forPath(final String pathField) {
		this.pathField = pathField;
	}

	/** @return the name of the field for the file path */
	String forPath() {
		return pathField;
	}

	/** Sets the name of the field for the parent directory path. */
	private void forParentPath(final String parentPathField) {
		this.parentPathField = parentPathField;
	}

	/** @return the name of the field for the parent directory path */
	String forParentPath() {
		return parentPathField;
	}

	/** Sets the name of the field for the parent ID. */
	private void forParentId(final String parentIdField) {
		this.parentIdField = parentIdField;
	}

	/** @return the name of the field for the parent ID */
	String forParentId() {
		return parentIdField;
	}

	/** Sets the name of the field for the hierarchy level. */
	private void forLevel(final String levelField) {
		this.levelField = levelField;
	}

	/** @return the name of the field for the hierarchy level */
	String forLevel() {
		return levelField;
	}

	/** Sets the name of the field for the base content-type. */
	private void forBaseType(final String baseTypeField) {
		this.baseTypeField = baseTypeField;
	}

	/** @return the name of the field for the base content-type */
	String forBaseType() {
		return baseTypeField;
	}

	/** Sets the name of the field for the version. */
	private void forVersion(final String versionField) {
		this.versionField = versionField;
	}

	/** @return the name of the field for the version */
	String forVersion() {
		return versionField;
	}

	/** Sets the name of the field for the generated identifier. */
	private void forId(final String idField) {
		this.idField = idField;
	}

	/** @return the name of the field for the generated identifier */
	String forId() {
		return idField;
	}

	/** Sets the prefix that is prepended to metadata field names. */
	private void forMetadataPrefix(final String metadataFieldPrefix) {
		this.metadataFieldPrefix = metadataFieldPrefix;
	}

	/**
	 * Builds the field name for a metadata key.
	 *
	 * <p>Every character other than {@code A-Z}, {@code a-z}, {@code 0-9} and {@code _} is replaced by
	 * an underscore, the result is lower-cased using {@link Locale#ROOT}, and the metadata prefix is
	 * prepended unless it is {@code null}.
	 *
	 * @param name the raw metadata key
	 * @return the normalised, optionally prefixed, field name
	 */
	String forMetadata(final String name) {
		final String normalizedName = INVALID_FIELD_NAME_CHARS.matcher(name)
				.replaceAll("_")
				.toLowerCase(Locale.ROOT);

		if (null != metadataFieldPrefix) {
			return metadataFieldPrefix + normalizedName;
		}

		return normalizedName;
	}

	/** Sets the postfix that is appended to ISO 8601 metadata field names. */
	private void forMetadataISODatePostfix(final String metadataISODatePostfix) {
		this.metadataISODatePostfix = metadataISODatePostfix;
	}

	/**
	 * Builds the field name for the ISO 8601 variant of a metadata key: the result of
	 * {@link #forMetadata(String)} followed by the ISO date postfix.
	 *
	 * @param name the raw metadata key
	 * @return the metadata field name with the ISO date postfix appended
	 */
	String forMetadataISODate(final String name) {
		return forMetadata(name) + metadataISODatePostfix;
	}

	/** Sets the prefix that is prepended to tag field names. */
	private void forTagPrefix(final String tagFieldPrefix) {
		this.tagFieldPrefix = tagFieldPrefix;
	}

	/**
	 * Builds the field name for a tag by prepending the tag prefix. Unlike
	 * {@link #forMetadata(String)}, the name is used as given, without normalisation.
	 *
	 * @param name the tag name
	 * @return the prefixed tag field name
	 */
	String forTag(final String name) {
		return tagFieldPrefix + name;
	}
}