/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE file at the root of the source * tree and available online at * * https://github.com/keeps/roda */ package org.roda.core.plugins.plugins.characterization; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import org.roda.core.common.iterables.CloseableIterable; import org.roda.core.data.common.RodaConstants; import org.roda.core.data.common.RodaConstants.PreservationEventType; import org.roda.core.data.exceptions.AlreadyExistsException; import org.roda.core.data.exceptions.AuthorizationDeniedException; import org.roda.core.data.exceptions.GenericException; import org.roda.core.data.exceptions.InvalidParameterException; import org.roda.core.data.exceptions.NotFoundException; import org.roda.core.data.exceptions.RequestNotValidException; import org.roda.core.data.v2.IsRODAObject; import org.roda.core.data.v2.common.OptionalWithCause; import org.roda.core.data.v2.ip.AIP; import org.roda.core.data.v2.ip.AIPState; import org.roda.core.data.v2.ip.File; import org.roda.core.data.v2.ip.Representation; import org.roda.core.data.v2.ip.RepresentationLink; import org.roda.core.data.v2.ip.metadata.LinkingIdentifier; import org.roda.core.data.v2.jobs.Job; import org.roda.core.data.v2.jobs.PluginParameter; import org.roda.core.data.v2.jobs.PluginParameter.PluginParameterType; import org.roda.core.data.v2.jobs.PluginType; import org.roda.core.data.v2.jobs.Report; import org.roda.core.data.v2.jobs.Report.PluginState; import org.roda.core.data.v2.validation.ValidationException; import org.roda.core.index.IndexService; import org.roda.core.model.ModelService; import org.roda.core.plugins.AbstractAIPComponentsPlugin; import org.roda.core.plugins.Plugin; import org.roda.core.plugins.PluginException; import org.roda.core.plugins.orchestrate.SimpleJobPluginInfo; import org.roda.core.plugins.plugins.PluginHelper; import org.roda.core.storage.StorageService; import org.roda.core.util.IdUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TikaFullTextPlugin<T extends IsRODAObject> extends AbstractAIPComponentsPlugin<T> { private static final Logger LOGGER = LoggerFactory.getLogger(TikaFullTextPlugin.class); private boolean doFeatureExtraction = true; private boolean doFulltextExtraction = false; private static Map<String, PluginParameter> pluginParameters = new HashMap<>(); static { pluginParameters.put(RodaConstants.PLUGIN_PARAMS_DO_FEATURE_EXTRACTION, new PluginParameter( RodaConstants.PLUGIN_PARAMS_DO_FEATURE_EXTRACTION, "Feature extraction", PluginParameterType.BOOLEAN, "true", true, false, "Perform feature extraction from files. This will extract properties such as number of pages, width, height, colour space, etc.")); pluginParameters.put(RodaConstants.PLUGIN_PARAMS_DO_FULLTEXT_EXTRACTION, new PluginParameter( RodaConstants.PLUGIN_PARAMS_DO_FULLTEXT_EXTRACTION, "Full text extraction", PluginParameterType.BOOLEAN, "true", true, false, "Extracts full text from document/textual files. Extracted text is used to perform full-text searching on the catalogue.")); } @Override public void init() throws PluginException { // do nothing } @Override public void shutdown() { // do nothing } public static String getStaticName() { return "Feature extraction (Apache Tika)"; } @Override public String getName() { return getStaticName(); } public static String getStaticDescription() { return "The Apache Tika tool extracts technical metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). \nThe task updates " + "PREMIS objects metadata in the entity to store the results of the characterization process. A PREMIS event is also recorded " + "after the task is run.\nFor more information on this tool, please visit https://tika.apache.org"; } @Override public String getDescription() { return getStaticDescription(); } @Override public String getVersionImpl() { return "1.0"; } @Override public List<PluginParameter> getParameters() { ArrayList<PluginParameter> parameters = new ArrayList<>(); parameters.add(pluginParameters.get(RodaConstants.PLUGIN_PARAMS_DO_FEATURE_EXTRACTION)); parameters.add(pluginParameters.get(RodaConstants.PLUGIN_PARAMS_DO_FULLTEXT_EXTRACTION)); return parameters; } @Override public void setParameterValues(Map<String, String> parameters) throws InvalidParameterException { super.setParameterValues(parameters); if (parameters.containsKey(RodaConstants.PLUGIN_PARAMS_DO_FEATURE_EXTRACTION)) { doFeatureExtraction = Boolean.parseBoolean(parameters.get(RodaConstants.PLUGIN_PARAMS_DO_FEATURE_EXTRACTION)); } if (parameters.containsKey(RodaConstants.PLUGIN_PARAMS_DO_FULLTEXT_EXTRACTION)) { doFulltextExtraction = Boolean.parseBoolean(parameters.get(RodaConstants.PLUGIN_PARAMS_DO_FULLTEXT_EXTRACTION)); } } @Override public Report executeOnAIP(IndexService index, ModelService model, StorageService storage, Report report, SimpleJobPluginInfo jobPluginInfo, List<AIP> list, Job job) throws PluginException { try { for (AIP aip : list) { Report reportItem = PluginHelper.initPluginReportItem(this, aip.getId(), AIP.class, AIPState.INGEST_PROCESSING); reportItem.setPluginState(PluginState.SUCCESS); PluginHelper.updatePartialJobReport(this, model, reportItem, false, job); LOGGER.debug("Processing AIP {}", aip.getId()); String outcomeDetailExtension = ""; List<LinkingIdentifier> sources = new ArrayList<>(); try { for (Representation representation : aip.getRepresentations()) { LOGGER.debug("Processing representation {} of AIP {}", representation.getId(), aip.getId()); CloseableIterable<OptionalWithCause<File>> allFiles = model.listFilesUnder(aip.getId(), representation.getId(), true); for (OptionalWithCause<File> oFile : allFiles) { if (oFile.isPresent()) { File file = oFile.get(); LinkingIdentifier tikaResult = TikaFullTextPluginUtils.runTikaFullTextOnFile(model, file, doFeatureExtraction, doFulltextExtraction); sources.add(tikaResult); } else { LOGGER.error("Cannot process File", oFile.getCause()); } } model.notifyRepresentationUpdated(representation); } jobPluginInfo.incrementObjectsProcessedWithSuccess(); } catch (Exception e) { outcomeDetailExtension = e.getMessage(); LOGGER.error("Error running Tika on AIP {}", aip.getId(), e); if (reportItem != null) { StringBuilder details; if (reportItem.getPluginDetails() == null) { details = new StringBuilder(); } else { details = new StringBuilder(reportItem.getPluginDetails()); } details.append(e.getMessage()); reportItem.setPluginDetails(details.toString()).setPluginState(PluginState.FAILURE); } jobPluginInfo.incrementObjectsProcessedWithFailure(); } report.addReport(reportItem); PluginHelper.updatePartialJobReport(this, model, reportItem, true, job); try { List<LinkingIdentifier> outcomes = null; boolean notify = true; PluginHelper.createPluginEvent(this, aip.getId(), model, index, sources, outcomes, reportItem.getPluginState(), outcomeDetailExtension, notify); } catch (ValidationException | RequestNotValidException | NotFoundException | GenericException | AuthorizationDeniedException | AlreadyExistsException e) { LOGGER.error("Error creating preservation event", e); } } } catch (ClassCastException e) { LOGGER.error("Trying to execute an AIP-only plugin with other objects"); jobPluginInfo.incrementObjectsProcessedWithFailure(list.size()); } return report; } @Override public Report executeOnRepresentation(IndexService index, ModelService model, StorageService storage, Report report, SimpleJobPluginInfo jobPluginInfo, List<Representation> list, Job job) throws PluginException { try { for (Representation representation : list) { LOGGER.debug("Processing representation {} of AIP {}", IdUtils.getRepresentationId(representation), representation.getAipId()); Report reportItem = PluginHelper.initPluginReportItem(this, representation.getId(), Representation.class, AIPState.INGEST_PROCESSING); reportItem.setPluginState(PluginState.SUCCESS); PluginHelper.updatePartialJobReport(this, model, reportItem, false, job); List<LinkingIdentifier> sources = new ArrayList<>(); String outcomeDetailExtension = ""; try { CloseableIterable<OptionalWithCause<File>> allFiles = model.listFilesUnder(representation.getAipId(), representation.getId(), true); for (OptionalWithCause<File> oFile : allFiles) { if (oFile.isPresent()) { File file = oFile.get(); LinkingIdentifier tikaResult = TikaFullTextPluginUtils.runTikaFullTextOnFile(model, file, doFeatureExtraction, doFulltextExtraction); sources.add(tikaResult); } else { LOGGER.error("Cannot process File", oFile.getCause()); } } model.notifyRepresentationUpdated(representation); jobPluginInfo.incrementObjectsProcessedWithSuccess(); } catch (Exception e) { outcomeDetailExtension = e.getMessage(); LOGGER.error("Error running Tika on Representation {}: {}", representation.getId(), e.getMessage()); if (reportItem != null) { StringBuilder details; if (reportItem.getPluginDetails() == null) { details = new StringBuilder(); } else { details = new StringBuilder(reportItem.getPluginDetails()); } details.append(e.getMessage()); reportItem.setPluginDetails(details.toString()).setPluginState(PluginState.FAILURE); } jobPluginInfo.incrementObjectsProcessedWithFailure(); } report.addReport(reportItem); PluginHelper.updatePartialJobReport(this, model, reportItem, true, job); try { List<LinkingIdentifier> outcomes = null; boolean notify = true; PluginHelper.createPluginEvent(this, representation.getAipId(), representation.getId(), model, index, sources, outcomes, reportItem.getPluginState(), outcomeDetailExtension, notify); } catch (ValidationException | RequestNotValidException | NotFoundException | GenericException | AuthorizationDeniedException | AlreadyExistsException e) { LOGGER.error("Error creating preservation event", e); } } } catch (ClassCastException e) { LOGGER.error("Trying to execute an Representation-only plugin with other objects"); jobPluginInfo.incrementObjectsProcessedWithFailure(list.size()); } return report; } @Override public Report executeOnFile(IndexService index, ModelService model, StorageService storage, Report report, SimpleJobPluginInfo jobPluginInfo, List<File> list, Job job) throws PluginException { List<RepresentationLink> representationsToUpdate = new ArrayList<>(); for (File file : list) { LOGGER.debug("Processing file {} of representation {} of AIP {}", file.getId(), file.getRepresentationId(), file.getAipId()); Report reportItem = PluginHelper.initPluginReportItem(this, IdUtils.getFileId(file), File.class, AIPState.INGEST_PROCESSING); reportItem.setPluginState(PluginState.SUCCESS); PluginHelper.updatePartialJobReport(this, model, reportItem, false, job); List<LinkingIdentifier> sources = new ArrayList<>(); String outcomeDetailExtension = ""; try { LinkingIdentifier tikaResult = TikaFullTextPluginUtils.runTikaFullTextOnFile(model, file, doFeatureExtraction, doFulltextExtraction); sources.add(tikaResult); RepresentationLink link = new RepresentationLink(file.getAipId(), file.getRepresentationId()); if (!representationsToUpdate.contains(link)) { representationsToUpdate.add(link); } jobPluginInfo.incrementObjectsProcessedWithSuccess(); } catch (Exception e) { outcomeDetailExtension = e.getMessage(); LOGGER.error("Error running Tika on File {}: {}", file.getId(), e.getMessage()); if (reportItem != null) { StringBuilder details; if (reportItem.getPluginDetails() == null) { details = new StringBuilder(); } else { details = new StringBuilder(reportItem.getPluginDetails()); } details.append(e.getMessage()); reportItem.setPluginDetails(details.toString()).setPluginState(PluginState.FAILURE); } jobPluginInfo.incrementObjectsProcessedWithFailure(); } report.addReport(reportItem); PluginHelper.updatePartialJobReport(this, model, reportItem, true, job); try { List<LinkingIdentifier> outcomes = null; boolean notify = true; PluginHelper.createPluginEvent(this, file.getAipId(), file.getRepresentationId(), file.getPath(), file.getId(), model, index, sources, outcomes, reportItem.getPluginState(), outcomeDetailExtension, notify); } catch (ValidationException | RequestNotValidException | NotFoundException | GenericException | AuthorizationDeniedException | AlreadyExistsException e) { LOGGER.error("Error creating preservation event", e); } } for (RepresentationLink link : representationsToUpdate) { try { Representation representation = model.retrieveRepresentation(link.getAipId(), link.getRepresentationId()); model.notifyRepresentationUpdated(representation); } catch (RequestNotValidException | GenericException | NotFoundException | AuthorizationDeniedException e) { LOGGER.error("Error updating representation after running Tika plugin"); } } return report; } @Override public Plugin<T> cloneMe() { TikaFullTextPlugin<T> tikaPlugin = new TikaFullTextPlugin<>(); try { tikaPlugin.init(); } catch (PluginException e) { LOGGER.error("Error doing {} init", TikaFullTextPlugin.class.getName(), e); } return tikaPlugin; } @Override public PluginType getType() { return PluginType.AIP_TO_AIP; } @Override public boolean areParameterValuesValid() { return true; } @Override public PreservationEventType getPreservationEventType() { return PreservationEventType.METADATA_EXTRACTION; } @Override public String getPreservationEventDescription() { return "Extraction of technical metadata using Apache Tika."; } @Override public String getPreservationEventSuccessMessage() { return "Successfully extracted technical metadata and/or full text from file(s). " + "The results of extraction are stored under [REPRESENTATION_ID]/metadata/other/ApacheTika."; } @Override public String getPreservationEventFailureMessage() { return "Failed to extract technical metadata from file."; } @Override public Report beforeAllExecute(IndexService index, ModelService model, StorageService storage) throws PluginException { // do nothing return null; } @Override public Report afterAllExecute(IndexService index, ModelService model, StorageService storage) throws PluginException { // do nothing return null; } @Override public List<String> getCategories() { return Arrays.asList(RodaConstants.PLUGIN_CATEGORY_CHARACTERIZATION); } @SuppressWarnings({"unchecked", "rawtypes"}) @Override public List<Class<T>> getObjectClasses() { List<Class<? extends IsRODAObject>> list = new ArrayList<>(); list.add(AIP.class); list.add(Representation.class); list.add(File.class); return (List) list; } }