package edu.harvard.iq.dataverse.search; import edu.harvard.iq.dataverse.ControlledVocabularyValue; import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DataFileTag; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldConstant; import edu.harvard.iq.dataverse.DatasetFieldType; import edu.harvard.iq.dataverse.DatasetLinkingServiceBean; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseLinkingServiceBean; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.PermissionServiceBean; import edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import java.io.IOException; import java.sql.Timestamp; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.Future; import java.util.logging.Logger; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import javax.ejb.AsyncResult; import javax.ejb.EJB; import javax.ejb.EJBException; import javax.ejb.Stateless; import javax.ejb.TransactionAttribute; import static javax.ejb.TransactionAttributeType.REQUIRES_NEW; import javax.inject.Named; import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; @Stateless @Named public class IndexServiceBean { private static final Logger logger = Logger.getLogger(IndexServiceBean.class.getCanonicalName()); @EJB DvObjectServiceBean dvObjectService; @EJB DataverseServiceBean dataverseService; @EJB DatasetServiceBean datasetService; @EJB BuiltinUserServiceBean dataverseUserServiceBean; @EJB PermissionServiceBean permissionService; @EJB AuthenticationServiceBean userServiceBean; @EJB SystemConfig systemConfig; @EJB SearchPermissionsServiceBean searchPermissionsService; @EJB SolrIndexServiceBean solrIndexService; @EJB DatasetLinkingServiceBean dsLinkingService; @EJB DataverseLinkingServiceBean dvLinkingService; public static final String solrDocIdentifierDataverse = "dataverse_"; public static final String solrDocIdentifierFile = "datafile_"; public static final String solrDocIdentifierDataset = "dataset_"; public static final String draftSuffix = "_draft"; public static final String deaccessionedSuffix = "_deaccessioned"; public static final String discoverabilityPermissionSuffix = "_permission"; private static final String groupPrefix = "group_"; private static final String groupPerUserPrefix = "group_user"; private static final String publicGroupIdString = "public"; private static final String publicGroupString = groupPrefix + "public"; public static final String PUBLISHED_STRING = "Published"; private static final String UNPUBLISHED_STRING = "Unpublished"; private static final String DRAFT_STRING = "Draft"; private static final String IN_REVIEW_STRING = "In Review"; private static final String DEACCESSIONED_STRING = "Deaccessioned"; public static final String HARVESTED = "Harvested"; private String rootDataverseName; private Dataverse rootDataverseCached; private SolrServer solrServer; @PostConstruct public void init(){ solrServer = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr"); rootDataverseName = findRootDataverseCached().getName() + " " + BundleUtil.getStringFromBundle("dataverse"); } @PreDestroy public void close(){ if(solrServer != null){ solrServer.shutdown(); solrServer = null; } } @TransactionAttribute(REQUIRES_NEW) public Future<String> indexDataverseInNewTransaction(Dataverse dataverse) { return indexDataverse(dataverse); } public Future<String> indexDataverse(Dataverse dataverse) { logger.fine("indexDataverse called on dataverse id " + dataverse.getId() + "(" + dataverse.getAlias() + ")"); if (dataverse.getId() == null) { String msg = "unable to index dataverse. id was null (alias: " + dataverse.getAlias() + ")"; logger.info(msg); return new AsyncResult<>(msg); } Dataverse rootDataverse = findRootDataverseCached(); if (rootDataverse == null) { String msg = "Could not find root dataverse and the root dataverse should not be indexed. Returning."; return new AsyncResult<>(msg); } else if (dataverse.getId() == rootDataverse.getId()) { String msg = "The root dataverse should not be indexed. Returning."; return new AsyncResult<>(msg); } Collection<SolrInputDocument> docs = new ArrayList<>(); SolrInputDocument solrInputDocument = new SolrInputDocument(); solrInputDocument.addField(SearchFields.ID, solrDocIdentifierDataverse + dataverse.getId()); solrInputDocument.addField(SearchFields.ENTITY_ID, dataverse.getId()); solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, systemConfig.getVersion()); solrInputDocument.addField(SearchFields.IDENTIFIER, dataverse.getAlias()); solrInputDocument.addField(SearchFields.TYPE, "dataverses"); solrInputDocument.addField(SearchFields.NAME, dataverse.getName()); solrInputDocument.addField(SearchFields.NAME_SORT, dataverse.getName()); solrInputDocument.addField(SearchFields.DATAVERSE_NAME, dataverse.getName()); solrInputDocument.addField(SearchFields.DATAVERSE_CATEGORY, dataverse.getIndexableCategoryName()); if (dataverse.isReleased()) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getPublicationDate()); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getPublicationDate())); } else { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getCreateDate()); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getCreateDate())); } /* We don't really have harvested dataverses yet; (I have in fact just removed the isHarvested() method from the Dataverse object) -- L.A. if (dataverse.isHarvested()) { solrInputDocument.addField(SearchFields.IS_HARVESTED, true); solrInputDocument.addField(SearchFields.SOURCE, HARVESTED); } else { (this means that all dataverses are "local" - should this be removed? */ solrInputDocument.addField(SearchFields.IS_HARVESTED, false); solrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName() + " " + BundleUtil.getStringFromBundle("dataverse")); //rootDataverseName); /*}*/ addDataverseReleaseDateToSolrDoc(solrInputDocument, dataverse); // if (dataverse.getOwner() != null) { // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataverse.getOwner().getName()); // } solrInputDocument.addField(SearchFields.DESCRIPTION, StringUtil.html2text(dataverse.getDescription())); solrInputDocument.addField(SearchFields.DATAVERSE_DESCRIPTION, StringUtil.html2text(dataverse.getDescription())); // logger.info("dataverse affiliation: " + dataverse.getAffiliation()); if (dataverse.getAffiliation() != null && !dataverse.getAffiliation().isEmpty()) { /** * @todo: stop using affiliation as category */ // solrInputDocument.addField(SearchFields.CATEGORY, dataverse.getAffiliation()); solrInputDocument.addField(SearchFields.AFFILIATION, dataverse.getAffiliation()); solrInputDocument.addField(SearchFields.DATAVERSE_AFFILIATION, dataverse.getAffiliation()); } for (ControlledVocabularyValue dataverseSubject : dataverse.getDataverseSubjects()) { String subject = dataverseSubject.getStrValue(); if (!subject.equals(DatasetField.NA_VALUE)) { solrInputDocument.addField(SearchFields.DATAVERSE_SUBJECT, subject); // collapse into shared "subject" field used as a facet solrInputDocument.addField(SearchFields.SUBJECT, subject); } } // checking for NPE is important so we can create the root dataverse if (rootDataverse != null && !dataverse.equals(rootDataverse)) { // important when creating root dataverse if (dataverse.getOwner() != null) { solrInputDocument.addField(SearchFields.PARENT_ID, dataverse.getOwner().getId()); solrInputDocument.addField(SearchFields.PARENT_NAME, dataverse.getOwner().getName()); } } List<String> dataversePathSegmentsAccumulator = new ArrayList<>(); List<String> dataverseSegments = findPathSegments(dataverse, dataversePathSegmentsAccumulator); List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments); if (dataversePaths.size() > 0) { // don't show yourself while indexing or in search results: https://redmine.hmdc.harvard.edu/issues/3613 // logger.info(dataverse.getName() + " size " + dataversePaths.size()); dataversePaths.remove(dataversePaths.size() - 1); } //Add paths for linking dataverses for (Dataverse linkingDataverse : dvLinkingService.findLinkingDataverses(dataverse.getId())) { List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>(); List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator); List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments); for (String dvPath : linkingDataversePaths) { dataversePaths.add(dvPath); } } solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); docs.add(solrInputDocument); String status; try { if (dataverse.getId() != null) { solrServer.add(docs); } else { logger.info("WARNING: indexing of a dataverse with no id attempted"); } } catch (SolrServerException | IOException ex) { status = ex.toString(); logger.info(status); return new AsyncResult<>(status); } try { solrServer.commit(); } catch (SolrServerException | IOException ex) { status = ex.toString(); logger.info(status); return new AsyncResult<>(status); } dvObjectService.updateContentIndexTime(dataverse); IndexResponse indexResponse = solrIndexService.indexPermissionsForOneDvObject(dataverse); String msg = "indexed dataverse " + dataverse.getId() + ":" + dataverse.getAlias() + ". Response from permission indexing: " + indexResponse.getMessage(); return new AsyncResult<>(msg); } @TransactionAttribute(REQUIRES_NEW) public Future<String> indexDatasetInNewTransaction(Dataset dataset) { boolean doNormalSolrDocCleanUp = false; return indexDataset(dataset, doNormalSolrDocCleanUp); } public Future<String> indexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { logger.fine("indexing dataset " + dataset.getId()); /** * @todo should we use solrDocIdentifierDataset or * IndexableObject.IndexableTypes.DATASET.getName() + "_" ? */ // String solrIdPublished = solrDocIdentifierDataset + dataset.getId(); String solrIdPublished = determinePublishedDatasetSolrDocId(dataset); String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix(); // String solrIdDeaccessioned = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix(); String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset); StringBuilder debug = new StringBuilder(); debug.append("\ndebug:\n"); int numPublishedVersions = 0; List<DatasetVersion> versions = dataset.getVersions(); List<String> solrIdsOfFilesToDelete = new ArrayList<>(); for (DatasetVersion datasetVersion : versions) { Long versionDatabaseId = datasetVersion.getId(); String versionTitle = datasetVersion.getTitle(); String semanticVersion = datasetVersion.getSemanticVersion(); DatasetVersion.VersionState versionState = datasetVersion.getVersionState(); if (versionState.equals(DatasetVersion.VersionState.RELEASED)) { numPublishedVersions += 1; } debug.append("version found with database id " + versionDatabaseId + "\n"); debug.append("- title: " + versionTitle + "\n"); debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n"); List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas(); List<String> fileInfo = new ArrayList<>(); for (FileMetadata fileMetadata : fileMetadatas) { String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId(); /** * It sounds weird but the first thing we'll do is preemptively * delete the Solr documents of all published files. Don't * worry, published files will be re-indexed later along with * the dataset. We do this so users can delete files from * published versions of datasets and then re-publish a new * version without fear that their old published files (now * deleted from the latest published version) will be * searchable. See also * https://github.com/IQSS/dataverse/issues/762 */ solrIdsOfFilesToDelete.add(solrIdOfPublishedFile); fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel()); } try { /** * Preemptively delete *all* Solr documents for files associated * with the dataset based on a Solr query. * * We must query Solr for this information because the file has * been deleted from the database ( perhaps when Solr was down, * as reported in https://github.com/IQSS/dataverse/issues/2086 * ) so the database doesn't even know about the file. It's an * orphan. * * @todo This Solr query should make the iteration above based * on the database unnecessary because it the Solr query should * find all files for the dataset. We can probably remove the * iteration above after an "index all" has been performed. * Without an "index all" we won't be able to find files based * on parentId because that field wasn't searchable in 4.0. * * @todo We should also delete the corresponding Solr * "permission" documents for the files. */ List<String> allFilesForDataset = findFilesOfParentDataset(dataset.getId()); solrIdsOfFilesToDelete.addAll(allFilesForDataset); } catch (SearchException | NullPointerException ex) { logger.fine("could not run search of files to delete: " + ex); } int numFiles = 0; if (fileMetadatas != null) { numFiles = fileMetadatas.size(); } debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n"); } debug.append("numPublishedVersions: " + numPublishedVersions + "\n"); if (doNormalSolrDocCleanUp) { IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete); debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n"); } DatasetVersion latestVersion = dataset.getLatestVersion(); String latestVersionStateString = latestVersion.getVersionState().name(); DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState(); DatasetVersion releasedVersion = dataset.getReleasedVersion(); boolean atLeastOnePublishedVersion = false; if (releasedVersion != null) { atLeastOnePublishedVersion = true; } else { atLeastOnePublishedVersion = false; } Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>(); /** * @todo refactor all of this below and have a single method that takes * the map of desired cards (which correspond to Solr documents) as one * of the arguments and does all the operations necessary to achieve the * desired state. */ StringBuilder results = new StringBuilder(); if (atLeastOnePublishedVersion == false) { results.append("No published version, nothing will be indexed as ") .append(solrIdPublished).append("\n"); if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) { desiredCards.put(DatasetVersion.VersionState.DRAFT, true); IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion); String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); results.append("The latest version is a working copy (latestVersionState: ") .append(latestVersionStateString).append(") and indexing was attempted for ") .append(solrIdDraftDataset).append(" (limited discoverability). Result: ") .append(indexDraftResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); if (doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(" (and files). Result: ") .append(deleteDeaccessionedResult).append("\n"); } desiredCards.put(DatasetVersion.VersionState.RELEASED, false); if (doNormalSolrDocCleanUp) { String deletePublishedResults = removePublished(dataset); results.append("No published version. Attempting to delete traces of published version from index. Result: "). append(deletePublishedResults).append("\n"); } /** * Desired state for existence of cards: {DRAFT=true, * DEACCESSIONED=false, RELEASED=false} * * No published version, nothing will be indexed as dataset_17 * * The latest version is a working copy (latestVersionState: * DRAFT) and indexing was attempted for dataset_17_draft * (limited discoverability). Result: indexed dataset 17 as * dataset_17_draft. filesIndexed: [datafile_18_draft] * * Draft exists, no need for deaccessioned version. Deletion * attempted for dataset_17_deaccessioned (and files). Result: * Attempted to delete dataset_17_deaccessioned from Solr index. * updateReponse was: * {responseHeader={status=0,QTime=1}}Attempted to delete * datafile_18_deaccessioned from Solr index. updateReponse was: * {responseHeader={status=0,QTime=1}} * * No published version. Attempting to delete traces of * published version from index. Result: Attempted to delete * dataset_17 from Solr index. updateReponse was: * {responseHeader={status=0,QTime=1}}Attempted to delete * datafile_18 from Solr index. updateReponse was: * {responseHeader={status=0,QTime=0}} */ String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString(); logger.fine(result); indexDatasetPermissions(dataset); return new AsyncResult<>(result); } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) { desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true); IndexableDataset indexableDeaccessionedVersion = new IndexableDataset(latestVersion); String indexDeaccessionedVersionResult = addOrUpdateDataset(indexableDeaccessionedVersion); results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.RELEASED, false); if (doNormalSolrDocCleanUp) { String deletePublishedResults = removePublished(dataset); results.append("No published version. Attempting to delete traces of published version from index. Result: "). append(deletePublishedResults).append("\n"); } desiredCards.put(DatasetVersion.VersionState.DRAFT, false); if (doNormalSolrDocCleanUp) { List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset); String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset); String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete); results.append("Attempting to delete traces of drafts. Result: ") .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n"); } /** * Desired state for existence of cards: {DEACCESSIONED=true, * RELEASED=false, DRAFT=false} * * No published version, nothing will be indexed as dataset_17 * * No draft version. Attempting to index as deaccessioned. * Result: indexed dataset 17 as dataset_17_deaccessioned. * filesIndexed: [] * * No published version. Attempting to delete traces of * published version from index. Result: Attempted to delete * dataset_17 from Solr index. updateReponse was: * {responseHeader={status=0,QTime=0}}Attempted to delete * datafile_18 from Solr index. updateReponse was: * {responseHeader={status=0,QTime=3}} * * Attempting to delete traces of drafts. Result: Attempted to * delete dataset_17_draft from Solr index. updateReponse was: * {responseHeader={status=0,QTime=1}} */ String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString(); logger.fine(result); indexDatasetPermissions(dataset); return new AsyncResult<>(result); } else { String result = "No-op. Unexpected condition reached: No released version and latest version is neither draft nor deaccessioned"; logger.fine(result); return new AsyncResult<>(result); } } else if (atLeastOnePublishedVersion == true) { results.append("Published versions found. ") .append("Will attempt to index as ").append(solrIdPublished).append(" (discoverable by anonymous)\n"); if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED) || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) { desiredCards.put(DatasetVersion.VersionState.RELEASED, true); IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion); String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion); results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DRAFT, false); if (doNormalSolrDocCleanUp) { List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset); String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset); String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete); results.append("The latest version is published. Attempting to delete drafts. Result: ") .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n"); } desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); if (doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("No need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult); } /** * Desired state for existence of cards: {RELEASED=true, * DRAFT=false, DEACCESSIONED=false} * * Released versions found: 1. Will attempt to index as * dataset_17 (discoverable by anonymous) * * Attempted to index dataset_17. Result: indexed dataset 17 as * dataset_17. filesIndexed: [datafile_18] * * The latest version is published. Attempting to delete drafts. * Result: Attempted to delete dataset_17_draft from Solr index. * updateReponse was: {responseHeader={status=0,QTime=1}} * * No need for deaccessioned version. Deletion attempted for * dataset_17_deaccessioned. Result: Attempted to delete * dataset_17_deaccessioned from Solr index. updateReponse was: * {responseHeader={status=0,QTime=1}}Attempted to delete * datafile_18_deaccessioned from Solr index. updateReponse was: * {responseHeader={status=0,QTime=0}} */ String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString(); logger.fine(result); indexDatasetPermissions(dataset); return new AsyncResult<>(result); } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) { IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion); desiredCards.put(DatasetVersion.VersionState.DRAFT, true); String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); results.append("The latest version is a working copy (latestVersionState: ") .append(latestVersionStateString).append(") and will be indexed as ") .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.RELEASED, true); IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion); String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion); results.append("There is a published version we will attempt to index. Result: ").append(indexReleasedVersionResult).append("\n"); desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); if (doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); results.append("No need for deaccessioned version. Deletion attempted for ") .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult); } /** * Desired state for existence of cards: {DRAFT=true, * RELEASED=true, DEACCESSIONED=false} * * Released versions found: 1. Will attempt to index as * dataset_17 (discoverable by anonymous) * * The latest version is a working copy (latestVersionState: * DRAFT) and will be indexed as dataset_17_draft (limited * visibility). Result: indexed dataset 17 as dataset_17_draft. * filesIndexed: [datafile_18_draft] * * There is a published version we will attempt to index. * Result: indexed dataset 17 as dataset_17. filesIndexed: * [datafile_18] * * No need for deaccessioned version. Deletion attempted for * dataset_17_deaccessioned. Result: Attempted to delete * dataset_17_deaccessioned from Solr index. updateReponse was: * {responseHeader={status=0,QTime=1}}Attempted to delete * datafile_18_deaccessioned from Solr index. updateReponse was: * {responseHeader={status=0,QTime=0}} */ String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString(); logger.fine(result); indexDatasetPermissions(dataset); return new AsyncResult<>(result); } else { String result = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft"; logger.fine(result); return new AsyncResult<>(result); } } else { String result = "No-op. Unexpected condition reached: Has a version been published or not?"; logger.fine(result); return new AsyncResult<>(result); } } private String deleteDraftFiles(List<String> solrDocIdsForDraftFilesToDelete) { String deleteDraftFilesResults = ""; IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete); deleteDraftFilesResults = indexResponse.toString(); return deleteDraftFilesResults; } private IndexResponse indexDatasetPermissions(Dataset dataset) { boolean disabledForDebugging = false; if (disabledForDebugging) { /** * Performance problems indexing permissions in * https://github.com/IQSS/dataverse/issues/50 and * https://github.com/IQSS/dataverse/issues/2036 */ return new IndexResponse("permissions indexing disabled for debugging"); } IndexResponse indexResponse = solrIndexService.indexPermissionsOnSelfAndChildren(dataset); return indexResponse; } private String addOrUpdateDataset(IndexableDataset indexableDataset) { IndexableDataset.DatasetState state = indexableDataset.getDatasetState(); Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); Collection<SolrInputDocument> docs = new ArrayList<>(); List<String> dataversePathSegmentsAccumulator = new ArrayList<>(); List<String> dataverseSegments = new ArrayList<>(); try { dataverseSegments = findPathSegments(dataset.getOwner(), dataversePathSegmentsAccumulator); } catch (Exception ex) { logger.info("failed to find dataverseSegments for dataversePaths for " + SearchFields.SUBTREE + ": " + ex); } List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments); //Add Paths for linking dataverses for (Dataverse linkingDataverse : dsLinkingService.findLinkingDataverses(dataset.getId())) { List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>(); List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator); List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments); for (String dvPath : linkingDataversePaths) { dataversePaths.add(dvPath); } } SolrInputDocument solrInputDocument = new SolrInputDocument(); String datasetSolrDocId = indexableDataset.getSolrDocId(); solrInputDocument.addField(SearchFields.ID, datasetSolrDocId); solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId()); String dataverseVersion = systemConfig.getVersion(); solrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); solrInputDocument.addField(SearchFields.IDENTIFIER, dataset.getGlobalId()); solrInputDocument.addField(SearchFields.DATASET_PERSISTENT_ID, dataset.getGlobalId()); solrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); solrInputDocument.addField(SearchFields.TYPE, "datasets"); Date datasetSortByDate = new Date(); Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate(); if (majorVersionReleaseDate != null) { if (true) { String msg = "major release date found: " + majorVersionReleaseDate.toString(); logger.fine(msg); } datasetSortByDate = majorVersionReleaseDate; } else { if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.DEACCESSIONED)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DEACCESSIONED_STRING); } Date createDate = dataset.getCreateDate(); if (createDate != null) { if (true) { String msg = "can't find major release date, using create date: " + createDate; logger.fine(msg); } datasetSortByDate = createDate; } else { String msg = "can't find major release date or create date, using \"now\""; logger.info(msg); datasetSortByDate = new Date(); } } solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(datasetSortByDate)); if (state.equals(indexableDataset.getDatasetState().PUBLISHED)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); // solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataset.getPublicationDate()); } else if (state.equals(indexableDataset.getDatasetState().WORKING_COPY)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); } addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset); if (dataset.isHarvested()) { solrInputDocument.addField(SearchFields.IS_HARVESTED, true); solrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED); } else { solrInputDocument.addField(SearchFields.IS_HARVESTED, false); solrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName() + " " + BundleUtil.getStringFromBundle("dataverse")); //rootDataverseName); } DatasetVersion datasetVersion = indexableDataset.getDatasetVersion(); String parentDatasetTitle = "TBD"; if (datasetVersion != null) { solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); solrInputDocument.addField(SearchFields.DATASET_CITATION, datasetVersion.getCitation(false)); solrInputDocument.addField(SearchFields.DATASET_CITATION_HTML, datasetVersion.getCitation(true)); if (datasetVersion.isInReview()) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING); } for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) { DatasetFieldType dsfType = dsf.getDatasetFieldType(); String solrFieldSearchable = dsfType.getSolrField().getNameSearchable(); String solrFieldFacetable = dsfType.getSolrField().getNameFacetable(); if (dsf.getValues() != null && !dsf.getValues().isEmpty() && dsf.getValues().get(0) != null && solrFieldSearchable != null) { logger.fine("indexing " + dsf.getDatasetFieldType().getName() + ":" + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable); // if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER)) { if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.EMAIL)) { //no-op. we want to keep email address out of Solr per https://github.com/IQSS/dataverse/issues/759 } else if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.DATE)) { String dateAsString = dsf.getValues().get(0); logger.fine("date as string: " + dateAsString); if (dateAsString != null && !dateAsString.isEmpty()) { SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH); try { /** * @todo when bean validation is working we * won't have to convert strings into dates */ logger.fine("Trying to convert " + dateAsString + " to a YYYY date from dataset " + dataset.getId()); Date dateAsDate = inputDateyyyy.parse(dateAsString); SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy"); String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate); logger.fine("YYYY only: " + datasetFieldFlaggedAsDate); // solrInputDocument.addField(solrFieldSearchable, Integer.parseInt(datasetFieldFlaggedAsDate)); solrInputDocument.addField(solrFieldSearchable, datasetFieldFlaggedAsDate); if (dsfType.getSolrField().isFacetable()) { // solrInputDocument.addField(solrFieldFacetable, Integer.parseInt(datasetFieldFlaggedAsDate)); solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate); } } catch (Exception ex) { logger.info("unable to convert " + dateAsString + " into YYYY format and couldn't index it (" + dsfType.getName() + ")"); } } } else { // _s (dynamic string) and all other Solr fields if (dsf.getDatasetFieldType().getName().equals("authorAffiliation")) { /** * @todo think about how to tie the fact that this * needs to be multivalued (_ss) because a * multivalued facet (authorAffilition_ss) is being * collapsed into here at index time. The business * logic to determine if a data-driven metadata * field should be indexed into Solr as a single or * multiple value lives in the getSolrField() method * of DatasetField.java */ solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValuesWithoutNaValues()); } else if (dsf.getDatasetFieldType().getName().equals("title")) { // datasets have titles not names but index title under name as well so we can sort datasets by name along dataverses and files List<String> possibleTitles = dsf.getValues(); String firstTitle = possibleTitles.get(0); if (firstTitle != null) { parentDatasetTitle = firstTitle; } solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues()); } if (dsfType.isControlledVocabulary()) { for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) { if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) { continue; } solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue()); if (dsfType.getSolrField().isFacetable()) { solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue()); } } } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) { // strip HTML List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValuesWithoutNaValues()); solrInputDocument.addField(solrFieldSearchable, htmlFreeText); if (dsfType.getSolrField().isFacetable()) { solrInputDocument.addField(solrFieldFacetable, htmlFreeText); } } else { // do not strip HTML solrInputDocument.addField(solrFieldSearchable, dsf.getValuesWithoutNaValues()); if (dsfType.getSolrField().isFacetable()) { if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.topicClassValue)) { String topicClassificationTerm = getTopicClassificationTermOrTermAndVocabulary(dsf); if (topicClassificationTerm != null) { logger.fine(solrFieldFacetable + " gets " + topicClassificationTerm); solrInputDocument.addField(solrFieldFacetable, topicClassificationTerm); } } else { solrInputDocument.addField(solrFieldFacetable, dsf.getValuesWithoutNaValues()); } } } } } } } solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataset.getOwner().getName()); solrInputDocument.addField(SearchFields.PARENT_ID, dataset.getOwner().getId()); solrInputDocument.addField(SearchFields.PARENT_NAME, dataset.getOwner().getName()); if (state.equals(indexableDataset.getDatasetState().DEACCESSIONED)) { String deaccessionNote = datasetVersion.getVersionNote(); if (deaccessionNote != null) { solrInputDocument.addField(SearchFields.DATASET_DEACCESSION_REASON, deaccessionNote); } } docs.add(solrInputDocument); List<String> filesIndexed = new ArrayList<>(); if (datasetVersion != null) { List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas(); boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { checkForDuplicateMetadata = true; logger.fine("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } for (FileMetadata fileMetadata : fileMetadatas) { boolean indexThisMetadata = true; if (checkForDuplicateMetadata) { logger.fine("Checking if this file metadata is a duplicate."); for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) { if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) { if (fileMetadata.contentEquals(releasedFileMetadata)) { indexThisMetadata = false; logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } else { logger.fine("This file metadata has changed since the released version; we want to index it!"); } break; } } } if (indexThisMetadata) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); Long fileEntityId = fileMetadata.getDataFile().getId(); datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId); datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId); datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); String filenameCompleteFinal = ""; if (fileMetadata != null) { String filenameComplete = fileMetadata.getLabel(); if (filenameComplete != null) { String filenameWithoutExtension = ""; // String extension = ""; int i = filenameComplete.lastIndexOf('.'); if (i > 0) { // extension = filenameComplete.substring(i + 1); try { filenameWithoutExtension = filenameComplete.substring(0, i); datafileSolrInputDocument.addField(SearchFields.FILENAME_WITHOUT_EXTENSION, filenameWithoutExtension); datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameWithoutExtension); } catch (IndexOutOfBoundsException ex) { filenameWithoutExtension = ""; } } else { logger.fine("problem with filename '" + filenameComplete + "': no extension? empty string as filename?"); filenameWithoutExtension = filenameComplete; } filenameCompleteFinal = filenameComplete; } for (String tag : fileMetadata.getCategoriesByName()) { datafileSolrInputDocument.addField(SearchFields.FILE_TAG, tag); datafileSolrInputDocument.addField(SearchFields.FILE_TAG_SEARCHABLE, tag); } } datafileSolrInputDocument.addField(SearchFields.NAME, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); /** * for rules on sorting files see * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing * via https://redmine.hmdc.harvard.edu/issues/3701 */ Date fileSortByDate = new Date(); DataFile datafile = fileMetadata.getDataFile(); if (datafile != null) { boolean fileHasBeenReleased = datafile.isReleased(); if (fileHasBeenReleased) { logger.fine("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"); Timestamp filePublicationTimestamp = datafile.getPublicationDate(); if (filePublicationTimestamp != null) { fileSortByDate = filePublicationTimestamp; } else { String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"; logger.info(msg); } datafileSolrInputDocument.addField(SearchFields.ACCESS, datafile.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC); } else { logger.fine("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"); Timestamp fileCreateTimestamp = datafile.getCreateDate(); if (fileCreateTimestamp != null) { fileSortByDate = fileCreateTimestamp; } else { String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"; logger.info(msg); } datafileSolrInputDocument.addField(SearchFields.ACCESS, fileMetadata.isRestricted() ? SearchConstants.RESTRICTED : SearchConstants.PUBLIC); } if (datafile.isHarvested()) { datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, true); datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, HARVESTED); } else { datafileSolrInputDocument.addField(SearchFields.IS_HARVESTED, false); datafileSolrInputDocument.addField(SearchFields.METADATA_SOURCE, findRootDataverseCached().getName() + " " + BundleUtil.getStringFromBundle("dataverse")); } } if (fileSortByDate == null) { if (datasetSortByDate != null) { logger.info("fileSortByDate was null, assigning datasetSortByDate"); fileSortByDate = datasetSortByDate; } else { logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'"); fileSortByDate = new Date(); } } datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate); datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(fileSortByDate)); if (majorVersionReleaseDate == null && !datafile.isHarvested()) { datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); } if (datasetVersion.isInReview()) { datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, IN_REVIEW_STRING); } String fileSolrDocId = solrDocIdentifierFile + fileEntityId; if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) { fileSolrDocId = solrDocIdentifierFile + fileEntityId; datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString); addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset); } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) { fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix(); datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); } datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId); datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType()); datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType()); datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType()); // For the file type facets, we have a property file that maps mime types // to facet-friendly names; "application/fits" should become "FITS", etc.: datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(fileMetadata.getDataFile())); datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(fileMetadata.getDataFile())); datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize()); if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) { /** * @todo Someday we should probably deprecate this * FILE_MD5 in favor of a combination of * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE. */ datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue()); } datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString()); datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue()); datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf()); datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataFile.getOwner().getOwner().getName()); // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getDataset().getTitle()); datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId()); datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId()); datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation()); datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); // If this is a tabular data file -- i.e., if there are data // variables associated with this file, we index the variable // names and labels: if (fileMetadata.getDataFile().isTabularData()) { List<DataVariable> variables = fileMetadata.getDataFile().getDataTable().getDataVariables(); for (DataVariable var : variables) { // Hard-coded search fields, for now: // TODO: eventually: review, decide how datavariables should // be handled for indexing purposes. (should it be a fixed // setup, defined in the code? should it be flexible? unlikely // that this needs to be domain-specific... since these data // variables are quite specific to tabular data, which in turn // is something social science-specific... // anyway -- needs to be reviewed. -- L.A. 4.0alpha1 if (var.getName() != null && !var.getName().equals("")) { datafileSolrInputDocument.addField(SearchFields.VARIABLE_NAME, var.getName()); } if (var.getLabel() != null && !var.getLabel().equals("")) { datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, var.getLabel()); } } // TABULAR DATA TAGS: // (not to be confused with the file categories, indexed above!) for (DataFileTag tag : fileMetadata.getDataFile().getTags()) { String tagLabel = tag.getTypeLabel(); datafileSolrInputDocument.addField(SearchFields.TABDATA_TAG, tagLabel); } } if (indexableDataset.isFilesShouldBeIndexed()) { filesIndexed.add(fileSolrDocId); docs.add(datafileSolrInputDocument); } } } } try { solrServer.add(docs); } catch (SolrServerException | IOException ex) { return ex.toString(); } try { solrServer.commit(); } catch (SolrServerException | IOException ex) { return ex.toString(); } dvObjectService.updateContentIndexTime(dataset); // return "indexed dataset " + dataset.getId() + " as " + solrDocId + "\nindexFilesResults for " + solrDocId + ":" + fileInfo.toString(); return "indexed dataset " + dataset.getId() + " as " + datasetSolrDocId + ". filesIndexed: " + filesIndexed; } /** * If the "Topic Classification" has a "Vocabulary", return both the "Term" * and the "Vocabulary" with the latter in parentheses. For example, the * Murray Research Archive uses "1 (Generations)" and "yes (Follow-up * permitted)". */ private String getTopicClassificationTermOrTermAndVocabulary(DatasetField topicClassDatasetField) { String finalValue = null; String topicClassVocab = null; String topicClassValue = null; for (DatasetField sibling : topicClassDatasetField.getParentDatasetFieldCompoundValue().getChildDatasetFields()) { DatasetFieldType datasetFieldType = sibling.getDatasetFieldType(); String name = datasetFieldType.getName(); if (name.equals(DatasetFieldConstant.topicClassVocab)) { topicClassVocab = sibling.getDisplayValue(); } else if (name.equals(DatasetFieldConstant.topicClassValue)) { topicClassValue = sibling.getDisplayValue(); } if (topicClassValue != null) { if (topicClassVocab != null) { finalValue = topicClassValue + " (" + topicClassVocab + ")"; } else { finalValue = topicClassValue; } } } return finalValue; } public List<String> findPathSegments(Dataverse dataverse, List<String> segments) { Dataverse rootDataverse = findRootDataverseCached(); if (!dataverse.equals(rootDataverse)) { // important when creating root dataverse if (dataverse.getOwner() != null) { findPathSegments(dataverse.getOwner(), segments); } segments.add(dataverse.getId().toString()); return segments; } else { // base case return segments; } } List<String> getDataversePathsFromSegments(List<String> dataversePathSegments) { List<String> subtrees = new ArrayList<>(); for (int i = 0; i < dataversePathSegments.size(); i++) { StringBuilder pathBuilder = new StringBuilder(); int numSegments = dataversePathSegments.size(); for (int j = 0; j < numSegments; j++) { if (j <= i) { pathBuilder.append("/" + dataversePathSegments.get(j)); } } subtrees.add(pathBuilder.toString()); } return subtrees; } private void addDataverseReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataverse dataverse) { if (dataverse.getPublicationDate() != null) { Calendar calendar = Calendar.getInstance(); calendar.setTimeInMillis(dataverse.getPublicationDate().getTime()); int YYYY = calendar.get(Calendar.YEAR); solrInputDocument.addField(SearchFields.PUBLICATION_DATE, YYYY); } } private void addDatasetReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataset dataset) { if (dataset.getPublicationDate() != null) { Calendar calendar = Calendar.getInstance(); calendar.setTimeInMillis(dataset.getPublicationDate().getTime()); int YYYY = calendar.get(Calendar.YEAR); solrInputDocument.addField(SearchFields.PUBLICATION_DATE, YYYY); solrInputDocument.addField(SearchFields.DATASET_PUBLICATION_DATE, YYYY); } } public static String getGroupPrefix() { return groupPrefix; } public static String getGroupPerUserPrefix() { return groupPerUserPrefix; } public static String getPublicGroupString() { return publicGroupString; } public static String getPUBLISHED_STRING() { return PUBLISHED_STRING; } public static String getUNPUBLISHED_STRING() { return UNPUBLISHED_STRING; } public static String getDRAFT_STRING() { return DRAFT_STRING; } public static String getIN_REVIEW_STRING() { return IN_REVIEW_STRING; } public static String getDEACCESSIONED_STRING() { return DEACCESSIONED_STRING; } public String delete(Dataverse doomed) { logger.fine("deleting Solr document for dataverse " + doomed.getId()); UpdateResponse updateResponse; try { updateResponse = solrServer.deleteById(solrDocIdentifierDataverse + doomed.getId()); } catch (SolrServerException | IOException ex) { return ex.toString(); } try { solrServer.commit(); } catch (SolrServerException | IOException ex) { return ex.toString(); } String response = "Successfully deleted dataverse " + doomed.getId() + " from Solr index. updateReponse was: " + updateResponse.toString(); logger.fine(response); return response; } /** * @todo call this in fewer places, favoring * SolrIndexServiceBeans.deleteMultipleSolrIds instead to operate in batches * * https://github.com/IQSS/dataverse/issues/142 */ public String removeSolrDocFromIndex(String doomed) { logger.fine("deleting Solr document: " + doomed); UpdateResponse updateResponse; try { updateResponse = solrServer.deleteById(doomed); } catch (SolrServerException | IOException ex) { return ex.toString(); } try { solrServer.commit(); } catch (SolrServerException | IOException ex) { return ex.toString(); } String response = "Attempted to delete " + doomed + " from Solr index. updateReponse was: " + updateResponse.toString(); logger.fine(response); return response; } public String convertToFriendlyDate(Date dateAsDate) { if (dateAsDate == null) { dateAsDate = new Date(); } // using DateFormat.MEDIUM for May 5, 2014 to match what's in DVN 3.x DateFormat format = DateFormat.getDateInstance(DateFormat.MEDIUM); String friendlyDate = format.format(dateAsDate); return friendlyDate; } private List<String> findSolrDocIdsForDraftFilesToDelete(Dataset datasetWithDraftFilesToDelete) { List<String> solrIdsOfFilesToDelete = new ArrayList<>(); for (DatasetVersion datasetVersion : datasetWithDraftFilesToDelete.getVersions()) { for (FileMetadata fileMetadata : datasetVersion.getFileMetadatas()) { DataFile datafile = fileMetadata.getDataFile(); if (datafile != null) { solrIdsOfFilesToDelete.add(solrDocIdentifierFile + datafile.getId() + draftSuffix); } } } return solrIdsOfFilesToDelete; } private List<String> findSolrDocIdsForFilesToDelete(Dataset dataset, IndexableDataset.DatasetState state) { List<String> solrIdsOfFilesToDelete = new ArrayList<>(); for (DataFile file : dataset.getFiles()) { solrIdsOfFilesToDelete.add(solrDocIdentifierFile + file.getId() + state.getSuffix()); } return solrIdsOfFilesToDelete; } private String removeMultipleSolrDocs(List<String> docIds) { IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(docIds); return indexResponse.toString(); } private String determinePublishedDatasetSolrDocId(Dataset dataset) { return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.PUBLISHED.getSuffix(); } private String determineDeaccessionedDatasetId(Dataset dataset) { return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix(); } private String removeDeaccessioned(Dataset dataset) { StringBuilder result = new StringBuilder(); String deleteDeaccessionedResult = removeSolrDocFromIndex(determineDeaccessionedDatasetId(dataset)); result.append(deleteDeaccessionedResult); List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.DEACCESSIONED); String deleteFilesResult = removeMultipleSolrDocs(docIds); result.append(deleteFilesResult); return result.toString(); } private String removePublished(Dataset dataset) { StringBuilder result = new StringBuilder(); String deletePublishedResult = removeSolrDocFromIndex(determinePublishedDatasetSolrDocId(dataset)); result.append(deletePublishedResult); List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.PUBLISHED); String deleteFilesResult = removeMultipleSolrDocs(docIds); result.append(deleteFilesResult); return result.toString(); } private Dataverse findRootDataverseCached() { if (true) { /** * @todo Is the code below working at all? We don't want the root * dataverse to be indexed into Solr. Specifically, we don't want a * dataverse "card" to show up while browsing. * * Let's just find the root dataverse and be done with it. We'll * figure out the caching later. */ try { Dataverse rootDataverse = dataverseService.findRootDataverse(); return rootDataverse; } catch (EJBException ex) { logger.info("caught " + ex); Throwable cause = ex.getCause(); while (cause.getCause() != null) { logger.info("caused by... " + cause); cause = cause.getCause(); } return null; } } /** * @todo Why isn't this code working? */ if (rootDataverseCached != null) { return rootDataverseCached; } else { rootDataverseCached = dataverseService.findRootDataverse(); if (rootDataverseCached != null) { return rootDataverseCached; } else { throw new RuntimeException("unable to determine root dataverse"); } } } private String getDesiredCardState(Map<DatasetVersion.VersionState, Boolean> desiredCards) { /** * @todo make a JVM option to enforce sanity checks? Call it dev=true? */ boolean sanityCheck = true; if (sanityCheck) { Set<DatasetVersion.VersionState> expected = new HashSet<>(); expected.add(DatasetVersion.VersionState.DRAFT); expected.add(DatasetVersion.VersionState.RELEASED); expected.add(DatasetVersion.VersionState.DEACCESSIONED); if (!desiredCards.keySet().equals(expected)) { throw new RuntimeException("Mismatch between expected version states (" + expected + ") and version states passed in (" + desiredCards.keySet() + ")"); } } return "Desired state for existence of cards: " + desiredCards + "\n"; } /** * @return Dataverses that should be reindexed either because they have * never been indexed or their index time is before their modification time. */ public List<Dataverse> findStaleOrMissingDataverses() { List<Dataverse> staleDataverses = new ArrayList<>(); for (Dataverse dataverse : dataverseService.findAll()) { if (dataverse.equals(dataverseService.findRootDataverse())) { continue; } if (stale(dataverse)) { staleDataverses.add(dataverse); } } return staleDataverses; } /** * @return Datasets that should be reindexed either because they have never * been indexed or their index time is before their modification time. */ public List<Dataset> findStaleOrMissingDatasets() { List<Dataset> staleDatasets = new ArrayList<>(); for (Dataset dataset : datasetService.findAll()) { if (stale(dataset)) { staleDatasets.add(dataset); } } return staleDatasets; } private boolean stale(DvObject dvObject) { Timestamp indexTime = dvObject.getIndexTime(); Timestamp modificationTime = dvObject.getModificationTime(); if (indexTime == null) { return true; } else if (indexTime.before(modificationTime)) { return true; } return false; } public List<Long> findDataversesInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically */ return findDvObjectInSolrOnly("dataverses"); } catch (SearchException ex) { throw ex; } } public List<Long> findDatasetsInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically */ return findDvObjectInSolrOnly("datasets"); } catch (SearchException ex) { throw ex; } } public List<Long> findFilesInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically */ return findDvObjectInSolrOnly("files"); } catch (SearchException ex) { throw ex; } } private List<Long> findDvObjectInSolrOnly(String type) throws SearchException { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery("*"); solrQuery.setRows(Integer.MAX_VALUE); solrQuery.addFilterQuery(SearchFields.TYPE + ":" + type); List<Long> dvObjectInSolrOnly = new ArrayList<>(); QueryResponse queryResponse = null; try { queryResponse = solrServer.query(solrQuery); } catch (SolrServerException ex) { throw new SearchException("Error searching Solr for " + type, ex); } SolrDocumentList results = queryResponse.getResults(); for (SolrDocument solrDocument : results) { Object idObject = solrDocument.getFieldValue(SearchFields.ENTITY_ID); if (idObject != null) { try { long id = (Long) idObject; DvObject dvobject = dvObjectService.findDvObject(id); if (dvobject == null) { dvObjectInSolrOnly.add(id); } } catch (ClassCastException ex) { throw new SearchException("Found " + SearchFields.ENTITY_ID + " but error casting " + idObject + " to long", ex); } } } return dvObjectInSolrOnly; } private List<String> findFilesOfParentDataset(long parentDatasetId) throws SearchException { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery("*"); solrQuery.setRows(Integer.MAX_VALUE); solrQuery.addFilterQuery(SearchFields.PARENT_ID + ":" + parentDatasetId); /** * @todo "files" should be a constant */ solrQuery.addFilterQuery(SearchFields.TYPE + ":" + "files"); List<String> dvObjectInSolrOnly = new ArrayList<>(); QueryResponse queryResponse = null; try { queryResponse = solrServer.query(solrQuery); } catch (SolrServerException ex) { throw new SearchException("Error searching Solr for dataset parent id " + parentDatasetId, ex); } SolrDocumentList results = queryResponse.getResults(); for (SolrDocument solrDocument : results) { Object idObject = solrDocument.getFieldValue(SearchFields.ID); if (idObject != null) { String id = (String) idObject; dvObjectInSolrOnly.add(id); } } return dvObjectInSolrOnly; } // This is a convenience method for deleting all the SOLR documents // (Datasets and DataFiles) harvested by a specific HarvestingClient. // The delete logic is a bit simpler, than when deleting "real", local // datasets and files - for example, harvested datasets are never Drafts, etc. // We are also less concerned with the diagnostics; if any of it fails, // we don't need to treat it as a fatal condition. public void deleteHarvestedDocuments(HarvestingClient harvestingClient) { List<String> solrIdsOfDatasetsToDelete = new ArrayList<>(); // I am going to make multiple solrIndexService.deleteMultipleSolrIds() calls; // one call for the list of datafiles in each dataset; then one more call to // delete all the dataset documents. // I'm *assuming* this is safer than to try and make one complete list of // all the documents (datasets and datafiles), and then attempt to delete // them all at once... (is there a limit??) The list can be huge - if the // harvested archive is on the scale of Odum or ICPSR, with thousands of // datasets and tens of thousands of files. // for (Dataset harvestedDataset : harvestingClient.getHarvestedDatasets()) { solrIdsOfDatasetsToDelete.add(solrDocIdentifierDataset + harvestedDataset.getId()); List<String> solrIdsOfDatafilesToDelete = new ArrayList<>(); for (DataFile datafile : harvestedDataset.getFiles()) { solrIdsOfDatafilesToDelete.add(solrDocIdentifierFile + datafile.getId()); } logger.fine("attempting to delete the following datafiles from the index: " + StringUtils.join(solrIdsOfDatafilesToDelete, ",")); IndexResponse resultOfAttemptToDeleteFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfDatafilesToDelete); logger.fine("result of an attempted delete of the harvested files associated with the dataset "+harvestedDataset.getId()+": "+resultOfAttemptToDeleteFiles); } logger.fine("attempting to delete the following datasets from the index: " + StringUtils.join(solrIdsOfDatasetsToDelete, ",")); IndexResponse resultOfAttemptToDeleteDatasets = solrIndexService.deleteMultipleSolrIds(solrIdsOfDatasetsToDelete); logger.fine("result of attempt to delete harvested datasets associated with the client: " + resultOfAttemptToDeleteDatasets + "\n"); } // Another convenience method, for deleting all the SOLR documents (dataset_ // and datafile_s) associated with a harveste dataset. The comments for the // method above apply here too. public void deleteHarvestedDocuments(Dataset harvestedDataset) { List<String> solrIdsOfDocumentsToDelete = new ArrayList<>(); solrIdsOfDocumentsToDelete.add(solrDocIdentifierDataset + harvestedDataset.getId()); for (DataFile datafile : harvestedDataset.getFiles()) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + datafile.getId()); } logger.fine("attempting to delete the following documents from the index: " + StringUtils.join(solrIdsOfDocumentsToDelete, ",")); IndexResponse resultOfAttemptToDeleteDocuments = solrIndexService.deleteMultipleSolrIds(solrIdsOfDocumentsToDelete); logger.fine("result of attempt to delete harvested documents: " + resultOfAttemptToDeleteDocuments + "\n"); } }