package edu.harvard.iq.dataverse.search;
/**
* We define Solr search fields here in one central place so they can be used
* throughout the code but renamed here if need be.
*
* Note that there are many fields in Solr that are *not* here because their
* values come from the database. For example "authorName" comes from the
* database. We update the Solr schema.xml file by merging the output of `curl
* http://localhost:8080/api/admin/index/solr/schema` into the file in the
* source tree when a metadata block update warrants it.
*
* Generally speaking, we want the search fields to be readable. This is a
* challenge for long field names but a power user should be able to type
* "authorAffiliation:Harvard" into the general search box. A regular user is
* much more likely to used Advanced Search to populate that field
* automatically.
*
* Originally, these fields were all snake_case but since the dynamic fields are
* camelCase we might want to standardize on that.
*
* You'll notice that dynamic fields like this are used...
*
* - _s (string)
*
* - _ss (multivalued string)
*
* - _l (long)
*
* - _dt (datetime)
*
* ... and these endings should not be changed unless you plan to convert them
* to non-dynamic (by removing the ending) and specify their "type" in the Solr
* schema.xml.
*
* Most fields we want to be searchable but some are stored with indexed=false
* because we *don't* want them to be searchable and we're just using Solr as a
* convenient key/value store. Why go to the database if you don't have to? For
* a string here or there that needs to be available to both the GUI and the
* Search API, we can just store them in Solr.
*
* For faceting we use a "string" type. If you use something like "text_general"
* the field is tokenized ("Foo Bar" becomes "foo" "bar" which is not what we
* want). See also
* http://stackoverflow.com/questions/16559911/facet-query-will-give-wrong-output-on-dynamicfield-in-solr
*/
public class SearchFields {
/**
* @todo: consider making various dynamic fields (_s) static in schema.xml
* instead. Should they be stored in the database?
*/
// standard fields from example/solr/collection1/conf/schema.xml
// (but we are getting away from these...)
public static final String ID = "id";
/**
* Determine which DvObjects you might want to target for reindexing after
* an upgrade such as between Dataverse 4.2 and 4.3.
*/
public static final String DATAVERSE_VERSION_INDEXED_BY = "dataverseVersionIndexedBy_s";
public static final String NAME = "name";
/**
* @todo Do we want to support finding dataverses, datasets, and files with
* a query for description:foo? Maybe not, since people will probably just
* use basic search for this. They could also use "dvDescription:foo OR
* dsDescription:foo OR fileDescription:foo" if they *really* only want to
* target the description of all three types at once.
*
* See also https://redmine.hmdc.harvard.edu/issues/3745
*/
public static final String DESCRIPTION = "description";
/**
* Identifiers differ per DvObject: alias for dataverses, globalId for
* datasets, and database id for files.
*/
public static final String IDENTIFIER = "identifier";
/**
* Visible in the GUI as a facet to click: "Harvested" vs. "Root Dataverse".
*/
public static final String METADATA_SOURCE = "metadataSource";
/**
* Internal boolean used when creating OAI sets, for example.
*/
public static final String IS_HARVESTED = "isHarvested";
/**
* Such as http://dx.doi.org/10.5072/FK2/HXI35W
*
* For files, the URL will be the parent dataset.
*/
public static final String PERSISTENT_URL = "persistentUrl";
public static final String UNF = "unf";
public static final String DATAVERSE_NAME = "dvName";
public static final String DATAVERSE_AFFILIATION = "dvAffiliation";
public static final String DATAVERSE_DESCRIPTION = "dvDescription";
public static final String DATAVERSE_CATEGORY = "dvCategory";
/**
* What is dvSubject_en for? How does it get populated into Solr? The
* behavior changed so that now the subjects of dataverses are based on
* their datasets. Should this be a string so we can facet on it more
* properly? Should all checkboxes on the advanced search page (controlled
* vocabularies) be backed by a string? When we rename this to "foobar" (a
* field Solr doesn't know about) why doesn't Solr complain when we "index
* all"? See also https://github.com/IQSS/dataverse/issues/1681
*/
public static final String DATAVERSE_SUBJECT = "dvSubject";
/**
* A "collapsed" facet (e.g. applies to both dataverses and datasets and is
* merged as a single facet in the GUI) like affiliation that needs to match
* the corresponding dynamic "facet" Solr field at the dataset level to work
* properly. Should we use/expose "_ss" when you click a facet? It needs to
* be different from "subject" which is used for general search but maybe we
* could have a convention like "subjectFacet" for the facets?
*/
public static final String SUBJECT = "subject_ss";
/**
* @todo think about how to tie the fact that this needs to be multivalued
* (_ss) because a multivalued facet (authorAffilition_ss) will be collapsed
* into it at index time. The business logic to determine if a data-driven
* metadata field should be indexed into Solr as a single or multiple value
* lives in the getSolrField() method of DatasetField.java
*
* AFFILIATION is used for the "collapsed" "Affiliation" facet that means
* either "Author Affiliation" or dataverse affiliation. It needs to be a
* string so we can facet on it and it needs to be multivalued because
* "Author Affiliation" can be multivalued.
*/
public static final String AFFILIATION = "affiliation_ss";
public static final String FILE_NAME = "fileName";
public static final String FILE_DESCRIPTION = "fileDescription";
/**
* Can be multivalued and includes both "friendly" and "group" versions:
* "PNG Image", "image"
*/
public static final String FILE_TYPE_SEARCHABLE = "fileType";
/**
* @todo Thie static variable not named properly. We want to expose an
* acutal MIME Type in https://github.com/IQSS/dataverse/issues/1595 . See
* also cleanup ticket at https://github.com/IQSS/dataverse/issues/1314
*
* i.e. "PNG Image"
*/
public static final String FILE_TYPE_FRIENDLY = "fileTypeDisplay";
public static final String FILE_CONTENT_TYPE = "fileContentType";
/**
* Used as a facet for file groups like "image" or "document"
*/
public static final String FILE_TYPE = "fileTypeGroupFacet";
public static final String FILE_SIZE_IN_BYTES = "fileSizeInBytes";
public static final String FILE_MD5 = "fileMd5";
public static final String FILE_CHECKSUM_TYPE = "fileChecksumType";
public static final String FILE_CHECKSUM_VALUE = "fileChecksumValue";
public static final String FILENAME_WITHOUT_EXTENSION = "fileNameWithoutExtension";
/**
* Indexed as a string so we can facet on it.
*/
public static final String FILE_TAG = "fileTag";
/**
* Indexed as text_en so it's searchable by lower case etc.
*/
public static final String FILE_TAG_SEARCHABLE = "fileTags";
/*
* (tabular) Data Tags are indexed as a string, since we are only planning to
* use these in facet-like, exact searches:
*/
public static final String TABDATA_TAG = "tabularDataTag";
public static final String ACCESS = "fileAccess";
public static final String SUBTREE = "subtreePaths";
// i.e. http://localhost:8080/search.xhtml?q=*&fq0=citationdate_dt:[2008-01-01T00%3A00%3A00Z+TO+2011-01-01T00%3A00%3A00Z%2B1YEAR}
// public static final String PRODUCTION_DATE_ORIGINAL = DatasetFieldConstant.productionDate + "_dt";
// public static final String PRODUCTION_DATE_YEAR_ONLY = DatasetFieldConstant.productionDate + "_i";
// public static final String DISTRIBUTION_DATE_ORIGINAL = DatasetFieldConstant.distributionDate + "_dt";
// public static final String DISTRIBUTION_DATE_YEAR_ONLY = DatasetFieldConstant.distributionDate + "_i";
/**
* Solr refers to "relevance" as "score"
*/
public static final String RELEVANCE = "score";
/**
* A dataverse, a dataset, or a file.
*/
public static final String TYPE = "dvObjectType";
public static final String NAME_SORT = "nameSort";
public static final String PUBLICATION_DATE = "publicationDate";
public static final String RELEASE_OR_CREATE_DATE = "dateSort";
/**
* i.e. "Mar 17, 2015"
*/
public static final String RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT = "dateFriendly";
public static final String DEFINITION_POINT = "definitionPointDocId";
public static final String DEFINITION_POINT_DVOBJECT_ID = "definitionPointDvObjectId";
public static final String DISCOVERABLE_BY = "discoverableBy";
/**
* i.e. "Unpublished", "Draft" (multivalued)
*/
public static final String PUBLICATION_STATUS = "publicationStatus";
/**
* @todo reconcile different with Solr schema.xml where type is Long rather
* than String.
*/
public static final String ENTITY_ID = "entityId";
public static final String PARENT_NAME = "parentName";
public static final String PARENT_ID = "parentId";
public static final String PARENT_IDENTIFIER = "parentIdentifier";
/**
* @todo Should we add a "parentCitationHtml" field now or wait for demand
* for it?
*/
public static final String PARENT_CITATION = "parentCitation";
public static final String DATASET_DESCRIPTION = "dsDescriptionValue";
/**
* In Datavese 4.3 and earlier "citation" was indexed as the "online" or
* HTML version, with the DOI link wrapped in an href tag but now it's the
* plaintext version and anyone who was depending on the old version can
* switch to the new "citationHTML" field.
*/
public static final String DATASET_CITATION = "citation";
public static final String DATASET_CITATION_HTML = "citationHtml";
public static final String DATASET_DEACCESSION_REASON = "deaccessionReason";
/**
* In contrast to PUBLICATION_DATE, this field applies only to datasets for
* more targeted results for just datasets. The format is YYYY (i.e.
* "2015").
*/
public static final String DATASET_PUBLICATION_DATE = "dsPublicationDate";
public static final String DATASET_PERSISTENT_ID = "dsPersistentId";
public static final String DATASET_VERSION_ID = "datasetVersionId";
public static final String VARIABLE_NAME = "variableName";
public static final String VARIABLE_LABEL = "variableLabel";
}