package org.solrmarc.index;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.solrmarc.index.indexer.AbstractValueIndexer;
import org.solrmarc.index.indexer.ValueIndexerFactory;
import org.solrmarc.index.mapping.AbstractMultiValueMapping;
import org.solrmarc.tools.DataUtil;
import org.solrmarc.tools.Utils;
/**
* class SolrIndexer
*
* This class exists solely for backwards compatibility purposes. The intention is that if a previous custom function
* was being used, one that provides the same functionality can be found here. Furthermore if there were any helper functions
* that could have been used to create your own custom indexing functions those helper functions should be found here as well.
*
* In most cases the methods found here are merely shims to translate the desired method to use the newer functionality that
* is now available.
*
*
* @author rh9ec
*
*/
@Deprecated
public class SolrIndexerShim
{
//private Map<String, AbstractValueIndexer<?>> indexerCache = new ConcurrentHashMap<String, AbstractValueIndexer<?>>();
private ThreadLocal<Map<String, AbstractValueIndexer<?>>> indexerCache =
new ThreadLocal<Map<String, AbstractValueIndexer<?>>>()
{
@Override
protected Map<String, AbstractValueIndexer<?>> initialValue()
{
return new HashMap<>();
}
};
/** map of translation maps. keys are names of translation maps;
* values are the translation maps (hence, it's a map of maps) */
//private Map<String, Object> transMapMap = new HashMap<String, Object>();
private ConcurrentMap<String, Object> transMapMap = new ConcurrentHashMap<String, Object>();
private SolrIndexerShim()
{ /* private constructor */ }
private static SolrIndexerShim theSolrIndexer;
public static SolrIndexerShim instance()
{
if (theSolrIndexer == null) theSolrIndexer = new SolrIndexerShim();
return(theSolrIndexer);
}
private AbstractValueIndexer<?> getOrCreateIndexerFullSpec(String fullSpec)
{
if (indexerCache.get().containsKey(fullSpec))
{
return(indexerCache.get().get(fullSpec));
}
else
{
AbstractValueIndexer<?> indexer;
synchronized (ValueIndexerFactory.instance())
{
indexer = ValueIndexerFactory.instance().createValueIndexer("", fullSpec);
indexerCache.get().put(fullSpec, indexer);
}
return(indexer);
}
}
private AbstractValueIndexer<?> getOrCreateIndexerMapped(String tagStr, String map)
{
String key = (map == null) ? tagStr : tagStr + ", " + map;
return getOrCreateIndexerFullSpec(key);
}
private AbstractValueIndexer<?> getOrCreateIndexer(String tagStr, String separator)
{
String key = (separator == null) ? tagStr : tagStr + ", join(\""+separator+"\")";
return getOrCreateIndexerFullSpec(key);
}
private AbstractValueIndexer<?> getOrCreateIndexer(String tagStr, int start, int end)
{
String key = (start == -1 && end == -1) ? tagStr : tagStr + "[" + start + "-" + end + "]";
return getOrCreateIndexerFullSpec(key);
}
/**
* Get <code>Collection</code> of Strings as indicated by tagStr. For each field
* spec in the tagStr that is NOT about bytes (i.e. not a 008[7-12] type fieldspec),
* the result string is the concatenation of all the specific subfields.
*
* @param record -
* the marc record object
* @param tagStr
* string containing which field(s)/subfield(s) to use. This is a
* series of: marc "tag" string (3 chars identifying a marc
* field, e.g. 245) optionally followed by characters identifying
* which subfields to use. Separator of colon indicates a
* separate value, rather than concatenation. 008[5-7] denotes
* bytes 5-7 of the 008 field (0 based counting) 100[a-cf-z]
* denotes the bracket pattern is a regular expression indicating
* which subfields to include. Note: if the characters in the
* brackets are digits, it will be interpreted as particular
* bytes, NOT a pattern. 100abcd denotes subfields a, b, c, d are
* desired.
* @param collector
* object in which to collect the data from the fields described by
* <code>tagStr</code>. A <code>Set</code> will automatically de-dupe
* values, a <code>List</code> will allow values to repeat.
* @throws Exception
*/
private void getFieldListCollector(Record record, AbstractValueIndexer<?> indexer, Collection<String> collector)
{
try
{
indexer.getFieldData(record, collector);
}
catch (Exception e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void getFieldListCollector(Record record, String tagStr, String mapStr, Collection<String> collector)
{
AbstractValueIndexer<?> indexer = getOrCreateIndexerMapped(tagStr, mapStr);
getFieldListCollector(record, indexer, collector);
}
/**
* Get Set of Strings as indicated by tagStr. For each field spec in the
* tagStr that is NOT about bytes (i.e. not a 008[7-12] type fieldspec), the
* result string is the concatenation of all the specific subfields.
*
* @param record -
* the marc record object
* @param tagStr
* string containing which field(s)/subfield(s) to use. This is a
* series of: marc "tag" string (3 chars identifying a marc
* field, e.g. 245) optionally followed by characters identifying
* which subfields to use. Separator of colon indicates a
* separate value, rather than concatenation. 008[5-7] denotes
* bytes 5-7 of the 008 field (0 based counting) 100[a-cf-z]
* denotes the bracket pattern is a regular expression indicating
* which subfields to include. Note: if the characters in the
* brackets are digits, it will be interpreted as particular
* bytes, NOT a pattern. 100abcd denotes subfields a, b, c, d are
* desired.
* @return the contents of the indicated marc field(s)/subfield(s), as a set
* of Strings.
* @throws Exception
*/
// public Set<String> getFieldList(Record record, String tagStr)
// {
// Set<String> result = new LinkedHashSet<String>();
// getFieldListCollector(record, tagStr, null, result);
// return result;
// }
public Set<String> getFieldList(Record record, String tagStr)
{
Set<String> result = new LinkedHashSet<String>();
getFieldListCollector(record, tagStr, null, result);
return result;
}
public Set<String> getMappedFieldList(Record record, String tagStr, String mapStr)
{
Set<String> result = new LinkedHashSet<String>();
getFieldListCollector(record, tagStr, mapStr, result);
return result;
}
/**
* Get <code>List</code> of Strings as indicated by tagStr. For each field spec in the
* tagStr that is NOT about bytes (i.e. not a 008[7-12] type fieldspec), the
* result string is the concatenation of all the specific subfields.
*
* @param record -
* the marc record object
* @param tagStr
* string containing which field(s)/subfield(s) to use. This is a
* series of: marc "tag" string (3 chars identifying a marc
* field, e.g. 245) optionally followed by characters identifying
* which subfields to use. Separator of colon indicates a
* separate value, rather than concatenation. 008[5-7] denotes
* bytes 5-7 of the 008 field (0 based counting) 100[a-cf-z]
* denotes the bracket pattern is a regular expression indicating
* which subfields to include. Note: if the characters in the
* brackets are digits, it will be interpreted as particular
* bytes, NOT a pattern. 100abcd denotes subfields a, b, c, d are
* desired.
* @return the contents of the indicated marc field(s)/subfield(s).
* @throws Exception
*/
public List<String> getFieldListAsList(Record record, String tagStr)
{
List<String> result = new ArrayList<String>();
getFieldListCollector(record, tagStr, null, result);
return result;
}
/**
* Get all field values specified by tagStr, joined as a single string.
* @param record - the marc record object
* @param tagStr string containing which field(s)/subfield(s) to use. This
* is a series of: marc "tag" string (3 chars identifying a marc field,
* e.g. 245) optionally followed by characters identifying which subfields
* to use.
* @param separator string separating values in the result string
* @return single string containing all values of the indicated marc
* field(s)/subfield(s) concatenated with separator string
*/
public String getFieldVals(Record record, String tagStr, String separator)
{
Set<String> result = getFieldList(record, tagStr);
return org.solrmarc.tools.Utils.join(result, separator);
}
/**
* Get the first value specified by the tagStr
* @param record - the marc record object
* @param tagStr string containing which field(s)/subfield(s) to use. This
* is a series of: marc "tag" string (3 chars identifying a marc field,
* e.g. 245) optionally followed by characters identifying which subfields
* to use.
* @return first value of the indicated marc field(s)/subfield(s) as a string
* @throws Exception
*/
public String getFirstFieldVal(Record record, String tagStr)
{
Set<String> result = getFieldList(record, tagStr);
Iterator<String> iter = result.iterator();
if (iter.hasNext())
return iter.next();
else
return null;
}
/**
* Get the first field value, which is mapped to another value. If there is
* no mapping for the value, use the mapping for the empty key, if it
* exists, o.w., use the mapping for the __DEFAULT key, if it exists.
* @param record - the marc record object
* @param mapName - name of translation map to use to xform values
* @param tagStr - which field(s)/subfield(s) to use
* @return first value as a string
*/
public String getFirstFieldVal(Record record, String mapName, String tagStr)
{
Set<String> result = getMappedFieldList(record, tagStr, mapName);
Iterator<String> iter = result.iterator();
return (iter.hasNext())? iter.next() : null;
}
public boolean isControlField(String fieldTag)
{
if (fieldTag.matches("00[0-9]"))
{
return (true);
}
return (false);
}
/**
* Get the specified subfields from the specified MARC field, returned as a
* set of strings to become lucene document field values
*
* @param record the MARC record object
* @param fldTag the field name, e.g. 245
* @param subfldsStr the string containing the desired subfields
* @param separator the separator string to insert between subfield items (if null, a " " will be
* used)
* @param collector an object to accumulate the data indicated by <code>fldTag</code> and
* <code>subfldsStr</code>.
*/
public void getSubfieldDataCollector(Record record, String fldTag, String subfldsStr,
String separator, Collection<String> collector)
{
AbstractValueIndexer<?> indexer = getOrCreateIndexer(fldTag+subfldsStr, separator);
getFieldListCollector(record, indexer, collector);
return;
}
/**
* Get the specified substring of subfield values from the specified MARC
* field, returned as a set of strings to become lucene document field values
* @param record - the marc record object
* @param fldTag - the field name, e.g. 008
* @param subfldStr - the string containing the desired subfields
* @param beginIx - the beginning index of the substring of the subfield value
* @param endIx - the ending index of the substring of the subfield value
* @param collector an object to accumulate the data indicated by <code>fldTag</code> and
* <code>subfldsStr</code>.
*/
public void getSubfieldDataCollector(Record record, String fldTag, String subfldStr,
int beginIx, int endIx, Collection<String> collector)
{
AbstractValueIndexer<?> indexer = getOrCreateIndexer(fldTag+subfldStr, beginIx, endIx);
getFieldListCollector(record, indexer, collector);
return;
}
/**
* Get the specified subfields from the specified MARC field, returned as a
* set of strings to become lucene document field values
*
* @param record the marc record object
* @param fldTag the field name, e.g. 245
* @param subfldsStr the string containing the desired subfields
* @param separator the separator string to insert between subfield items
* (if <code>null</code>, a " " will be used)
* @return a Set of String, where each string is the concatenated contents of all the
* desired subfield values from a single instance of the <code>fldTag</code>
*/
public Set<String> getSubfieldDataAsSet(Record record, String fldTag, String subfldsStr, String separator)
{
Set<String> result = new LinkedHashSet<String>();
getSubfieldDataCollector(record, fldTag, subfldsStr, separator, result);
return result;
}
/**
* Get the specified substring of subfield values from the specified MARC
* field, returned as a set of strings to become lucene document field values
* @param record the marc record object
* @param fldTag the field name, e.g. 008
* @param subfldStr the string containing the desired subfields
* @param beginIx the beginning index of the substring of the subfield value
* @param endIx the ending index of the substring of the subfield value
* @return the result set of strings
*/
public Set<String> getSubfieldDataAsSet(Record record, String fldTag, String subfldStr, int beginIx, int endIx)
{
Set<String> result = new LinkedHashSet<String>();
getSubfieldDataCollector(record, fldTag, subfldStr, beginIx, endIx, result);
return result;
}
/**
* remove trailing punctuation (default trailing characters to be removed)
* See org.solrmarc.tools.Utils.cleanData() for details on the
* punctuation removal
* @param record marc record object
* @param fieldSpec - the field to have trailing punctuation removed
* @return Set of strings containing the field values with trailing
* punctuation removed
*/
public Set<String> removeTrailingPunct(Record record, String fieldSpec)
{
Set<String> result = getFieldList(record, fieldSpec);
Set<String> newResult = new LinkedHashSet<String>();
for (String s : result)
{
newResult.add(DataUtil.cleanData(s));
}
return newResult;
}
/**
* Stub more advanced version of getDate that looks in the 008 field as well as the 260c field
* this routine does some simple sanity checking to ensure that the date to return makes sense.
* @param record - the marc record object
* @return 260c or 008[7-10] or 008[11-14], "cleaned" per org.solrmarc.tools.Utils.cleanDate()
*/
public String getPublicationDate(final Record record)
{
List<String> result = new ArrayList<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexerFullSpec("008[7-10]:008[11-14]:260c:264c?(ind2=1||ind2=4),clean, first, " +
"map(\"(^|.*[^0-9])((20|1[5-9])[0-9][0-9])([^0-9]|$)=>$2\",\".*[^0-9].*=>\")");
getFieldListCollector(record, indexer, result);
return (result.size() == 0) ? "" : result.iterator().next();
}
public Set<String> getFullTextUrls(Record record)
{
Set<String> result = new LinkedHashSet<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexer("{856uz3}?((ind1 = 4 || (ind1 = 7 & $x startsWith \"http\")) && (ind2 = 0 || (ind2 = 1 )))", "||");
getFieldListCollector(record, indexer, result);
return result;
}
public Set<String> getSupplUrls(Record record)
{
Set<String> result = new LinkedHashSet<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexer("{856uz3}?((ind1 = 4 || (ind1 = 7 & $x startsWith \"http\")) && (ind2 = 2 || (ind2 = 1)))", "||");
getFieldListCollector(record, indexer, result);
return result;
}
/**
* extract all the subfields requested in requested marc fields. Each
* instance of each marc field will be put in a separate result (but the
* subfields will be concatenated into a single value for each marc field)
*
* @param record
* marc record object
* @param fieldSpec -
* the desired marc fields and subfields as given in the
* xxx_index.properties file
* @param separator -
* the character to use between subfield values in the solr field
* contents
* @return Set of values (as strings) for solr field
*/
public Set<String> getAllSubfields(final Record record, String fieldSpec, String separator)
{
Set<String> result = new LinkedHashSet<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexer(fieldSpec, separator);
getFieldListCollector(record, indexer, result);
return result;
}
/**
* extract all the subfields requested in requested marc fields. Each
* instance of each marc field will be put in a separate result (but the
* subfields will be concatenated into a single value for each marc field)
*
* @param record
* marc record object
* @param fieldSpec -
* the desired marc fields and subfields as given in the
* xxx_index.properties file
* @param separator -
* the character to use between subfield values in the solr field
* contents
* @return Set of values (as strings) for solr field
*/
public Set<String> getAllAlphaSubfields(final Record record, String fieldSpec, String firstAllJoin)
{
Set<String> result = new LinkedHashSet<String>();
String [] pieces = fieldSpec.split(":");
String fieldSpecWithAll = Utils.join(pieces, "[a-z]:") + "[a-z]" + ", " + firstAllJoin;
AbstractValueIndexer<?> indexer = getOrCreateIndexerFullSpec(fieldSpecWithAll);
getFieldListCollector(record, indexer, result);
return result;
}
/**
* For each occurrence of a marc field in the fieldSpec list, extract the
* contents of all subfields except the ones specified, concatenate the
* subfield contents with a space separator and add the string to the result
* set.
*
* @param record -
* the marc record
* @param fieldSpec -
* the marc fields (e.g. 600:655) in which we will grab the
* alphabetic subfield contents for the result set. The field may
* not be a control field (must be 010 or greater)
* @return a set of strings, where each string is the concatenated values of
* all the alphabetic subfields.
*/
public Set<String> getAllAlphaExcept(final Record record, String fieldSpec)
{
Set<String> result = new LinkedHashSet<String>();
String [] pieces = fieldSpec.split(":");
StringBuilder sb = new StringBuilder();
for (String piece : pieces)
{
sb.append(piece.substring(0, 3)).append("[^").append(piece.substring(3)).append("0-9]:");
}
sb.setLength(sb.length()-1);
String fieldSpecWithAll = sb.toString();
AbstractValueIndexer<?> indexer = getOrCreateIndexerFullSpec(fieldSpecWithAll);
getFieldListCollector(record, indexer, result);
return result;
}
/**
* extract all the subfields requested in requested marc fields. Each
* instance of each marc field will be put in a separate result (but the
* subfields will be concatenated into a single value for each marc field)
*
* @param record
* marc record object
* @param fieldSpec -
* the desired marc fields and subfields as given in the
* xxx_index.properties file
* @param separator -
* the character to use between subfield values in the solr field
* contents
* @return Set of values (as strings) for solr field
*/
public List<String> getAllSubfieldsAsList(final Record record, String fieldSpec, String separator)
{
List<String> result = new ArrayList<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexer(fieldSpec, separator);
getFieldListCollector(record, indexer, result);
return result;
}
/**
* Loops through all datafields and creates a field for "all fields"
* searching. Shameless stolen from Vufind Indexer Custom Code
*
* @param record
* marc record object
* @param lowerBoundStr -
* the "lowest" marc field to include (e.g. 100). defaults to 100
* if value passed doesn't parse as an integer
* @param upperBoundStr -
* one more than the "highest" marc field to include (e.g. 900
* will include up to 899). Defaults to 900 if value passed
* doesn't parse as an integer
* @return a string containing ALL subfields of ALL marc fields within the
* range indicated by the bound string arguments.
*/
public String getAllSearchableFields(final Record record, String lowerBoundStr, String upperBoundStr)
{
StringBuffer buffer = new StringBuffer("");
int lowerBound = localParseInt(lowerBoundStr, 100);
int upperBound = localParseInt(upperBoundStr, 900);
List<DataField> fields = record.getDataFields();
for (DataField field : fields)
{
// Get all fields starting with the 100 and ending with the 839
// This will ignore any "code" fields and only use textual fields
int tag = localParseInt(field.getTag(), -1);
if ((tag >= lowerBound) && (tag < upperBound))
{
// Loop through subfields
List<Subfield> subfields = field.getSubfields();
for (Subfield subfield : subfields)
{
if (buffer.length() > 0)
buffer.append(" ");
buffer.append(subfield.getData());
}
}
}
return buffer.toString();
}
/**
* Loops through all datafields and creates a field for "all fields"
* searching. Shameless stolen from Vufind Indexer Custom Code
*
* @param record
* marc record object
* @param lowerBoundStr -
* the "lowest" marc field to include (e.g. 100). defaults to 100
* if value passed doesn't parse as an integer
* @param upperBoundStr -
* one more than the "highest" marc field to include (e.g. 900
* will include up to 899). Defaults to 900 if value passed
* doesn't parse as an integer
* @return a Set of strings containing ALL subfields of ALL marc fields within the
* range indicated by the bound string arguments, with one string for each field encountered.
*/
public Set<String> getAllSearchableFieldsAsSet(final Record record, String lowerBoundStr, String upperBoundStr)
{
Set<String> result = new LinkedHashSet<String>();
int lowerBound = localParseInt(lowerBoundStr, 100);
int upperBound = localParseInt(upperBoundStr, 900);
List<DataField> fields = record.getDataFields();
for (DataField field : fields)
{
// Get all fields starting with the 100 and ending with the 839
// This will ignore any "code" fields and only use textual fields
int tag = localParseInt(field.getTag(), -1);
if ((tag >= lowerBound) && (tag < upperBound))
{
// Loop through subfields
StringBuffer buffer = new StringBuffer("");
List<Subfield> subfields = field.getSubfields();
for (Subfield subfield : subfields)
{
if (buffer.length() > 0)
buffer.append(" ");
buffer.append(subfield.getData());
}
result.add(buffer.toString());
}
}
return result;
}
/**
* Get the title (245ab) from a record, without non-filing chars as
* specified in 245 2nd indicator, and lowercased.
* @param record - the marc record object
* @return 245a and 245b values concatenated, with trailing punct removed,
* and with non-filing characters omitted. Null returned if no
* title can be found.
*
* @see SolrIndexerShim#getTitle
*/
public String getSortableTitle(Record record)
{
List<String> result = new ArrayList<String>();
AbstractValueIndexer<?> indexer = getOrCreateIndexerFullSpec("245abk,titleSortLower,first");
getFieldListCollector(record, indexer, result);
return (result.size() == 0) ? "" : result.iterator().next();
}
/**
* return an int for the passed string
* @param str
* @param defValue - default value, if string doesn't parse into int
*/
private int localParseInt(String str, int defValue)
{
int value = defValue;
try
{
value = Integer.parseInt(str);
}
catch (NumberFormatException nfe)
{
// provided value is not valid numeric string
// Ignoring it and moving happily on.
}
return (value);
}
public List<VariableField> getFieldSetMatchingTagList(Record record, String tagList)
{
String tags[] = tagList.split(":");
for (int i = 0; i < tags.length; i++)
{
String tag = tags[i].substring(0, 3);
if (tag == "LNK") tag = tags[i].substring(0, 6);
tags[i] = tag;
}
return(record.getVariableFields(tags));
}
/**
* public interface callable from custom indexing scripts to
* load the translation map into transMapMap
* Simply implements a stub that calls the createMultiValueMapping method
* @param translationMapSpec the specification of a translation map -
* could be name of a _map.properties file, or some subset of entries in a
* _map.properties file
* @return the name of the translation map to be used in a subsequent call to FindMap
*/
public String loadTranslationMap(String translationMapSpec)
{
if (findMap(translationMapSpec) == null)
{
AbstractMultiValueMapping map = ValueIndexerFactory.instance().createMultiValueMapping(translationMapSpec);
transMapMap.putIfAbsent(translationMapSpec, map);
}
return(translationMapSpec);
}
public String loadTranslationMap(String ignore, String translationMapSpec)
{
return(loadTranslationMap(translationMapSpec));
}
/**
* Get the appropriate Map object from populated transMapMap
* @param mapName the name of the translation map to find
* @return populated Map object
*/
public Object findMap(String mapName)
{
if (transMapMap.containsKey(mapName))
return(transMapMap.get(mapName));
return null;
}
public Collection<String> remap(Collection<String> valuesToMap, Object translationMap, boolean b) throws Exception
{
if (translationMap instanceof AbstractMultiValueMapping)
{
AbstractMultiValueMapping map = (AbstractMultiValueMapping) translationMap;
return(map.map(valuesToMap));
}
return null;
}
public String remap(String valueToMap, Object translationMap, boolean b) throws Exception
{
if (translationMap instanceof AbstractMultiValueMapping)
{
AbstractMultiValueMapping map = (AbstractMultiValueMapping) translationMap;
return(map.mapSingle(valueToMap));
}
return null;
}
public String getDataFromVariableField(VariableField vf, String subfldTags, String separator, boolean cleanIt)
{
if (subfldTags.length() > 1 && !subfldTags.startsWith("["))
subfldTags = '[' + subfldTags + ']';
Pattern subfieldPattern = Pattern.compile(subfldTags.length() == 0 ? "." : subfldTags);
DataField marcField = (DataField) vf;
StringBuffer buffer = new StringBuffer("");
List<Subfield> subfields = marcField.getSubfields();
for (Subfield subfield : subfields)
{
Matcher matcher = subfieldPattern.matcher("" + subfield.getCode());
if (matcher.matches())
{
if (buffer.length() > 0)
buffer.append(separator != null ? separator : " ");
buffer.append(subfield.getData().trim());
}
}
if (buffer.length() > 0)
return(cleanIt ? DataUtil.cleanData(buffer.toString()) : buffer.toString());
else
return(null);
}
}