package org.solrmarc.callnum;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Parses and computes sort keys for Library of Congress call numbers.
*
* <p>The purpose of this class is to parse LC call numbers well and produce useful sort keys.
* Often there are local extensions to LC call numbers, so we are loose with pattern matching.
* It is also common for for non-LC call numbers to be coded as LC,
* so this class also tries to sensibly handle such input, and compute a sortkey
* that will file the call number in a place that will make sense to the user.
*
* <p>Parsing the call number
*
* <p>The call number is parsed as follows
*
* <table summary="dividing up the call number">
* <tr><th>{@code classification}</th><td>everything before the first cutter</td></tr>
* <tr><th>{@code classLetters}</th><td>leading sequence of letters</td></tr>
* <tr><th>{@code classDigits}</th><td>following digits</td></tr>
* <tr><th>{@code classDecimal}</th><td>decimal with following digits (if exist)</td></tr>
* <tr><th>{@code classSuffix}</th><td>whatever remains before the first cutter</td></tr>
* <tr>
* <th>{@code cutter}</th>
* <td>the first occurrence of the pattern {@code [ .][A-Z]\\d+} after the classification letters and digits.</td>
* </tr>
* </table>
*
* <p>For example, the call number {@code PR9199.3 1920 .L33 1475 .A6} parses like so:
*
* <table summary="call number parsed into fields">
* <tr><th>{@code classification}</th><td>{@code PR9199.3 1920}</td></tr>
* <tr><th>{@code classLetters}</th><td>{@code PR}</td></tr>
* <tr><th>{@code classDigits}</th><td>{@code 9199}</td></tr>
* <tr><th>{@code classDecimal}</th><td>{@code .3}</td></tr>
* <tr><th>{@code classSuffix}</th><td>{@code 1920}</td></tr>
* <tr><th>{@code cutter}</th><td>{@code .L33 1475 .A6}</td></tr>
* </table>
*
* <p>Shelf keys:
*
* <p>With computing shelf keys, we want a string which represents the number but can easily be sorted.
* The main issues is sequences of digits: which ones sort numerically, and how to arrange that.
*
* <p>The shelf key algorithm is basically:
* <ol>
* <li>{@code classLetters} followed by a space</li>
* <li>{@code classDigits} prepended with the number of digits</li>
* <li>
* normalize {@code classSuffix},
* for details see {@link Utils#appendNumericallySortable Utils#appendNumericallySortable};
* if suffix is alphabetic, prefix with {@code _} so it sorts after cutters
* </li>
* <li>
* parse {@code cutter} to separate cutters from any additional information (years, military regiments, etc.),
* actual cutters are used as-is (without decimals),
* any other data is normalized according to {@code Utils#appendNumericallySortable}
* </li>
* </ol>
*
* <p>Using the above example call number:
* <table summary="constructing the shelf key">
* <tr><th>{@code classLetters}</th><td>{@code PR}</td><td>{@code PR}</td></tr>
* <tr><th>{@code classDigits}</th><td>{@code 9199}</td><td>{@code 49199}</td></tr>
* <tr><th>{@code classDecimal}</th><td>{@code .3}</td><td>{@code .3}</td></tr>
* <tr><th>{@code classSuffix}</th><td>{@code 1920}</td><td>{@code 41920}</td></tr>
* <tr><th>{@code cutter}</th><td>{@code .L33 1475 .A6}</td><td>{@code L33 41475 A6}</td></tr>
* </table>
*
* <p>The resulting shelf key will be:
* <p>{@code PR 49199.3 41920 L33 41475 A6}
*
* <p>A note on music call numbers: this class does no special processing for music call numbers.
* Letters in the suffix that introduce Köchel numbers, Burghauser numbers, etc should properly use
* a period indicating the fact that it is an abbreviation and not a cutter.
*
* <p>Run the {@code ExerciseLCCallNumber} class from the command line to print out a
* number of examples of both parsed call numbers and shelf keys.
*
* @author Tod Olson, University of Chicago
* @author Anna Headley, Tri-College Library Consortium
*
*/
public class LCCallNumber extends AbstractCallNumber {
/* Class variables */
protected String classification;
protected String classLetters;
protected String classDigits;
protected String classDecimal;
protected String classSuffix;
protected String cutter;
protected String shelfKey;
protected String paddedShelfKey;
/* Regexp and patterns */
/* Original strict regex, from CallNumUtils:
*
* regular expression string for the required portion of the LC classification
* LC classification is
* 1-3 capital letters followed by float number (may be an integer)
* optionally followed by a space and then a year or other number,
* e.g. "1987" "15th"
* LC call numbers can't begin with I, O, W, X, or Y
* As a regex pattern, group 1 matches the classification letter, group 2 matches the numbers.
*/
// public static final String LC_CLASS_REQ_REGEX = "([A-Z&&[^IOWXY]]{1}[A-Z]{0,2}) *(\\d+(?:\\.\\d+)?)";
/**
* Liberally matches LC call number.
*
* This regex matchex any string of letters, followed by optional spaces, digits, and decimal digits.
*
* Match group 1 contains the letters.
* Match group 2 contains the classification number
* Match group 3 contains any classification decimal plus digits
*/
public static final String CLASS_REGEX = "^([a-zA-Z]+) *(?:(\\d+)(\\.\\d+)?)?";
/**
* regular expression string for the cutter, without preceding characters
* (such as the "required" period, which is sometimes missing, or spaces).
* A Cutter is a single letter followed by digits.
*
* Must match uppper and lower case, catalog patrons expect to type in either case.
*/
public static final String CUTTER_REGEX = "[A-Za-z]\\d+";
/**
* Separates the class from the rest of a call number.
*
* Match group 1 contains the classification.
* Match group 2 contains the class letters.
* Match group 3 contains the class digits (before the decimal).
* Match group 4 contains the decimal portion of the class number, including the decimal point.
* Match group 5 contains everything after the classification.
*/
protected static Pattern classPattern = Pattern.compile("(" + CLASS_REGEX + ")" + "(.*)$");
/**
* Matches a single cutter.
*
* This Pattern assumes that matching will begin after any classification or class suffix,
* and will identify exactly one letter-followed-by-digits cutter.
*
* Matching group 1 contains the cutter.
*/
protected static Pattern cutterPat = Pattern.compile("(" + CUTTER_REGEX + ")");
/**
* Matches the cutter after the classification suffix.
*
* Assumes that the classification part has already been removed
* and we just need to separate the cutter from any suffix.
* Matching group 1 contains the cutter.
*/
protected static Pattern cutterAfterSuffixPat = Pattern.compile("(\\.?[A-Za-z]\\d+|^\\.[A-Za-z]| \\.[A-Za-z])");
/* Constructors */
/**
* Creates a call number object from the given string.
*
* The constructor parses the {@code rawCallNumber} argument as part of instantiating the object.
*
* @param rawCallNumber the call number as a string
*/
public LCCallNumber(String rawCallNumber) {
parse(rawCallNumber);
}
/**
* Create call number object with no call number.
* Mainly a convenience for inheritance.
*/
public LCCallNumber() {
// TODO Auto-generated constructor stub
}
/* Accessors */
public String getClassification() {
return classification;
}
public String getClassLetters() {
return classLetters;
}
public String getClassDigits() {
return classDigits;
}
public String getClassDecimal() {
return classDecimal;
}
/**
* Returns the digit and decimal part of the classification.
*
* @return numeric portion of the classification
*/
public String getClassNumber() {
String digits = classDigits == null ? "" : classDigits;
String decimal = classDecimal == null ? "" : classDecimal;
return digits + decimal;
}
public String getClassSuffix() {
return classSuffix;
}
public String getCutter() {
return cutter;
}
/* Methods proper */
protected void init() {
rawCallNum = null;
classification = null;
classLetters = null;
classDigits = null;
classDecimal = null;
classSuffix = null;
cutter = null;
shelfKey = null;
paddedShelfKey = null;
}
/**
* This parse can be used in conjunction with the empty constructor.
*
* Leading and training whitespace will automatically be trimmed before call number is stored and parsed.
*
* @param call call number to parse
*/
@Override
public void parse(String call) {
init();
if (call == null) {
this.rawCallNum = null;
} else {
this.rawCallNum = call.trim();
}
parse();
}
protected void parse() {
if (this.rawCallNum != null) {
parseCallNumber();
// buildShelfKey();
}
}
/**
* Parses the call number, splitting the classification portions from any
* cutter(s) and other following characters. Sets these internal fields:
* <ul>
* <li><code>classLetters</code></li>
* <li><code>classDigits</code></li>
* <li><code>classDecimal</code></li>
* <li><code>classSuffix</code></li>
* <li><code>classification</code></li>
* <li><code>cutter</code></li>
* </ul>
*/
protected void parseCallNumber() {
String everythingElse = null;
Matcher mClass = classPattern.matcher(rawCallNum);
if (mClass.matches()) {
classification = mClass.group(1) == null ? null
: mClass.group(1).trim();
classLetters = mClass.group(2) == null ? null
: mClass.group(2).trim();
classDigits = mClass.group(3) == null ? null
: mClass.group(3).trim();
classDecimal = mClass.group(4) == null ? null
: mClass.group(4).trim();
everythingElse = mClass.group(5) == null ? null
: mClass.group(5).trim();
// (.*) matches on "" but we trade in nulls
everythingElse = mClass.group(5).length() < 1 ? null
: everythingElse;
} else {
everythingElse = rawCallNum;
}
cutter = null;
if (everythingElse != null) {
// split any classSuffix from first cutter
Matcher mCut = cutterAfterSuffixPat.matcher(everythingElse);
if (mCut.find()) {
int start = mCut.start(1);
classSuffix = start > 0 ? everythingElse.substring(0, start).trim() : null;
cutter = everythingElse.substring(mCut.start(1)).trim();
} else {
classSuffix = everythingElse.trim();
}
// clean up the class suffix
if (classSuffix != null && classSuffix.length() == 0) {
classSuffix = null;
}
// add suffix on to classification
if (classSuffix != null) {
if (classification != null) {
classification += " " + classSuffix;
} else {
classification = classSuffix;
}
}
}
}
/**
* Builds the shelf key from the parsed call number.
*/
protected void buildShelfKey() {
//TODO: Painful procedural logic, want a null-sensitive map over an array
//TODO: Question: better to upcase here, or force to upper at parse time?
StringBuilder key = new StringBuilder();
if (classLetters != null) {
key.append(classLetters.toUpperCase());
}
if (classDigits != null) {
if (key.length() > 0) {
key.append(' ');
}
key.append(classDigits.length());
key.append(classDigits);
}
// class decimal includes ., easier to visually check, and sorts after [space] [year]
if (classDecimal != null) {
key.append(classDecimal);
}
if (classSuffix != null) {
//TODO: pad-if-not-null utility helper; or null-ignoring builder subclass with right-pad method
if (key.length() > 0) {
key.append(' ');
// sort alphabetic suffixes after cutters
if (Character.isAlphabetic(classSuffix.charAt(0))) {
key.append('_');
}
}
Utils.appendNumericallySortable(key, classSuffix.toUpperCase());
}
if (cutter != null) {
appendCutterShelfKey(key, cutter.toUpperCase());
}
// TODO: better way to deal with trailing . or space in call num, as in "BF199.",
// causes meaningless class suffix resulting in trailing space on shelf key
if (key.length() > 0) {
int i = key.length() - 1;
char last = key.charAt(i);
if (last == ' ') {
key.deleteCharAt(i);
}
}
shelfKey = key.toString();
}
/**
* Builds the shelf key from the parsed call number.
*/
protected void buildPaddedShelfKey() {
//TODO: Painful procedural logic, want a null-sensitive map over an array
//TODO: Question: better to upcase here, or force to upper at parse time?
StringBuilder key = new StringBuilder();
if (classLetters != null) {
key.append(classLetters.toUpperCase());
key.append(" ".substring(classLetters.length()));
}
if (classDigits != null) {
if (key.length() > 0) {
key.append(' ');
}
key.append((classDigits.length() < 4 ? "0000".substring(classDigits.length()) : ""));
key.append(classDigits);
}
// class decimal includes ., easier to visually check, and sorts after [space] [year]
if (classDecimal != null) {
key.append(classDecimal);
key.append((classDecimal.length() < 7 ? "000000".substring(classDecimal.length()-1) : ""));
}
else {
key.append(".000000");
}
if (classSuffix != null) {
//TODO: pad-if-not-null utility helper; or null-ignoring builder subclass with right-pad method
if (key.length() > 0) {
key.append(' ');
// sort alphabetic suffixes after cutters
if (Character.isAlphabetic(classSuffix.charAt(0))) {
key.append('_');
}
}
Utils.appendNumericallySortable(key, classSuffix.toUpperCase());
}
if (cutter != null) {
appendPaddedCutterShelfKey(key, cutter.toUpperCase());
}
// TODO: better way to deal with trailing . or space in call num, as in "BF199.",
// causes meaningless class suffix resulting in trailing space on shelf key
if (key.length() > 0) {
int i = key.length() - 1;
char last = key.charAt(i);
if (last == ' ') {
key.deleteCharAt(i);
}
}
paddedShelfKey = key.toString();
}
/**
* Computes the shelf key for the cutter, appending it to the shelf key buffer.
*
* @param keyBuf buffer with the in-progress shelf key
* @param cutter cutter sequence to parse
*/
protected static void appendCutterShelfKey(StringBuilder keyBuf, CharSequence cutter) {
Matcher m = cutterPat.matcher(cutter);
appendCutterShelfKeyLoop(keyBuf, cutter, m, 0);
}
/**
* Computes the shelf key for the cutter, appending it to the shelf key buffer.
*
* @param keyBuf buffer with the in-progress shelf key
* @param cutter cutter sequence to parse
*/
protected static void appendPaddedCutterShelfKey(StringBuilder keyBuf, CharSequence cutter) {
Matcher m = cutterPat.matcher(cutter);
appendPaddedCutterShelfKeyLoop(keyBuf, cutter, m, 0);
}
/**
* Recursively builds up the key in the buffer.
*
* This method marches through the cutter, consumes up through the next cutter pattern.
* It formats what has been consumed into a shelf key and appends it to {@code buf},
* the, calls itself recursively, starting at the end of the current match.
*
* @param keyBuf buffer with the in-progress shelf key
* @param cutter cutter sequence to parse
* @param m matcher with the cutter pattern
* @param offset current position in the cutter
*/
protected static void appendCutterShelfKeyLoop(StringBuilder keyBuf, CharSequence cutter, Matcher m, int offset) {
if (offset >= cutter.length()) { // all done
return;
} else if (m.find(offset)) { // found another cutter
CharSequence previousCutterSuffix = cutter.subSequence(offset, m.start());
CharSequence matchSeq = cutter.subSequence(m.start(), m.end());
//TODO: pad-if-not-null utility helper; or null-ignoring builder subclass with right-pad method
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
Utils.appendNumericallySortable(keyBuf, previousCutterSuffix);
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
keyBuf.append(matchSeq);
appendCutterShelfKeyLoop(keyBuf, cutter, m, m.end());
} else { // no more cutters
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
Utils.appendNumericallySortable(keyBuf, cutter.subSequence(offset, cutter.length()));
}
}
/**
* Recursively builds up the key in the buffer.
*
* This method marches through the cutter, consumes up through the next cutter pattern.
* It formats what has been consumed into a shelf key and appends it to {@code buf},
* the, calls itself recursively, starting at the end of the current match.
*
* @param keyBuf buffer with the in-progress shelf key
* @param cutter cutter sequence to parse
* @param m matcher with the cutter pattern
* @param offset current position in the cutter
*/
protected static void appendPaddedCutterShelfKeyLoop(StringBuilder keyBuf, CharSequence cutter, Matcher m, int offset) {
if (offset >= cutter.length()) { // all done
return;
} else if (m.find(offset)) { // found another cutter
CharSequence previousCutterSuffix = cutter.subSequence(offset, m.start());
CharSequence matchSeq = cutter.subSequence(m.start(), m.end());
//TODO: pad-if-not-null utility helper; or null-ignoring builder subclass with right-pad method
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
Utils.appendNumericallySortable(keyBuf, previousCutterSuffix);
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
appendCutterPadded(keyBuf, matchSeq);
appendPaddedCutterShelfKeyLoop(keyBuf, cutter, m, m.end());
} else { // no more cutters
if (keyBuf.length() > 0 && keyBuf.charAt(keyBuf.length()-1) != ' ') {
keyBuf.append(' ');
}
Utils.appendNumericallySortable(keyBuf, cutter.subSequence(offset, cutter.length()));
}
}
private static void appendCutterPadded(StringBuilder keyBuf, CharSequence cutter)
{
int offset = 0;
for (; Character.isAlphabetic(cutter.charAt(offset)); offset++)
{
keyBuf.append(cutter.charAt(offset));
}
CharSequence number = cutter.subSequence(offset, cutter.length());
keyBuf.append("0.").append(number).append((number.length() < 6 ? "000000".substring(number.length()) : ""));
}
/**
* Initial implementation checks for:
* - invalid classes (beginning with I,O,W,X, or Y)
* - null classDigits
*/
public boolean isValid() {
boolean valid = true;
if (this.classLetters == null) {
valid = false;
}
else {
char firstChar = this.classLetters.charAt(0);
// LC call numbers can't begin with I, O, W, X, or Y
if (firstChar == 'I' || firstChar == 'O' || firstChar == 'W'
|| firstChar == 'X' || firstChar == 'Y') {
valid = false;
}
}
if (this.classDigits == null) valid = false;
return valid;
}
@Override
public String getShelfKey() {
if (shelfKey == null) {
buildShelfKey();
}
return shelfKey;
}
public String getPaddedShelfKey() {
if (paddedShelfKey == null) {
buildPaddedShelfKey();
}
return paddedShelfKey;
}
/**
* Formats the call number from its parsed components into a display format
*/
public String toString() {
// TODO: this method was based on buildShelfKey and therefore would
// benefit from the same refactoring efforts.
StringBuilder formatted = new StringBuilder();
if (classLetters != null) {
formatted.append(classLetters);
}
if (classDigits != null) {
formatted.append(classDigits);
}
// class decimal includes ., easier to visually check, and sorts after [space] [year]
if (classDecimal != null) {
formatted.append(classDecimal);
}
if (classSuffix != null) {
if (formatted.length() > 0) {
formatted.append(' ');
}
formatted.append(classSuffix);
}
if (cutter != null) {
formatted.append(" ");
// fix cutter, but only for valid LC
if (this.isValid() && cutter.charAt(0) != '.') {
formatted.append('.');
}
formatted.append(cutter);
}
return formatted.toString();
}
}