package org.solrmarc.callnum;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Provides utility functions to support call number manipulation.
*
* @author Tod Olson, University of Chicago
*
*/
public class Utils {

    /** States of the finite state machine driving {@link #appendNumericallySortable}. */
    public enum State { START, WORD, GAP, NUM }

    /**
     * Character classes recognised while scanning input.
     * {@code END} is never produced by the scanner in this class; it is kept
     * because the enum is public.
     */
    public enum InType { LETTER, SPACE, PERIOD, PUNCT, DIGIT, OTHER, END }

    /** Sentinel returned when a name has no character at the requested position. */
    private static final char NO_CHAR = '\0';

    /**
     * Writes a numerically-sortable version of the input to the buffer.
     *
     * The rules for producing the numerically sortable sequence are:
     * <ul>
     * <li>Letters are translated to upper case</li>
     * <li>numeric sequences are prepended with the length of the sequence</li>
     * <li>any other character is a word separator</li>
     * <li>sequences of word separators are reduced to a single space</li>
     * </ul>
     *
     * <p>Prepending a sequence of digits with the number of digits ensures that they will easily sort numerically:
     * sort keys for 2-digit numbers start with 2, sort keys for 3-digit numbers start with 3, etc.
     * Suggested by John Craig at Code4Lib in Asheville, NC.
     *
     * <p>A period that follows a digit is kept as part of the number, so
     * <code>QA76.73</code> becomes <code>QA 276.73</code>. Separators before the first
     * letter or digit are dropped; a separator after the last word or number
     * leaves a single trailing space.
     *
     * <p>Implemented as a finite state machine, modeled by <code>switch</code> statements.
     *
     * @param buf buffer for appending sortable version of the input
     * @param input source character sequence
     */
    public static void appendNumericallySortable(StringBuilder buf, CharSequence input) {
        StringBuilder numBuf = new StringBuilder();
        State state = State.START;

        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            InType inType = classify(c);
            if (inType == InType.LETTER) {
                // Only ASCII letters are classified as LETTER, so this is a plain a-z -> A-Z mapping.
                c = Character.toUpperCase(c);
            }

            switch (state) {
            case START: // Nothing written yet: skip leading separators
            case GAP:   // Separator already written: only letters or digits take us out
                switch (inType) {
                case LETTER:
                    state = State.WORD;
                    buf.append(c);
                    break;
                case DIGIT:
                    state = State.NUM;
                    numBuf.append(c);
                    break;
                default:
                    // Consume anything else, remain in the current state
                    break;
                }
                break;
            case WORD: // Write word characters directly to buffer
                switch (inType) {
                case LETTER:
                    buf.append(c);
                    break;
                case DIGIT:
                    // A number directly after a word is separated from it by a space
                    state = State.NUM;
                    buf.append(' ');
                    numBuf.append(c);
                    break;
                default:
                    state = State.GAP;
                    buf.append(' ');
                    break;
                }
                break;
            case NUM: // Accumulate number in special buffer, write sort version on state change
                switch (inType) {
                case DIGIT:
                case PERIOD:
                    // Stay in NUM state; periods are kept as part of the number
                    numBuf.append(c);
                    break;
                case LETTER:
                    state = State.WORD;
                    appendSortableNumber(buf, numBuf);
                    numBuf.setLength(0);
                    buf.append(c);
                    break;
                default:
                    state = State.GAP;
                    appendSortableNumber(buf, numBuf);
                    numBuf.setLength(0);
                    buf.append(' ');
                    break;
                }
                break;
            default:
                // Unreachable: every State constant is handled above.
                break;
            }
        }

        // Flush a number that runs to the end of the input
        if (state == State.NUM) {
            appendSortableNumber(buf, numBuf);
            numBuf.setLength(0);
        }
    }

    /**
     * Classifies a character for the state machine.
     * Plain ASCII range comparisons are used for letters, digits and punctuation
     * to avoid Unicode table lookups; this should be fine for call numbers.
     */
    private static InType classify(char c) {
        if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
            return InType.LETTER;
        }
        if (c >= '0' && c <= '9') {
            return InType.DIGIT;
        }
        if (c == '.') {
            return InType.PERIOD;
        }
        if (Character.isWhitespace(c)) {
            return InType.SPACE;
        }
        if (c >= '!' && c <= '~') {
            // Only ASCII-style punctuation; digits, letters and period were handled above
            return InType.PUNCT;
        }
        return InType.OTHER;
    }

    /**
     * Appends to a buffer a lexicographically sortable version of the number.
     * The number may be an integer or may include a decimal.
     *
     * <p>The output is the count of integer digits, not counting leading zeros,
     * followed by the number with its leading zeros removed. For example
     * <code>76.73</code> becomes <code>276.73</code>, <code>007</code> becomes <code>17</code>,
     * <code>0.5</code> becomes <code>0.5</code>, and an empty sequence becomes <code>0</code>.
     *
     * @param buf buffer to insert the sortable token
     * @param num sequence of digits, possibly with decimal
     */
    public static void appendSortableNumber(StringBuilder buf, CharSequence num) {
        final int len = num.length();

        // Skip leading zeros; they carry no magnitude
        int intStart = 0;
        while (intStart < len && num.charAt(intStart) == '0') {
            intStart++;
        }

        // Find the end of the integer part (first non-digit, e.g. the decimal point)
        int intEnd = intStart;
        while (intEnd < len && num.charAt(intEnd) >= '0' && num.charAt(intEnd) <= '9') {
            intEnd++;
        }

        // Length of the integer part, then the number without leading zeros
        buf.append(intEnd - intStart);
        buf.append(num.subSequence(intStart, len));
    }

    /**
     * Builds a Cutter-style code from an author's last name.
     *
     * <p>The name is upper-cased (locale-independently) and every character other
     * than <code>A-Z</code> and <code>0-9</code> is dropped. The first remaining character starts
     * the code; the second and third (and, for names starting with <code>QU</code>, the fourth)
     * are translated to digits by the tables below. Names starting with a digit
     * yield <code>A1</code> followed by the first two characters.
     * NOTE(review): the tables look like the Library of Congress Cutter table -
     * confirm against the intended source.
     *
     * <p>Names that are too short simply produce a shorter code: a position with no
     * character contributes nothing, and a name with no usable characters yields
     * the empty string.
     *
     * @param authorLastname author's last name; must not be null
     * @return the Cutter-style code, possibly empty
     * @throws NullPointerException if <code>authorLastname</code> is null
     */
    public static String getCutterFromAuthor(String authorLastname) {
        // Locale.ROOT keeps the ASCII mapping stable (e.g. 'i' -> 'I') under any default locale.
        String uppername = authorLastname.toUpperCase(java.util.Locale.ROOT).replaceAll("[^A-Z0-9]", "");
        if (uppername.length() == 0) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        char first = uppername.charAt(0);
        char second = charAtOrNone(uppername, 1);
        char third = charAtOrNone(uppername, 2);

        switch (first) {
        case 'A': case 'E': case 'I': case 'O': case 'U':
            sb.append(first);
            if (second != NO_CHAR) {
                if (second < 'B') sb.append('1');
                else if (second < 'D') sb.append('2');
                else if (second < 'L') sb.append('3');
                else if (second < 'N') sb.append('4');
                else if (second < 'P') sb.append('5');
                else if (second < 'R') sb.append('6');
                else if (second < 'S') sb.append('7');
                else if (second < 'U') sb.append('8');
                else sb.append('9');
                addCutterExpansion(sb, third);
            }
            break;
        case 'S':
            sb.append(first);
            if (second != NO_CHAR) {
                // "SC" not followed by H or later (including a missing third character) sorts with "SA"
                if (second < 'C' || (second == 'C' && third < 'H')) sb.append('2');
                else if (second < 'E') sb.append('3');
                else if (second < 'H') sb.append('4');
                else if (second < 'M') sb.append('5');
                else if (second < 'T') sb.append('6');
                else if (second < 'U') sb.append('7');
                else if (second < 'W') sb.append('8');
                else sb.append('9');
                addCutterExpansion(sb, third);
            }
            break;
        case 'Q':
            sb.append(first);
            if (second == NO_CHAR) {
                // Single-character name: nothing more to encode
            } else if (second >= 'U') {
                // After "QU" the third character picks the first digit
                if (third >= 'A' && third < 'E') sb.append('3');
                else if (third >= 'E' && third < 'I') sb.append('4');
                else if (third >= 'I' && third < 'O') sb.append('5');
                else if (third >= 'O' && third < 'R') sb.append('6');
                else if (third >= 'R' && third < 'T') sb.append('7');
                else if (third >= 'T' && third < 'Y') sb.append('8');
                else if (third >= 'Y') sb.append('9');
                addCutterExpansion(sb, charAtOrNone(uppername, 3));
            } else {
                sb.append('2');
                addCutterExpansion(sb, third);
            }
            break;
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            sb.append("A1");
            sb.append(first);
            if (second != NO_CHAR) {
                sb.append(second);
            }
            break;
        default:
            // Remaining consonants; a digit or missing second character adds nothing here
            sb.append(first);
            if (second >= 'A' && second < 'E') sb.append('3');
            else if (second >= 'E' && second < 'I') sb.append('4');
            else if (second >= 'I' && second < 'O') sb.append('5');
            else if (second >= 'O' && second < 'R') sb.append('6');
            else if (second >= 'R' && second < 'U') sb.append('7');
            else if (second >= 'U' && second < 'Y') sb.append('8');
            else if (second >= 'Y') sb.append('9');
            addCutterExpansion(sb, third);
            break;
        }
        return sb.toString();
    }

    /** Returns the character at {@code index}, or {@link #NO_CHAR} when the string is too short. */
    private static char charAtOrNone(String s, int index) {
        return index < s.length() ? s.charAt(index) : NO_CHAR;
    }

    /**
     * Appends the expansion digit for a further letter of the name.
     * Characters below <code>A</code> (digits and the missing-character sentinel) append nothing.
     *
     * @param sb buffer holding the code built so far
     * @param third character to translate
     */
    private static void addCutterExpansion(StringBuilder sb, char third) {
        if (third >= 'A' && third < 'E') sb.append('3');
        else if (third >= 'E' && third < 'I') sb.append('4');
        else if (third >= 'I' && third < 'M') sb.append('5');
        else if (third >= 'M' && third < 'P') sb.append('6');
        else if (third >= 'P' && third < 'T') sb.append('7');
        else if (third >= 'T' && third < 'W') sb.append('8');
        else if (third >= 'W') sb.append('9');
    }
}