package org.solrmarc.mixin; import java.util.Collection; import java.util.LinkedHashSet; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.solrmarc.index.extractor.impl.custom.Mixin; /** * Created with IntelliJ IDEA. * User: dueberb * Date: 1/30/15 * Time: 10:30 AM * To change this template use File | Settings | File Templates. */ public class ISBNNormalizer implements Mixin { private static String ISBNDelimiterPattern = "\\-"; public static final Pattern ISBN10Pat = Pattern.compile("^.*?(\\d[\\d\\-]{8,}[Xx]?)(?:\\D|\\Z).*$"); public static final Pattern ISBN13Pat = Pattern.compile("^.*?(978[\\d\\-]{10,})(?:\\D|\\Z).*$"); public static Collection<String> filterISBN(Collection <String> isbnList) { return(filterISBN(isbnList, "13")); } public static Collection<String> filterISBN(Collection <String> isbnList, String output) { Collection<String> result = Set.class.isAssignableFrom(isbnList.getClass()) ? new LinkedHashSet<String>() : new ArrayList<String>(); boolean get13 = (output.equals("13") || output.equals("both")); boolean get10 = (output.equals("10") || output.equals("both")); if (!get13 && !get10) throw new IllegalArgumentException("Warning: method only accepts values \"10\" \"13\" or \"both\""); for (String isbn : isbnList) { if (get13) { String isbn13 = normalize_13(isbn); if (isbn13 != null) result.add(isbn13); } if (get10) { try { String isbn10 = normalize_10(isbn); if (isbn10 != null) result.add(isbn10); } catch (IllegalArgumentException e) { } } } return(result); } /** * Try to extract an ISBN from the string. 13s are returned as-is, * 10s are turned into an isbn13 and returned. Otherwise throw IllegalArgumentException * * @param isbnstring The string that may contain an ISBN * @return an ISBN13 * @throws IllegalArgumentException */ public static String normalize_13(String isbnstring) throws IllegalArgumentException { // First look for a 13,then a 10 try { return extract_isbn13(isbnstring); } catch (IllegalArgumentException e) { return isbn10_to_13(extract_isbn10(isbnstring)); } } /** * Try to extract an ISBN from the string. 13s are returned as-is, * 10s are turned into an isbn13 and returned. Otherwise throw IllegalArgumentException * * @param isbnstring The string that may contain an ISBN * @return an ISBN13 * @throws IllegalArgumentException */ public static String normalize_10(String isbnstring) throws IllegalArgumentException { // First look for a 10,then a 13 try { return extract_isbn10(isbnstring); } catch (IllegalArgumentException e) { return isbn13_to_10(extract_isbn13(isbnstring)); } } /** * @param isbnstring a String that might contain an ISBN * @param pat The pattern to match against * @param len The length of the ISBN you're looking for (10 or 13) * @return the extracted ISBN * @throws IllegalArgumentException if an ISBN isn't found */ public static String extract_isbn_by_pat(String isbnstring, Pattern pat, Integer len) throws IllegalArgumentException { Matcher m = pat.matcher(isbnstring); if (!m.matches()) { throw new IllegalArgumentException(isbnstring + " doesn't contain an ISBN" + len.toString()); } String extracted_string = m.group(1); String normalized_string = extracted_string.replaceAll(ISBNDelimiterPattern, ""); if (normalized_string.length() != len) { throw new IllegalArgumentException("'" + normalized_string + "' doesn't contain an ISBN" + len.toString() + "; it's length is " + normalized_string.length()); } return normalized_string; } public static String extract_isbn10(String isbnstring) throws IllegalArgumentException { return extract_isbn_by_pat(isbnstring, ISBN10Pat, 10); } public static String extract_isbn13(String isbnstring) throws IllegalArgumentException { return extract_isbn_by_pat(isbnstring, ISBN13Pat, 13); } /** * Turn an already-extracted ISBN10 into an ISBN13 * * @param isbn10 -- just the raw digits (plus possible 'X') of an ISBN10 * @return the equivalent ISBN13 */ public static String isbn10_to_13(String isbn10) { String longisbn = "978" + isbn10.substring(0, 9); int[] digits = new int[12]; for (int i = 0; i < 12; i++) { digits[i] = new Integer(longisbn.substring(i, i + 1)); } Integer sum = 0; for (int i = 0; i < 12; i++) { sum = sum + digits[i] + (2 * digits[i] * (i % 2)); } // Get the smallest multiple of ten > sum Integer top = sum + (10 - (sum % 10)); Integer check = top - sum; if (check == 10) { return longisbn + "0"; } else { return longisbn + check.toString(); } } /** * Turn an already-extracted ISBN13 into an ISBN10 * * @param isbn13 -- just the raw digits of an ISBN13 * @return the equivalent ISBN10 (if possible) */ public static String isbn13_to_10(String isbn13) { if (!isbn13.substring(0,3).equals("978")) throw new IllegalArgumentException("13-digit ISBN '" + isbn13 + "' doesn't start with 978, cannot make a valid 10-digit ISBN for it."); String shortisbn = isbn13.substring(3, 12); int[] multVect = { 10, 9, 8, 7, 6, 5, 4, 3, 2 }; int sum = 0; for (int i = 0; i < 9; i++) { int digit = new Integer(shortisbn.substring(i, i + 1)); sum += multVect[i] * digit; } int val = 11 - (sum % 11); char checkDigit = (char)((val == 11) ? '0' : (val == 10) ? 'X' : (char)val + '0'); String result = shortisbn + checkDigit; return(result); } }