package org.solrmarc.tools;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Static helpers for cleaning up free-text bibliographic data: extracting a
 * plausible four-digit year from a date statement, stripping trailing
 * punctuation and stray square brackets, and title-casing strings.
 */
public class DataUtil
{
    /** Four digits starting with 1 or 2, fully bracketed, e.g. "[1957]". */
    private final static Pattern FOUR_DIGIT_PATTERN_BRACES = Pattern.compile("\\[[12]\\d{3,3}\\]");

    /** Four digits starting with 1 or 2 preceded by an open bracket, e.g. "[1957". */
    private final static Pattern FOUR_DIGIT_PATTERN_ONE_BRACE = Pattern.compile("\\[[12]\\d{3,3}");

    /** A bare year in the range 1500-2099. */
    private final static Pattern FOUR_DIGIT_PATTERN_STARTING_WITH_1_2 = Pattern
            .compile("(20|19|18|17|16|15)[0-9][0-9]");

    /** Lower-case letter 'l' typed in place of a leading digit one, e.g. "l957". */
    private final static Pattern FOUR_DIGIT_PATTERN_OTHER_1 = Pattern.compile("l\\d{3,3}");

    /** Century supplied in brackets, e.g. "[19]57". */
    private final static Pattern FOUR_DIGIT_PATTERN_OTHER_2 = Pattern.compile("\\[19\\]\\d{2,2}");

    /** Year whose last digit may be unknown ('-' or '?'), e.g. "195-" or "195?". */
    private final static Pattern FOUR_DIGIT_PATTERN_OTHER_3 = Pattern.compile("(20|19|18|17|16|15)[0-9][-?0-9]");

    /**
     * Corrected year introduced by a literal "i.e. "; group 1 is the year.
     * The dots are escaped so that only the abbreviation itself matches.
     */
    private final static Pattern FOUR_DIGIT_PATTERN_OTHER_4 = Pattern
            .compile("i\\.e\\. ((?:20|19|18|17|16|15)[0-9][0-9])");

    /** A B.C. date such as "300 B.C." or "300 bc"; such dates yield no result. */
    private final static Pattern BC_DATE_PATTERN = Pattern.compile("[0-9]+ [Bb][.]?[Cc][.]?");

    /** A comma, slash, semicolon or colon at the end of the string, with any spaces before it. */
    private final static Pattern TRAILING_SEPARATOR_PATTERN = Pattern.compile(" *([,/;:])$");

    /** String ending in "Jr." or "Sr.", whose final period must be kept. */
    private final static Pattern JR_SR_PATTERN = Pattern.compile(".*[JS]r\\.$");

    /**
     * String whose final period may be dropped: the period follows two word
     * characters, two letters, two word characters each optionally followed
     * by a combining diacritical mark, or a punctuation character.
     */
    private final static Pattern REMOVABLE_PERIOD_PATTERN = Pattern.compile(".*(?:\\w\\w|\\p{L}\\p{L}"
            + "|\\w\\p{InCombiningDiacriticalMarks}?\\w\\p{InCombiningDiacriticalMarks}?"
            + "|\\p{Punct})\\.$");

    protected static Logger logger = Logger.getLogger(DataUtil.class.getName());

    /**
     * Extracts a four-digit year from a free-text date statement.
     * <p>
     * The following forms are tried in order and the first one found wins:
     * a fully bracketed year ("[1957]"), a corrected year ("i.e. 1957"), a
     * year after an open bracket ("[1957"), a B.C. date (which yields
     * {@code null}), a bare year between 1500 and 2099, a year typed with a
     * leading letter 'l' ("l957"), a bracketed century ("[19]57"), and a year
     * with an unknown final digit ("195-" or "195?", mapped to "1950").
     * A year more than one year in the future is rejected.
     *
     * @param date
     *            String to parse; may be null
     * @return the four-digit year as a String, or null if date is null, no
     *         year could be recognized, or the year lies too far in the future
     */
    public static String cleanDate(final String date)
    {
        if (date == null)
        {
            return null;
        }
        String cleanDate = extractYear(date);
        if (cleanDate != null && !isNotBeyondNextYear(cleanDate))
        {
            cleanDate = null;
        }
        if (cleanDate != null)
        {
            logger.debug("Date : " + date + " mapped to : " + cleanDate);
        }
        else
        {
            logger.debug("No Date match: " + date);
        }
        return cleanDate;
    }

    /**
     * Applies the date patterns in priority order and returns the first year
     * found, or null when nothing matches or the text holds a B.C. date.
     */
    private static String extractYear(final String date)
    {
        Matcher matcher = FOUR_DIGIT_PATTERN_BRACES.matcher(date);
        if (matcher.find())
        {
            return removeOuterBrackets(matcher.group());
        }
        matcher = FOUR_DIGIT_PATTERN_OTHER_4.matcher(date);
        if (matcher.find())
        {
            return matcher.group(1);
        }
        matcher = FOUR_DIGIT_PATTERN_ONE_BRACE.matcher(date);
        if (matcher.find())
        {
            return removeOuterBrackets(matcher.group());
        }
        if (BC_DATE_PATTERN.matcher(date).find())
        {
            // B.C. dates are deliberately not mapped, even if digits follow
            return null;
        }
        matcher = FOUR_DIGIT_PATTERN_STARTING_WITH_1_2.matcher(date);
        if (matcher.find())
        {
            return matcher.group();
        }
        matcher = FOUR_DIGIT_PATTERN_OTHER_1.matcher(date);
        if (matcher.find())
        {
            // the leading 'l' is a typo for the digit one
            return matcher.group().replace('l', '1');
        }
        matcher = FOUR_DIGIT_PATTERN_OTHER_2.matcher(date);
        if (matcher.find())
        {
            return matcher.group().replace("[", "").replace("]", "");
        }
        matcher = FOUR_DIGIT_PATTERN_OTHER_3.matcher(date);
        if (matcher.find())
        {
            // an unknown final digit is treated as zero
            return matcher.group().replaceAll("[-?]", "0");
        }
        return null;
    }

    /**
     * Returns true if the year parses as an integer that is no later than
     * next year.
     */
    private static boolean isNotBeyondNextYear(final String year)
    {
        // An explicit GregorianCalendar keeps the comparison in Gregorian years
        // even when the default locale selects another calendar system.
        final int currentYear = new java.util.GregorianCalendar().get(Calendar.YEAR);
        try
        {
            return Integer.parseInt(year) <= currentYear + 1;
        }
        catch (NumberFormatException nfe)
        {
            return false;
        }
    }

    /**
     * Repeatedly removes, until the string stops changing: surrounding
     * whitespace; one trailing comma, slash, semicolon or colon (with any
     * spaces before it); a trailing period when it follows two word
     * characters or letters (optionally carrying combining diacritical
     * marks) or a punctuation character, unless the string ends in "Jr." or
     * "Sr."; and single square brackets at the start and/or end as described
     * for {@link #removeOuterBrackets(String)}.
     *
     * @param origStr
     *            String to clean; may be null
     * @return cleaned string, or null if origStr is null
     */
    public static String cleanData(String origStr)
    {
        if (origStr == null)
        {
            return null;
        }
        String currResult = origStr;
        String prevResult;
        do
        {
            prevResult = currResult;
            currResult = TRAILING_SEPARATOR_PATTERN.matcher(currResult.trim()).replaceAll("");
            // the period of "Jr." / "Sr." is part of the name and is kept
            if (currResult.endsWith(".") && !JR_SR_PATTERN.matcher(currResult).matches()
                    && REMOVABLE_PERIOD_PATTERN.matcher(currResult).matches())
            {
                currResult = currResult.substring(0, currResult.length() - 1);
            }
            currResult = removeOuterBrackets(currResult);
            if (currResult.length() == 0)
            {
                return currResult;
            }
        } while (!currResult.equals(prevResult));
        return currResult;
    }

    /**
     * Calls {@link #cleanData(String)} on every entry of a set. Entries that
     * become identical once cleaned collapse into one, so the result may be
     * smaller than the input. Iteration order of the input is preserved.
     *
     * @param values
     *            the set to clean; may be null
     * @return a new Set holding the cleaned entries (empty if values is null)
     */
    public static Set<String> cleanData(Set<String> values)
    {
        Set<String> result = new LinkedHashSet<String>();
        if (values == null)
        {
            return result;
        }
        for (String entry : values)
        {
            result.add(cleanData(entry));
        }
        return result;
    }

    /**
     * Repeatedly removes trailing characters indicated in regular expression,
     * PLUS trailing period if it is preceded by its regular expression, until
     * the string no longer changes or becomes empty.
     *
     * @param origStr
     *            String to clean
     * @param trailingCharsRegEx
     *            a regular expression of trailing chars to be removed (see java
     *            Pattern class). Note that the regular expression should NOT
     *            have '$' at the end. (e.g. " *[,/;:]" replaces any commas,
     *            slashes, semicolons or colons at the end of the string, and
     *            these chars may optionally be preceded by a space)
     * @param charsB4periodRegEx
     *            a regular expression that must immediately precede a trailing
     *            period IN ORDER FOR THE PERIOD TO BE REMOVED. Note that the
     *            regular expression will NOT have the period or '$' at the end.
     *            (e.g. "[a-zA-Z]{3,}" means at least three letters must
     *            immediately precede the period for it to be removed.)
     * @return cleaned string, or null if origStr is null
     */
    public static String removeAllTrailingCharAndPeriod(String origStr, String trailingCharsRegEx,
            String charsB4periodRegEx)
    {
        if (origStr == null)
        {
            return null;
        }
        String currResult = origStr;
        String prevResult;
        do
        {
            prevResult = currResult;
            currResult = removeTrailingCharAndPeriod(currResult.trim(), trailingCharsRegEx, charsB4periodRegEx);
            if (currResult.length() == 0)
            {
                return currResult;
            }
        } while (!currResult.equals(prevResult));
        return currResult;
    }

    /**
     * Removes trailing characters indicated in regular expression, PLUS
     * trailing period if it is preceded by its regular expression. Performs a
     * single pass; see
     * {@link #removeAllTrailingCharAndPeriod(String, String, String)} for the
     * repeating variant.
     *
     * @param origStr
     *            String to clean
     * @param trailingCharsRegEx
     *            a regular expression of trailing chars to be removed (see java
     *            Pattern class). Note that the regular expression should NOT
     *            have '$' at the end. (e.g. " *[,/;:]" replaces any commas,
     *            slashes, semicolons or colons at the end of the string, and
     *            these chars may optionally be preceded by a space)
     * @param charsB4periodRegEx
     *            a regular expression that must immediately precede a trailing
     *            period IN ORDER FOR THE PERIOD TO BE REMOVED. Note that the
     *            regular expression will NOT have the period or '$' at the end.
     *            (e.g. "[a-zA-Z]{3,}" means at least three letters must
     *            immediately precede the period for it to be removed.)
     * @return cleaned string, or null if origStr is null
     */
    public static String removeTrailingCharAndPeriod(String origStr, String trailingCharsRegEx,
            String charsB4periodRegEx)
    {
        if (origStr == null)
        {
            return null;
        }
        String result = removeTrailingChar(origStr, trailingCharsRegEx);
        return removeTrailingPeriod(result, charsB4periodRegEx);
    }

    /**
     * Remove the characters per the regular expression if they are at the end
     * of the string. The string is trimmed first.
     *
     * @param origStr
     *            string to be cleaned
     * @param charsToReplaceRegEx
     *            - a regular expression of the trailing string/chars to be
     *            removed e.g. " *([,/;:])" meaning last character is a comma,
     *            slash, semicolon, colon, possibly preceded by one or more
     *            spaces.
     * @see Pattern class in java api
     * @return the string with the specified trailing characters removed, or
     *         null if origStr is null
     */
    public static String removeTrailingChar(String origStr, String charsToReplaceRegEx)
    {
        if (origStr == null)
        {
            return origStr;
        }
        // The caller's expression is wrapped in a non-capturing group so that the
        // end anchor applies to all of it, even when it holds a top-level '|'.
        return origStr.trim().replaceAll("(?:" + charsToReplaceRegEx + ")$", "");
    }

    /**
     * If there is a period at the end of the string, remove the period if it is
     * immediately preceded by the regular expression. The string is trimmed
     * before the check and again after a period has been removed.
     *
     * @param origStr
     *            the string to be cleaned
     * @param precedingCharsRegEx
     *            a regular expression that must immediately precede a trailing
     *            period IN ORDER FOR THE PERIOD TO BE REMOVED. Note that the
     *            regular expression will NOT have the period or '$' at the end.
     *            (e.g. "[a-zA-Z]{3,}" means at least three letters must
     *            immediately precede the period for it to be removed.)
     * @return the string without a trailing period iff the regular expression
     *         param was found immediately before the trailing period; null if
     *         origStr is null
     */
    public static String removeTrailingPeriod(String origStr, String precedingCharsRegEx)
    {
        if (origStr == null)
        {
            return origStr;
        }
        String result = origStr.trim();
        // Non-capturing group: keeps a top-level '|' in the caller's expression
        // from splitting the surrounding ".*" prefix and "\\.$" suffix.
        if (result.endsWith(".") && result.matches(".*(?:" + precedingCharsRegEx + ")\\.$"))
        {
            result = result.substring(0, result.length() - 1).trim();
        }
        return result;
    }

    /**
     * Remove single square bracket characters if they are the start and/or end
     * chars (matched or unmatched) and are the only square bracket chars in the
     * string. The string is trimmed before the check and after any removal.
     *
     * @param origStr
     *            the string to be cleaned
     * @return the trimmed string without its outer bracket(s); origStr itself
     *         when it is null or empty
     */
    public static String removeOuterBrackets(String origStr)
    {
        if (origStr == null || origStr.length() == 0)
        {
            return origStr;
        }
        String result = origStr.trim();
        if (result.length() > 0)
        {
            boolean openBracketFirst = result.charAt(0) == '[';
            boolean closeBracketLast = result.endsWith("]");
            if (openBracketFirst && closeBracketLast && result.indexOf('[', 1) == -1
                    && result.lastIndexOf(']', result.length() - 2) == -1)
            {
                // only square brackets are at beginning and end
                result = result.substring(1, result.length() - 1);
            }
            else if (openBracketFirst && result.indexOf(']') == -1)
            {
                // starts with '[' but no ']'; remove open bracket
                result = result.substring(1);
            }
            else if (closeBracketLast && result.indexOf('[') == -1)
            {
                // ends with ']' but no '['; remove close bracket
                result = result.substring(0, result.length() - 1);
            }
        }
        return result.trim();
    }

    /**
     * Capitalizes the first letter of each word. A lower-case letter is
     * upper-cased when it follows a space, period, hyphen or slash (or starts
     * the string), or when it follows an apostrophe that comes at most two
     * characters into a word (as in "o'brien"). Upper-case letters inside a
     * word are lower-cased only when the input contains no lower-case letters
     * at all; mixed-case input keeps its interior capitals.
     *
     * @param s
     *            the string to change; may be null
     * @return The Title Case Version Of The String, or null if s is null
     */
    public static String toTitleCase(String s)
    {
        if (s == null)
        {
            return null;
        }
        final String ACTIONABLE_DELIMITERS = " .-/"; // these cause the character following
                                                     // to be capitalized
        final String QUESTIONABLE_DELIMITERS = "'"; // these might cause the character following
                                                    // to be capitalized
        final char[] chars = s.toCharArray();
        boolean hasNoLowerCase = true;
        for (char c : chars)
        {
            if (Character.isLowerCase(c))
            {
                hasNoLowerCase = false;
                break;
            }
        }
        StringBuilder sb = new StringBuilder(chars.length);
        int countSinceActionable = 0; // characters seen since the last actionable delimiter
        char prevChar = ' '; // treat the start of the string as following a space
        for (char c : chars)
        {
            boolean cIsUpper = Character.isUpperCase(c);
            boolean cIsLower = Character.isLowerCase(c);
            boolean curActionable = (ACTIONABLE_DELIMITERS.indexOf(c) >= 0);
            boolean prevActionable = (ACTIONABLE_DELIMITERS.indexOf(prevChar) >= 0);
            boolean prevQuestionable = (QUESTIONABLE_DELIMITERS.indexOf(prevChar) >= 0);
            char newC = c;
            if (prevActionable && cIsLower)
            {
                newC = Character.toUpperCase(c);
            }
            else if (prevQuestionable && cIsLower && countSinceActionable <= 2)
            {
                newC = Character.toUpperCase(c);
            }
            else if (hasNoLowerCase && cIsUpper && countSinceActionable >= 1)
            {
                newC = Character.toLowerCase(c);
            }
            countSinceActionable = curActionable ? 0 : countSinceActionable + 1;
            sb.append(newC);
            prevChar = c;
        }
        return sb.toString();
    }
}