/*
* Copyright 2011 Luke Usherwood.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package net.bettyluke.tracinstant.data;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Heuristically detects which {@link SimpleDateFormat} pattern a list of date
 * (or date-time) strings was written in.
 * <p>
 * A fixed set of candidate patterns (see {@code PERMUTATIONS}, each tried with the
 * separators '-', '/', '.' and ' ') is narrowed down in two passes:
 * <ol>
 * <li>every sample must parse non-leniently, and the parsed dates must not go
 *     backwards from one sample to the next;</li>
 * <li>re-formatting the parsed date must reproduce the sample (ignoring spaces and
 *     leading zeros), which distinguishes e.g. 'yy' from 'yyyy'.</li>
 * </ol>
 * Any time-of-day portion is stripped from the samples before matching, so only the
 * date part of the pattern is detected. The formats are created with the default
 * locale, so detection of textual months/days depends on that locale.
 */
public class DateFormatDetector {

    /** Upper bound on the number of samples examined; see {@link #trimIfLarge(List)}. */
    private static final int MAX_SAMPLES_TO_CONSIDER = 60;

    /** Candidate patterns, written with '-' as a placeholder for the separator. */
    private static final List<String> PERMUTATIONS = Arrays.asList(
            "yyyy-MM-dd",
            "yyyy-MMM-dd",
            "yyyy-MMMMM-dd",

            "dd-MM-yyyy",
            "dd-MMM-yyyy",
            "dd-MMMMM-yyyy",

            "dd-MM-yy",
            "dd-MMM-yy",
            "dd-MMMMM-yy",

            "MM-dd-yyyy",
            "MMM-dd-yyyy",
            "MMMMM-dd-yyyy",

            "MM-dd-yy",
            "MMM-dd-yy",
            "MMMMM-dd-yy",

            "yy-MM-dd",
            "yy-MMM-dd",
            "yy-MMMMM-dd",

            // Text months with comma
            "MMM-dd,-yyyy",
            "MMMMM-dd,-yyyy",
            "MMM-dd,-yy",
            "MMMMM-dd,-yy",

            // Text months including day names
            "EEE, dd-MMM-yyyy",
            "EEE, dd-MMMMM-yyyy",
            "EEEE, dd-MMM-yyyy",
            "EEEE, dd-MMMMM-yyyy"
    );

    /** Matches the start of a time-of-day (optionally preceded by ',' / ';' / 'T' / spaces). */
    private static final Pattern TIME_PATTERN = Pattern.compile(
            "[\\,\\;]?[ tT]*\\d{1,2}:\\d{1,2}(:\\d{1,2})?");

    /** A single '0' at the start of a word, e.g. the first zero of "02". */
    private static final Pattern LEADING_ZERO = Pattern.compile("\\b0");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // TODO: Switch DateFormat -> DateTimeFormatter for consistency

    /** One candidate pattern together with its parser and the last date it matched. */
    private static class Attempt {

        /**
         * Roughly 200 years before the epoch. Used as the initial "previous date", so a
         * format that parses a sample into an implausibly old year (e.g. 'yyyy' applied
         * to a two-digit year) is rejected by the ascending-order test.
         */
        private static final Date LONG_AGO =
                new Date(-TimeUnit.MILLISECONDS.convert(365 * 200, TimeUnit.DAYS));

        public final String string;
        public final DateFormat format;
        public Date latestMatchedDate;

        public Attempt(String dateString) {
            string = dateString;
            format = new SimpleDateFormat(dateString);
            format.setLenient(false);
            latestMatchedDate = LONG_AGO;
        }
    }

    private final List<String> dateStringsAscending;

    private final List<Attempt> possibleFormats;

    /**
     * Decremented each time a sample confirms the field lengths of the only remaining
     * candidate. Once it reaches zero, {@link #detectFormat()} stops examining further
     * samples and reports that candidate.
     */
    private int confirmationsRemaining = 10;

    /**
     * Detects the date pattern shared by the given samples.
     *
     * @param dateTimeStringsAscending sample date or date-time strings, ordered from
     *        earliest to latest; must not be null or contain null elements
     * @return the first surviving {@link SimpleDateFormat} pattern (date part only), or
     *         {@code null} if no candidate matches every sample. If several candidates
     *         survive, a warning listing them is written to {@code System.err}.
     */
    public static String detectFormat(List<String> dateTimeStringsAscending) {
        return new DateFormatDetector(dateTimeStringsAscending).detectFormat();
    }

    private DateFormatDetector(List<String> dateTimeStringsAscending) {
        this.dateStringsAscending = stripTime(trimIfLarge(dateTimeStringsAscending));

        possibleFormats = new ArrayList<>(PERMUTATIONS.size() * 4);
        for (String dateString : PERMUTATIONS) {
            possibleFormats.add(new Attempt(dateString));
            possibleFormats.add(new Attempt(dateString.replace('-', '/')));
            possibleFormats.add(new Attempt(dateString.replace('-', '.')));
            possibleFormats.add(new Attempt(dateString.replace('-', ' ')));
        }
    }

    /**
     * Limits the work done on long inputs: lists longer than
     * {@code MAX_SAMPLES_TO_CONSIDER} are reduced to their first and last halves of
     * that many entries. Shorter lists are returned unchanged.
     */
    private static List<String> trimIfLarge(List<String> ss) {
        if (ss.size() <= MAX_SAMPLES_TO_CONSIDER) {
            return ss;
        }

        // Use the start and end of the given list.
        int half = MAX_SAMPLES_TO_CONSIDER / 2;
        List<String> result = new ArrayList<>(MAX_SAMPLES_TO_CONSIDER);
        result.addAll(ss.subList(0, half));
        result.addAll(ss.subList(ss.size() - half, ss.size()));
        return result;
    }

    /** Returns a new list holding each input string with its time portion removed. */
    private static List<String> stripTime(List<String> dateTimesAscending) {
        List<String> result = new ArrayList<>(dateTimesAscending.size());
        for (String s : dateTimesAscending) {
            result.add(stripTime(s));
        }
        return result;
    }

    /**
     * Trims the string and cuts it off at the first match of {@code TIME_PATTERN};
     * everything from the time onwards (including any zone suffix) is discarded.
     */
    private static String stripTime(String dateTime) {
        dateTime = dateTime.trim();
        Matcher m = TIME_PATTERN.matcher(dateTime);
        if (m.find()) {
            return dateTime.substring(0, m.start()).trim();
        }
        return dateTime;
    }

    private String detectFormat() {
        for (String dateString : dateStringsAscending) {
            eliminateFormats(dateString);
        }

        for (String dateString : dateStringsAscending) {
            // Stop once nothing is left to test, or once the sole remaining candidate
            // has been confirmed often enough (early termination with success).
            if (possibleFormats.isEmpty() || confirmationsRemaining <= 0) {
                break;
            }
            eliminateIncorrectFieldLengths(dateString);
        }

        if (possibleFormats.size() > 1) {
            System.err.println("WARNING ambiguous date format. Possibilites:");
            for (Attempt a : possibleFormats) {
                System.err.println(" " + a.string);
            }
        }
        return possibleFormats.isEmpty() ? null : possibleFormats.get(0).string;
    }

    /**
     * This heuristic uses two tests:
     * 1) the non-lenient DateFormat must successfully parse the date, and
     * 2) the parsed date must not be earlier than the previous date parsed by the same
     *    DateFormat instance.
     * Any DateFormat not meeting these tests is removed.
     */
    private void eliminateFormats(String dateString) {
        Iterator<Attempt> it = possibleFormats.iterator();
        while (it.hasNext()) {
            Attempt attempt = it.next();
            Date date = tryParse(attempt.format, dateString);
            if (date != null && date.compareTo(attempt.latestMatchedDate) >= 0) {
                attempt.latestMatchedDate = date;
            } else {
                it.remove();
            }
        }
    }

    /**
     * Sample date strings with 4-digit years will be accepted by formats using either
     * 'yyyy' or 'yy'; similarly, "June" will match both MMM and MMMMM formats.
     * <p>
     * This heuristic parses and re-formats the given date string with each remaining
     * candidate and compares the result with the original. Spaces and leading zeros
     * are dropped for the comparison. Candidates that do not reproduce the sample are
     * removed; a match by the only remaining candidate counts as one confirmation.
     */
    private void eliminateIncorrectFieldLengths(String dateString) {
        String expected = stripZerosSpaces(dateString);
        Iterator<Attempt> it = possibleFormats.iterator();
        while (it.hasNext()) {
            Attempt a = it.next();
            Date parsed = tryParse(a.format, dateString);
            if (parsed == null || !expected.equals(stripZerosSpaces(a.format.format(parsed)))) {
                it.remove();
            } else if (possibleFormats.size() == 1) {
                confirmationsRemaining--;
            }
        }
    }

    /** Parses the text with the given format, returning null when it does not match. */
    private static Date tryParse(DateFormat format, String text) {
        try {
            return format.parse(text);
        } catch (ParseException ignored) {
            // A parse failure simply means this candidate does not fit the sample.
            return null;
        }
    }

    /** Removes one leading zero from each word, then all whitespace. */
    private static String stripZerosSpaces(String dateString) {
        String withoutZeros = LEADING_ZERO.matcher(dateString).replaceAll("");
        return WHITESPACE.matcher(withoutZeros).replaceAll("");
    }

    /** Small manual demo: prints the detected pattern for a handful of sample inputs. */
    public static void main(String[] args) {
        final String[][] TESTS = new String[][] {
            {
                "Sep 5, 2002, 11:41:15 AM",
                "Sep 5, 2002; 11:41:15 AM"
            },
            {
                "03/12/03 10:01:16",
                "22/04/04 11:20:38",
                "21/09/04 13:22:50"
            },
            {
                "2003-08-24 22:51:08"
            },
            {
                "2002 2 22"
            },
            {
                "2002. 02. 2"
            },
            {
                "Sep 5, 2002"
            },
            {
                "November 5, 2002"
            },
            {
                "5 August 99"
            },
            {
                "Fri, 26 Oct 2007 12:56:12 GMT",
                "Fri, 16 Nov 2007 22:40:34 GMT"
            },
            {
                "Friday, 26 Oct 2007 12:56:12 GMT"
            },
            {
                "2012-04-10T09:37:31+01:00",
            }
        };

        for (String[] test : TESTS) {
            String detected = detectFormat(Arrays.asList(test));
            if (detected == null) {
                System.err.println("No matching format for: " + Arrays.toString(test));
            } else {
                System.out.println(detected + " -> " +
                        new SimpleDateFormat(detected).format(new Date()));
            }
        }
    }
}