package uk.bl.wa.extract;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Times {
/**
* Extract date
*
* http://stackoverflow.com/questions/6100353/extract-dates-from-web-page
*
* @return Date object
* @throws ParseException
*/
public static Date extractDate(String text) throws ParseException {
Date date = null;
boolean dateFound = false;
String year = null;
String month = null;
String monthName = null;
String day = null;
String hour = null;
String minute = null;
String second = null;
@SuppressWarnings("unused")
String ampm = null;
String regexDelimiter = "[-:\\/.,]";
String regexDay = "((?:[0-2]?\\d{1})|(?:[3][01]{1}))";
String regexMonth = "(?:([0]?[1-9]|[1][012])|(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Sept|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?))";
String regexYear = "((?:[1]{1}\\d{1}\\d{1}\\d{1})|(?:[2]{1}\\d{3}))";
String regexHourMinuteSecond = "(?:(?:\\s)((?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):([0-5][0-9])(?::([0-5][0-9]))?(?:\\s?(am|AM|pm|PM))?)?";
String regexEndswith = "(?![\\d])";
// DD/MM/YYYY
String regexDateEuropean =
regexDay + regexDelimiter + regexMonth + regexDelimiter + regexYear + regexHourMinuteSecond + regexEndswith;
// MM/DD/YYYY
String regexDateAmerican =
regexMonth + regexDelimiter + regexDay + regexDelimiter + regexYear + regexHourMinuteSecond + regexEndswith;
// YYYY/MM/DD
String regexDateTechnical =
regexYear + regexDelimiter + regexMonth + regexDelimiter + regexDay + regexHourMinuteSecond + regexEndswith;
// see if there are any matches
Matcher m = checkDatePattern(regexDateEuropean, text);
if (m.find()) {
day = m.group(1);
month = m.group(2);
monthName = m.group(3);
year = m.group(4);
hour = m.group(5);
minute = m.group(6);
second = m.group(7);
ampm = m.group(8);
dateFound = true;
}
if(!dateFound) {
m = checkDatePattern(regexDateAmerican, text);
if (m.find()) {
month = m.group(1);
monthName = m.group(2);
day = m.group(3);
year = m.group(4);
hour = m.group(5);
minute = m.group(6);
second = m.group(7);
ampm = m.group(8);
dateFound = true;
}
}
if(!dateFound) {
m = checkDatePattern(regexDateTechnical, text);
if (m.find()) {
year = m.group(1);
month = m.group(2);
monthName = m.group(3);
day = m.group(3);
hour = m.group(5);
minute = m.group(6);
second = m.group(7);
ampm = m.group(8);
dateFound = true;
}
}
// construct date object if date was found
if(dateFound) {
String dateFormatPattern = "";
String dayPattern = "";
String dateString = "";
if(day != null) {
dayPattern = "d" + (day.length() == 2 ? "d" : "");
}
if(day != null && month != null && year != null) {
dateFormatPattern = "yyyy MM " + dayPattern;
dateString = year + " " + month + " " + day;
} else if(monthName != null) {
if(monthName.length() == 3) dateFormatPattern = "yyyy MMM " + dayPattern;
else dateFormatPattern = "yyyy MMMM " + dayPattern;
dateString = year + " " + monthName + " " + day;
}
if(hour != null && minute != null) {
//TODO ampm
dateFormatPattern += " hh:mm";
dateString += " " + hour + ":" + minute;
if(second != null) {
dateFormatPattern += ":ss";
dateString += ":" + second;
}
}
if(!dateFormatPattern.equals("") && !dateString.equals("")) {
//TODO support different locales?
SimpleDateFormat dateFormat = new SimpleDateFormat(dateFormatPattern.trim(), Locale.US);
date = dateFormat.parse(dateString.trim());
}
}
return date;
}
private static Matcher checkDatePattern(String regex, String text) {
Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
return p.matcher(text);
}
}