package water.fvec;
import java.util.Set;
import java.util.Map;
import java.util.TreeMap;
import java.util.Iterator;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.DateTimeFormatterBuilder;
import org.joda.time.DateTimeZone;
import water.parser.ValueString;
import water.util.Log;
public abstract class ParseTime {
// Deduce if we are looking at a Date/Time value, or not.
// If so, return time as msec since Jan 1, 1970 or Long.MIN_VALUE.
// I tried java.util.SimpleDateFormat, but it just throws too many
// exceptions, including ParseException, NumberFormatException, and
// ArrayIndexOutOfBoundsException... and the Piece de resistance: a
// ClassCastException deep in the SimpleDateFormat code:
// "sun.util.calendar.Gregorian$Date cannot be cast to sun.util.calendar.JulianCalendar$Date"
public static int digit( int x, int c ) {
if( x < 0 || c < '0' || c > '9' ) return -1;
return x*10+(c-'0');
}
// So I just brutally parse "dd-MMM-yy".
public static final byte MMS[][][] = new byte[][][] {
{"jan".getBytes(),"january" .getBytes()},
{"feb".getBytes(),"february" .getBytes()},
{"mar".getBytes(),"march" .getBytes()},
{"apr".getBytes(),"april" .getBytes()},
{"may".getBytes(),"may" .getBytes()},
{"jun".getBytes(),"june" .getBytes()},
{"jul".getBytes(),"july" .getBytes()},
{"aug".getBytes(),"august" .getBytes()},
{"sep".getBytes(),"september".getBytes()},
{"oct".getBytes(),"october" .getBytes()},
{"nov".getBytes(),"november" .getBytes()},
{"dec".getBytes(),"december" .getBytes()}
};
// Time parse patterns
public static final String TIME_PARSE[] = { "yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss.SSS", "dd-MMM-yy" };
// Returns:
// - not a time parse: Long.MIN_VALUE
// - time parse via pattern X: time in msecs since Jan 1, 1970, shifted left by 1 byte, OR'd with X
public static long encodeTimePat(long tcode, int tpat ) { return (tcode<<8)|tpat; }
public static long decodeTime(long tcode ) { return tcode>>8; }
public static int decodePat (long tcode ) { return ((int)tcode&0xFF); }
public static long attemptTimeParse( ValueString str ) {
try {
long t0 = attemptTimeParse_01(str); // "yyyy-MM-dd" and that plus " HH:mm:ss.SSS"
if( t0 != Long.MIN_VALUE ) return t0;
long t2 = attemptTimeParse_2 (str); // "dd-MMM-yy"
if( t2 != Long.MIN_VALUE ) return t2;
} catch( org.joda.time.IllegalFieldValueException ignore ) { }
return Long.MIN_VALUE;
}
// So I just brutally parse "yyyy-MM-dd HH:mm:ss.SSS"
private static long attemptTimeParse_01( ValueString str ) {
final byte[] buf = str.get_buf();
int i=str.get_off();
final int end = i+str.get_length();
while( i < end && buf[i] == ' ' ) i++;
if ( i < end && buf[i] == '"' ) i++;
if( (end-i) != 10 && (end-i) < 19 ) return Long.MIN_VALUE;
int yy=0, MM=0, dd=0, HH=0, mm=0, ss=0, SS=0;
yy = digit(yy,buf[i++]);
yy = digit(yy,buf[i++]);
yy = digit(yy,buf[i++]);
yy = digit(yy,buf[i++]);
if( yy < 1970 ) return Long.MIN_VALUE;
if( buf[i++] != '-' ) return Long.MIN_VALUE;
MM = digit(MM,buf[i++]);
MM = digit(MM,buf[i++]);
if( MM < 1 || MM > 12 ) return Long.MIN_VALUE;
if( buf[i++] != '-' ) return Long.MIN_VALUE;
dd = digit(dd,buf[i++]);
dd = digit(dd,buf[i++]);
if( dd < 1 || dd > 31 ) return Long.MIN_VALUE;
if( i==end )
return encodeTimePat(new DateTime(yy,MM,dd,0,0,0,getTimezone()).getMillis(),0);
if( buf[i++] != ' ' ) return Long.MIN_VALUE;
HH = digit(HH,buf[i++]);
HH = digit(HH,buf[i++]);
if( HH < 0 || HH > 23 ) return Long.MIN_VALUE;
if( buf[i++] != ':' ) return Long.MIN_VALUE;
mm = digit(mm,buf[i++]);
mm = digit(mm,buf[i++]);
if( mm < 0 || mm > 59 ) return Long.MIN_VALUE;
if( buf[i++] != ':' ) return Long.MIN_VALUE;
ss = digit(ss,buf[i++]);
ss = digit(ss,buf[i++]);
if( ss < 0 || ss > 59 ) return Long.MIN_VALUE;
if( i<end && buf[i] == '.' ) {
i++;
if( i<end ) SS = digit(SS,buf[i++]);
if( i<end ) SS = digit(SS,buf[i++]);
if( i<end ) SS = digit(SS,buf[i++]);
if( SS < 0 || SS > 999 ) return Long.MIN_VALUE;
}
if( i<end && buf[i] == '"' ) i++;
if( i<end ) return Long.MIN_VALUE;
return encodeTimePat(new DateTime(yy,MM,dd,HH,mm,ss,getTimezone()).getMillis()+SS,1);
}
// DD-MMM-YY
private static long attemptTimeParse_2( ValueString str ) {
final byte[] buf = str.get_buf();
int i=str.get_off();
final int end = i+str.get_length();
while( i < end && buf[i] == ' ' ) i++;
if ( i < end && buf[i] == '"' ) i++;
if( (end-i) < 7 ) return Long.MIN_VALUE; // Shortest date: d-mm-yy, only 7 chars
int yy=0, MM=0, dd=0;
dd = digit(dd,buf[i++]);
if( buf[i] != '-' ) dd = digit(dd,buf[i++]);
if( dd < 1 || dd > 31 ) return Long.MIN_VALUE;
if( buf[i++] != '-' ) return Long.MIN_VALUE;
byte[]mm=null;
OUTER: for( ; MM<MMS.length; MM++ ) {
byte[][] mms = MMS[MM];
INNER: for( int k=0; k<mms.length; k++ ) {
mm = mms[k];
if( mm == null ) continue;
if( i+mm.length >= end ) continue INNER;
for( int j=0; j<mm.length; j++ )
if( mm[j] != Character.toLowerCase(buf[i+j]) )
continue INNER;
if( buf[i+mm.length] == '-' ) break OUTER;
}
}
if( MM == MMS.length ) return Long.MIN_VALUE; // No matching month
i += mm.length; // Skip month bytes
MM++; // 1-based month
if( buf[i++] != '-' ) return Long.MIN_VALUE;
yy = digit(yy,buf[i++]); // 2-digit year
if( i >= buf.length ) return Long.MIN_VALUE;
yy = digit(yy,buf[i++]);
if( end-i>=2 && buf[i] != '"' ) {
if( i >= buf.length+1 ) return Long.MIN_VALUE;
yy = digit(yy,buf[i++]); // 4-digit year
yy = digit(yy,buf[i++]);
} else {
yy += 2000; // Y2K bug
}
if( i<end && buf[i] == '"' ) i++;
if( i<end ) return Long.MIN_VALUE;
return encodeTimePat(new DateTime(yy,MM,dd,0,0,0,getTimezone()).getMillis(),2);
}
// Parse XXXXXXXX-XXXX-XXXX and return an arbitrary long, or set str.off==-1
// (and return Long.MIN_VALUE but this is a valid long return value).
public static long attemptUUIDParse0( ValueString str ) {
final byte[] buf = str.get_buf();
int i=str.get_off();
if( i+36>buf.length ) return badUUID(str);
long lo=0;
lo = get2(lo,buf,(i+=2)-2);
lo = get2(lo,buf,(i+=2)-2);
lo = get2(lo,buf,(i+=2)-2);
lo = get2(lo,buf,(i+=2)-2);
if( buf[i++]!='-' ) return badUUID(str);
lo = get2(lo,buf,(i+=2)-2);
lo = get2(lo,buf,(i+=2)-2);
if( buf[i++]!='-' ) return badUUID(str);
lo = get2(lo,buf,(i+=2)-2);
return attemptUUIDParseLast(str,lo,buf,i);
}
// Parse -XXXX-XXXXXXXXXXXX and return an arbitrary long, or set str.off==-1
// (and return Long.MIN_VALUE but this is a valid long return value).
public static long attemptUUIDParse1( ValueString str ) {
final byte[] buf = str.get_buf();
int i=str.get_off();
if( i== -1 ) return badUUID(str);
long hi=0;
if( buf[i++]!='-' ) return badUUID(str);
hi = get2(hi,buf,(i+=2)-2);
hi = get2(hi,buf,(i+=2)-2);
if( buf[i++]!='-' ) return badUUID(str);
hi = get2(hi,buf,(i+=2)-2);
hi = get2(hi,buf,(i+=2)-2);
hi = get2(hi,buf,(i+=2)-2);
hi = get2(hi,buf,(i+=2)-2);
hi = get2(hi,buf,(i+=2)-2);
return attemptUUIDParseLast(str,hi,buf,i);
}
private static long attemptUUIDParseLast( ValueString str, long lo, byte[] buf, int i ) {
// Can never equal MIN_VALUE since only parsed 14 of 16 digits, unless
// failed parse already.
if( lo == Long.MIN_VALUE ) return badUUID(str);
// If the last 2 digits are 0x8000 and the first 14 are all 0's then might
// legitimately parse MIN_VALUE, need to check for it special.
str.setOff(i+2); // Mark as parsed
if( lo == 0x80000000000000L && buf[i]=='0' && buf[i+1]=='0' )
return Long.MIN_VALUE; // Valid MIN_VALUE parse
// First 14 digits are a random scramble; will never equal MIN_VALUE result
// unless we have a failed parse in the last 2 digits
lo = get2(lo,buf,i);
return (lo == Long.MIN_VALUE || // broken UUID already, OR
// too many valid UUID digits
(i+2< buf.length && hdigit(0,buf[i+2]) != Long.MIN_VALUE)) ? badUUID(str) : lo;
}
private static long get2( long x, byte[] buf, int i ) {
if( x == Long.MIN_VALUE ) return x;
x = hdigit(x,buf[i++]);
x = hdigit(x,buf[i++]);
return x;
}
private static long hdigit( long x, byte b ) {
if( x == Long.MIN_VALUE ) return Long.MIN_VALUE;
else if( b >= '0' && b <= '9' ) return (x<<4)+b-'0';
else if( b >= 'A' && b <= 'F' ) return (x<<4)+b-'A'+10;
else if( b >= 'a' && b <= 'f' ) return (x<<4)+b-'a'+10;
else return Long.MIN_VALUE;
}
public static long badUUID( ValueString str ) {
str.setOff(-1);
return Long.MIN_VALUE;
}
private static DateTimeZone _timezone;
public static void setTimezone(String tz) {
Set<String> idSet = DateTimeZone.getAvailableIDs();
if(idSet.contains(tz))
_timezone = DateTimeZone.forID(tz);
else
Log.err("Attempted to set unrecognized timezone: "+ tz);
}
public static DateTimeZone getTimezone() {
return _timezone == null ? DateTimeZone.getDefault() : _timezone;
}
public static String listTimezones() {
DateTimeFormatter offsetFormatter = new DateTimeFormatterBuilder().appendTimeZoneOffset(null, true, 2, 4).toFormatter();
Set<String> idSet = DateTimeZone.getAvailableIDs();
Map<String, String> tzMap = new TreeMap();
Iterator<String> it = idSet.iterator();
String id, cid, offset, key, output;
DateTimeZone tz;
int i = 0;
long millis = System.currentTimeMillis();
// collect canonical and alias IDs into a map
while (it.hasNext()) {
id = it.next();
tz = DateTimeZone.forID(id);
cid = tz.getID();
offset = offsetFormatter.withZone(tz).print(tz.getStandardOffset(millis));
key = offset + " " + cid;
if (id == cid) { // Canonical ID
if (!tzMap.containsKey(key)) tzMap.put(key, "");
} else {// alias ID
if (!tzMap.containsKey(key)) tzMap.put(key, id);
else tzMap.put(key, tzMap.get(key) + ", " + id);
}
}
// assemble result
output = "StandardOffset CanonicalID, Aliases\n";
for (Map.Entry<String, String> e : tzMap.entrySet())
output += e.getKey() + e.getValue()+"\n";
return output;
}
/**
* Factory to create a formatter from a strptime pattern string.
* This models the commonly supported features of strftime from POSIX
* (where it can).
* <p>
* The format may contain locale specific output, and this will change as
* you change the locale of the formatter.
* Call DateTimeFormatter.withLocale(Locale) to switch the locale.
* For example:
* <pre>
* DateTimeFormat.forPattern(pattern).withLocale(Locale.FRANCE).print(dt);
* </pre>
*
* @param pattern pattern specification
* @return the formatter
* @throws IllegalArgumentException if the pattern is invalid
*/
public static DateTimeFormatter forStrptimePattern(String pattern) {
if (pattern == null || pattern.length() == 0)
throw new IllegalArgumentException("Empty date time pattern specification");
DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder();
parseToBuilder(builder, pattern);
DateTimeFormatter formatter = builder.toFormatter();
return formatter;
}
//-----------------------------------------------------------------------
/**
* Parses the given pattern and appends the rules to the given
* DateTimeFormatterBuilder. See strptime man page for valid patterns.
*
* @param pattern pattern specification
* @throws IllegalArgumentException if the pattern is invalid
*/
private static void parseToBuilder(DateTimeFormatterBuilder builder, String pattern) {
int length = pattern.length();
int[] indexRef = new int[1];
for (int i=0; i<length; i++) {
indexRef[0] = i;
String token = parseToken(pattern, indexRef);
i = indexRef[0];
int tokenLen = token.length();
if (tokenLen == 0) {
break;
}
char c = token.charAt(0);
if (c == '%' && token.charAt(1) != '%') {
c = token.charAt(1);
switch(c) {
case 'a':
builder.appendDayOfWeekShortText();
break;
case 'A':
builder.appendDayOfWeekText();
break;
case 'b':
case 'h':
builder.appendMonthOfYearShortText();
break;
case 'B':
builder.appendMonthOfYearText();
break;
case 'c':
builder.appendDayOfWeekShortText();
builder.appendLiteral(' ');
builder.appendMonthOfYearShortText();
builder.appendLiteral(' ');
builder.appendDayOfMonth(2);
builder.appendLiteral(' ');
builder.appendHourOfDay(2);
builder.appendLiteral(':');
builder.appendMinuteOfHour(2);
builder.appendLiteral(':');
builder.appendSecondOfMinute(2);
builder.appendLiteral(' ');
builder.appendYear(4,4);
break;
case 'C':
builder.appendCenturyOfEra(1,2);
break;
case 'd':
builder.appendDayOfMonth(2);
break;
case 'D':
builder.appendMonthOfYear(2);
builder.appendLiteral('/');
builder.appendDayOfMonth(2);
builder.appendLiteral('/');
builder.appendTwoDigitYear(2019);
break;
case 'e':
builder.appendOptional(DateTimeFormat.forPattern("' '").getParser());
builder.appendDayOfMonth(2);
break;
case 'F':
builder.appendYear(4,4);
builder.appendLiteral('-');
builder.appendMonthOfYear(2);
builder.appendLiteral('-');
builder.appendDayOfMonth(2);
break;
case 'g':
case 'G':
break; //for output only, accepted and ignored for input
case 'H':
builder.appendHourOfDay(2);
break;
case 'I':
builder.appendClockhourOfHalfday(2);
break;
case 'j':
builder.appendDayOfYear(3);
break;
case 'k':
builder.appendOptional(DateTimeFormat.forPattern("' '").getParser());
builder.appendHourOfDay(2);
break;
case 'l':
builder.appendOptional(DateTimeFormat.forPattern("' '").getParser());
builder.appendClockhourOfHalfday(2);
break;
case 'm':
builder.appendMonthOfYear(2);
break;
case 'M':
builder.appendMinuteOfHour(2);
break;
case 'n':
break;
case 'p':
builder.appendHalfdayOfDayText();
break;
case 'r':
builder.appendClockhourOfHalfday(2);
builder.appendLiteral(':');
builder.appendMinuteOfHour(2);
builder.appendLiteral(':');
builder.appendSecondOfMinute(2);
builder.appendLiteral(' ');
builder.appendHalfdayOfDayText();
break;
case 'R':
builder.appendHourOfDay(2);
builder.appendLiteral(':');
builder.appendMinuteOfHour(2);
break;
case 'S':
builder.appendSecondOfMinute(2);
break;
case 't':
break;
case 'T':
builder.appendHourOfDay(2);
builder.appendLiteral(':');
builder.appendMinuteOfHour(2);
builder.appendLiteral(':');
builder.appendSecondOfMinute(2);
break;
/* case 'U': //FIXME Joda does not support US week start (Sun), this will be wrong
builder.appendWeekOfYear(2);
break;
case 'u':
builder.appendDayOfWeek(1);
break;*/
case 'V':
break; //accepted and ignored
/* case 'w': //FIXME Joda does not support US week start (Sun), this will be wrong
builder.appendDayOfWeek(1);
break;
case 'W':
builder.appendWeekOfYear(2);
break;*/
case 'x':
builder.appendTwoDigitYear(2019);
builder.appendLiteral('/');
builder.appendMonthOfYear(2);
builder.appendLiteral('/');
builder.appendDayOfMonth(2);
break;
/* case 'X': //Results differ between OSX and Linux
builder.appendHourOfDay(2);
builder.appendLiteral(':');
builder.appendMinuteOfHour(2);
builder.appendLiteral(':');
builder.appendSecondOfMinute(2);
break;*/
case 'y': //POSIX 2004 & 2008 says 69-99 -> 1900s, 00-68 -> 2000s
builder.appendTwoDigitYear(2019);
break;
case 'Y':
builder.appendYear(4,4);
break;
case 'z':
builder.appendTimeZoneOffset(null, "z", false, 2, 2);
break;
case 'Z':
break; //for output only, accepted and ignored for input
default: // No match, ignore
builder.appendLiteral('\'');
builder.appendLiteral(token);
Log.warn(token + "is not acceptted as a parse token, treating as a literal");
}
} else {
if (c == '\'') {
String sub = token.substring(1);
if (sub.length() > 0) {
// Create copy of sub since otherwise the temporary quoted
// string would still be referenced internally.
builder.appendLiteral(new String(sub));
}
} else throw new IllegalArgumentException("Unexpected token encountered parsing format string:" + c);
}
}
}
/**
* Parses an individual token.
*
* @param pattern the pattern string
* @param indexRef a single element array, where the input is the start
* location and the output is the location after parsing the token
* @return the parsed token
*/
private static String parseToken(String pattern, int[] indexRef) {
StringBuilder buf = new StringBuilder();
int i = indexRef[0];
int length = pattern.length();
char c = pattern.charAt(i);
if (c == '%' && i + 1 < length && pattern.charAt(i+1) != '%') {
//Grab pattern tokens
c = pattern.charAt(++i);
//0 is ignored for input, and this ignores alternative religious eras
if ((c == '0' || c == 'E') && i + 1 >= length) c = pattern.charAt(++i);
buf.append('%');
buf.append(c);
} else { // Grab all else as text
buf.append('\''); // mark literals with ' in first place
buf.append(c);
for (i++; i < length;i++) {
c = pattern.charAt(i);
if (c == '%' ) { // consume literal % otherwise break
if (i + 1 < length && pattern.charAt(i + 1) == '%') i++;
else { i--; break; }
}
buf.append(c);
}
}
indexRef[0] = i;
return buf.toString();
}
}