/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
/**
* <p>This operator transforms the specified date attribute and writes a new nominal attribute
* in a user-specified format. This might be useful for time-based OLAP to change the granularity of
* the time stamps from day to week or month.</p>
*
* <p>The date format can be specified by the date_format parameter as described below.</p>
*
* <h4>Date and Time Patterns</h4>
*
* <p>
* Date and time formats are specified by <em>date and time pattern</em>
* strings in the date_format parameter.
* Within date and time pattern strings, unquoted letters from
* <code>'A'</code> to <code>'Z'</code> and from <code>'a'</code> to
* <code>'z'</code> are interpreted as pattern letters representing the
* components of a date or time string.
* Text can be quoted using single quotes (<code>'</code>) to avoid
* interpretation.
* <code>"''"</code> represents a single quote.
* All other characters are not interpreted; they're simply copied into the
* output string during formatting or matched against the input string
* during parsing.</p>
*
* <p>
* The following pattern letters are defined (all other characters from
* <code>'A'</code> to <code>'Z'</code> and from <code>'a'</code> to
* <code>'z'</code> are reserved):</p>
*
* <ul>
* <li><em>G</em>: era designator; Text; example: AD</li>
* <li><em>y</em>: year; Year; example: 1996; 96</li>
* <li><em>M</em>: month in year; Month; example: July; Jul; 07</li>
* <li><em>w</em>: week in year; Number; example: 27</li>
* <li><em>W</em>: week in month; Number; example: 2</li>
* <li><em>D</em>: day in year; Number; example: 189</li>
* <li><em>d</em>: day in month; Number; example: 10</li>
* <li><em>F</em>: day of week in month; Number; example: 2</li>
* <li><em>E</em>: day in week; Text; example: Tuesday; Tue</li>
* <li><em>a</em>: am/pm marker; Text; example: PM</li>
* <li><em>H</em>: hour in day (0-23); Number; example: 0</li>
* <li><em>k</em>: hour in day (1-24); Number; example: 24</li>
* <li><em>K</em>: hour in am / pm (0-11); Number; example: 0</li>
* <li><em>h</em>: hour in am / pm (1-12); Number; example: 12</li>
* <li><em>m</em>: minute in hour; Number; example: 30</li>
* <li><em>s</em>: second in minute; Number; example: 55</li>
* <li><em>S</em>: millisecond; Number; example: 978</li>
* <li><em>z</em>: time zone; General Time Zone; example: Pacific Standard Time; PST; GMT-08:00</li>
* <li><em>Z</em>: time zone; RFC 822 Time Zone; example: -0800</li>
* </ul>
*
* <p>Pattern letters are usually repeated, as their number determines the
* exact presentation:</p>
*
* <ul>
* <li><em>Text:</em>
* For formatting, if the number of pattern letters is 4 or more,
* the full form is used; otherwise a short or abbreviated form
* is used if available.
* For parsing, both forms are accepted, independent of the number
* of pattern letters.</li>
* <li><em>Number:</em>
* For formatting, the number of pattern letters is the minimum
* number of digits, and shorter numbers are zero-padded to this amount.
* For parsing, the number of pattern letters is ignored unless
* it's needed to separate two adjacent fields.</li>
* <li><em>Year:</em>
* If the underlying calendar is the Gregorian calendar, the following
* rules are applied.
*
* <ul>
* <li>For formatting, if the number of pattern letters is 2, the year
* is truncated to 2 digits; otherwise it is interpreted as a
* <em>number</em>.</li>
* <li>For parsing, if the number of pattern letters is more than 2,
* the year is interpreted literally, regardless of the number of
* digits. So using the pattern "MM/dd/yyyy", "01/11/12" parses to
* Jan 11, 12 A.D.</li>
* <li>For parsing with the abbreviated year pattern ("y" or "yy"),
* this operator must interpret the abbreviated year
* relative to some century. It does this by adjusting dates to be
* within 80 years before and 20 years after the time the operator
* is created. For example, using a pattern of "MM/dd/yy" and the
* operator created on Jan 1, 1997, the string
* "01/11/12" would be interpreted as Jan 11, 2012 while the string
* "05/04/64"
* would be interpreted as May 4, 1964.
* During parsing, only strings consisting of exactly two digits will be
* parsed into the default century.
* Any other numeric string, such as a one digit string, a three or more digit
* string, or a two digit string that isn't all digits (for example, "-1"), is
* interpreted literally. So "01/02/3" or "01/02/003" are parsed, using the
* same pattern, as Jan 2, 3 AD. Likewise, "01/02/-3" is parsed as Jan 2, 4 BC.</li>
* </ul>
*
* Otherwise, calendar system specific forms are applied.
* If the number of pattern
* letters is 4 or more, a calendar specific long form is used. Otherwise, a calendar
* short or abbreviated form is used.</li>
*
* <li><em>Month:</em>
* If the number of pattern letters is 3 or more, the month is
* interpreted as <em>text</em>; otherwise,
* it is interpreted as a <em>number</em>.</li>
*
* <li><em>General time zone:</em>
* Time zones are interpreted as <em>text</em> if they have
* names. It is possible to define time zones by representing a GMT offset value.
* RFC 822 time zones are also accepted.</li>
*
* <li><em>RFC 822 time zone:</em>
* For formatting, the RFC 822 4-digit time zone format is used.
* General time zones are also accepted.</li>
* </ul>
*
* <p>The locale parameter selects the locale whose date format symbols are used for
* the text components of the output, for example month and day names. The pattern
* itself is always written with the pattern letters described above; locale
* dependent (localized) pattern letters are not interpreted.</p>
*
* <h4>Examples</h4>
*
* <p>The following examples show how date and time patterns are interpreted in
* the U.S. locale. The given date and time are 2001-07-04 12:08:56 local time
* in the U.S. Pacific Time time zone.</p>
*
* <ul>
* <li><em>"yyyy.MM.dd G 'at' HH:mm:ss z"</em>: 2001.07.04 AD at 12:08:56 PDT</li>
* <li><em>"EEE, MMM d, ''yy"</em>: Wed, Jul 4, '01</li>
* <li><em>"h:mm a"</em>: 12:08 PM</li>
* <li><em>"hh 'o''clock' a, zzzz"</em>: 12 o'clock PM, Pacific Daylight Time</li>
* <li><em>"K:mm a, z"</em>: 0:08 PM, PDT</li>
* <li><em>"yyyyy.MMMMM.dd GGG hh:mm aaa"</em>: 02001.July.04 AD 12:08 PM</li>
* <li><em>"EEE, d MMM yyyy HH:mm:ss Z"</em>: Wed, 4 Jul 2001 12:08:56 -0700</li>
* <li><em>"yyMMddHHmmssZ"</em>: 010704120856-0700</li>
* <li><em>"yyyy-MM-dd'T'HH:mm:ss.SSSZ"</em>: 2001-07-04T12:08:56.235-0700</li>
* </ul>
*
* @author Ingo Mierswa
* @version $Id: Date2Nominal.java,v 1.6 2008/08/11 15:11:13 tobiasmalbrecht Exp $
*/
public class Date2Nominal extends Operator {

    /** Name of the parameter holding the name of the date attribute to convert. */
    public static final String PARAMETER_ATTRIBUTE_NAME = "attribute_name";

    /** Name of the parameter holding the date and time pattern used for formatting. */
    public static final String PARAMETER_DATE_FORMAT = "date_format";

    /** Name of the category parameter selecting an entry of {@link #availableLocales}. */
    public static final String PARAMETER_LOCALE = "locale";

    /** Name of the parameter deciding whether the original date attribute is kept. */
    public static final String PARAMETER_KEEP_OLD_ATTRIBUTE = "keep_old_attribute";

    /** All locales reported by the JVM, sorted by their display name. */
    public static List<Locale> availableLocales = new ArrayList<Locale>();

    /** Display names of {@link #availableLocales}, in the same order. */
    public static String[] availableLocaleNames;

    /** Index of {@link Locale#US} in {@link #availableLocales}, or 0 if it is not contained. */
    public static int defaultLocale;

    static {
        Collections.addAll(availableLocales, Locale.getAvailableLocales());

        // The index-based locale parameter relies on this order, so names and
        // locales are derived from the same sorted list.
        Collections.sort(availableLocales, new Comparator<Locale>() {
            public int compare(Locale o1, Locale o2) {
                return o1.getDisplayName().compareTo(o2.getDisplayName());
            }
        });

        availableLocaleNames = new String[availableLocales.size()];
        defaultLocale = -1;
        for (int i = 0; i < availableLocales.size(); i++) {
            Locale currentLocale = availableLocales.get(i);
            availableLocaleNames[i] = currentLocale.getDisplayName();
            if (currentLocale.equals(Locale.US)) {
                defaultLocale = i;
            }
        }
        // Fall back to the first entry if the US locale is not available.
        if (defaultLocale < 0) {
            defaultLocale = 0;
        }
    }

    /**
     * Creates the operator.
     *
     * @param description the operator description passed on to the super class
     */
    public Date2Nominal(OperatorDescription description) {
        super(description);
    }

    /**
     * Formats every value of the selected date attribute with the configured pattern
     * and locale and stores the resulting strings in a new nominal attribute. Missing
     * values (NaN) stay missing. Depending on the keep_old_attribute parameter the
     * original attribute is either removed, or kept while the new attribute is
     * renamed to <code>&lt;attribute_name&gt;_nominal</code>.
     *
     * <p>All parameters are read and the formatter is built before the example set is
     * modified, so an invalid parameter does not leave a half-created attribute behind.</p>
     *
     * @return an array containing the modified input example set
     * @throws OperatorException a {@link UserError} with code 111 if the example set has
     *         no attribute with the given name; other operator exceptions may be raised
     *         while reading the input or the parameters
     * @throws IllegalArgumentException if the date_format parameter is not a valid
     *         {@link SimpleDateFormat} pattern
     */
    public IOObject[] apply() throws OperatorException {
        ExampleSet exampleSet = getInput(ExampleSet.class);

        String attributeName = getParameterAsString(PARAMETER_ATTRIBUTE_NAME);
        Attribute dateAttribute = exampleSet.getAttributes().get(attributeName);
        if (dateAttribute == null) {
            throw new UserError(this, 111, attributeName);
        }

        // Resolve and validate every parameter before the example set is touched.
        String dateFormat = getParameterAsString(PARAMETER_DATE_FORMAT);
        int localeIndex = getParameterAsInt(PARAMETER_LOCALE);
        Locale selectedLocale = Locale.US;
        if ((localeIndex >= 0) && (localeIndex < availableLocales.size())) {
            selectedLocale = availableLocales.get(localeIndex);
        }
        // Throws IllegalArgumentException for an invalid pattern.
        SimpleDateFormat formatter = new SimpleDateFormat(dateFormat, selectedLocale);
        boolean keepOldAttribute = getParameterAsBoolean(PARAMETER_KEEP_OLD_ATTRIBUTE);

        // NOTE(review): until the old attribute is removed or the new one is renamed
        // below, both attributes carry the same name - confirm that the attribute
        // container tolerates this.
        Attribute newAttribute = AttributeFactory.createAttribute(attributeName, Ontology.NOMINAL);
        exampleSet.getExampleTable().addAttribute(newAttribute);
        exampleSet.getAttributes().addRegular(newAttribute);

        for (Example example : exampleSet) {
            double value = example.getValue(dateAttribute);
            if (Double.isNaN(value)) {
                example.setValue(newAttribute, Double.NaN);
            } else {
                // The stored value is handed to Date as milliseconds since the epoch.
                String formatted = formatter.format(new Date((long) value));
                example.setValue(newAttribute, newAttribute.getMapping().mapString(formatted));
            }
        }

        if (keepOldAttribute) {
            newAttribute.setName(attributeName + "_nominal");
        } else {
            exampleSet.getAttributes().remove(dateAttribute);
        }

        return new IOObject[] { exampleSet };
    }

    /**
     * @return the required input classes: a single {@link ExampleSet}
     */
    public Class<?>[] getInputClasses() {
        return new Class<?>[] { ExampleSet.class };
    }

    /**
     * @return the delivered output classes: a single {@link ExampleSet}
     */
    public Class<?>[] getOutputClasses() {
        return new Class<?>[] { ExampleSet.class };
    }

    /**
     * Extends the parameters of the super class by the attribute name, the date
     * format pattern, the locale and the keep_old_attribute flag.
     *
     * @return the list of parameter types of this operator
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeString(PARAMETER_ATTRIBUTE_NAME, "The attribute which should be parsed.", false));
        types.add(new ParameterTypeString(PARAMETER_DATE_FORMAT, "The output format of the date values, for example \"yyyy/MM/dd\".", false));

        ParameterType type = new ParameterTypeCategory(PARAMETER_LOCALE, "The used locale for date texts, for example \"Wed\" (English) in contrast to \"Mi\" (German).", availableLocaleNames, defaultLocale);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeBoolean(PARAMETER_KEEP_OLD_ATTRIBUTE, "Indicates if the original date attribute should be kept.", false));
        return types;
    }
}