/*
* Copyright [2012-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.container.obj;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.google.common.collect.Lists;
import ml.shifu.shifu.util.CommonUtils;
/**
 * SourceData part for ModelConfig.json.
 *
 * <p>
 * Plain mutable bean holding the raw data set settings. Unknown JSON properties are ignored on deserialization
 * (see the {@code @JsonIgnoreProperties} annotation on this class).
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class RawSourceData implements Cloneable {

    /**
     * Where the data is read from: local or hdfs. S3 is not supported so far.
     *
     * @author Zhang David (pengzhang@paypal.com)
     */
    @JsonDeserialize(using = SouceTypeDeserializer.class)
    public static enum SourceType {
        LOCAL, HDFS, S3
    }

    /**
     * If source from local or hdfs; defaults to {@link SourceType#LOCAL}.
     */
    private SourceType source = SourceType.LOCAL;

    /**
     * Data path, from local or hdfs. Folder and file are all supported. Recursive folder is also supported. CSV
     * format file is supported and if csv, header no need to set.
     */
    private String dataPath;

    /**
     * Validation data path which is used in train step for validation data. Such data should have the same schema
     * as {@link #dataPath}. If {@link #validationDataPath} is not empty, specified validation data is enabled and
     * all other sampling parameters have no effect. If empty (by default), such feature is not enabled.
     */
    private String validationDataPath;

    /**
     * How to split data and validation data; defaults to {@code "|"}.
     */
    private String dataDelimiter = "|";

    /**
     * Header path for schema, if null, first line of data will be checked and read to get schema. That's why csv
     * format file works well in Shifu.
     */
    private String headerPath;

    /**
     * How to split header content; defaults to {@code "|"}.
     */
    private String headerDelimiter = "|";

    /**
     * Filter expression on data path and validation data path, this is helpful to filter some data not in original
     * data. Example like 'columna > 10'
     */
    private String filterExpressions = "";

    /**
     * Weight column, should be one of columns
     */
    private String weightColumnName = "";

    /**
     * Target column, should be one of columns
     */
    private String targetColumnName;

    /**
     * Positive tag list: Example like ["0", "1"]. May be null when not configured.
     */
    private List<String> posTags;

    /**
     * Negative tag list: Example like ["2", "3"]. May be null when not configured.
     */
    private List<String> negTags;

    /**
     * Missing or invalid values; defaults to the empty string and {@code "?"}.
     */
    private List<String> missingOrInvalidValues = Lists.asList("", new String[] { "?" });
    // private List<String> missingOrInvalidValues = Lists.asList("", new String[] { "*", "#", "?", "null", "none" });

    /**
     * Auto type column feature, if enabled by tree, shifu will set categorical or numerical feature automatically.
     * Since several false-positive setting categorical features, such feature is disabled by default.
     */
    private Boolean autoType = Boolean.FALSE;

    /**
     * If number ratio over autoTypeThreshold/100, column will be set to numeric when {@link #autoType} is true.
     */
    private Integer autoTypeThreshold = 0;

    /**
     * Meta column configuration file
     */
    private String metaColumnNameFile;

    /**
     * Getter is marked {@code @JsonIgnore} while the setter is marked {@code @JsonProperty}, so the value can be
     * read from JSON but is not written back on serialization.
     *
     * @return the autoTypeThreshold
     */
    @JsonIgnore
    public Integer getAutoTypeThreshold() {
        return autoTypeThreshold;
    }

    /**
     * @param autoTypeThreshold
     *            the autoTypeThreshold to set
     */
    @JsonProperty
    public void setAutoTypeThreshold(Integer autoTypeThreshold) {
        this.autoTypeThreshold = autoTypeThreshold;
    }

    /**
     * @return the source type
     */
    public SourceType getSource() {
        return source;
    }

    /**
     * @param source
     *            the source type to set
     */
    public void setSource(SourceType source) {
        this.source = source;
    }

    /**
     * @return the dataPath
     */
    public String getDataPath() {
        return dataPath;
    }

    /**
     * @param dataPath
     *            the dataPath to set
     */
    public void setDataPath(String dataPath) {
        this.dataPath = dataPath;
    }

    /**
     * @return the validationDataPath
     */
    public String getValidationDataPath() {
        return validationDataPath;
    }

    /**
     * @param validationDataPath
     *            the validationDataPath to set
     */
    public void setValidationDataPath(String validationDataPath) {
        this.validationDataPath = validationDataPath;
    }

    /**
     * @return the dataDelimiter
     */
    public String getDataDelimiter() {
        return dataDelimiter;
    }

    /**
     * @param dataDelimiter
     *            the dataDelimiter to set
     */
    public void setDataDelimiter(String dataDelimiter) {
        this.dataDelimiter = dataDelimiter;
    }

    /**
     * @return the headerPath
     */
    public String getHeaderPath() {
        return headerPath;
    }

    /**
     * @param headerPath
     *            the headerPath to set
     */
    public void setHeaderPath(String headerPath) {
        this.headerPath = headerPath;
    }

    /**
     * @return the headerDelimiter
     */
    public String getHeaderDelimiter() {
        return headerDelimiter;
    }

    /**
     * @param headerDelimiter
     *            the headerDelimiter to set
     */
    public void setHeaderDelimiter(String headerDelimiter) {
        this.headerDelimiter = headerDelimiter;
    }

    /**
     * @return the filterExpressions
     */
    public String getFilterExpressions() {
        return filterExpressions;
    }

    /**
     * @param filterExpressions
     *            the filterExpressions to set
     */
    public void setFilterExpressions(String filterExpressions) {
        this.filterExpressions = filterExpressions;
    }

    /**
     * @return the weightColumnName
     */
    public String getWeightColumnName() {
        return weightColumnName;
    }

    /**
     * @param weightColumnName
     *            the weightColumnName to set
     */
    public void setWeightColumnName(String weightColumnName) {
        this.weightColumnName = weightColumnName;
    }

    /**
     * @return the targetColumnName
     */
    public String getTargetColumnName() {
        return targetColumnName;
    }

    /**
     * @param targetColumnName
     *            the targetColumnName to set
     */
    public void setTargetColumnName(String targetColumnName) {
        this.targetColumnName = targetColumnName;
    }

    /**
     * @return the positive tags, or null if none were set
     */
    public List<String> getPosTags() {
        return posTags;
    }

    /**
     * Stores a trimmed copy of the given tags; the caller's list is not retained.
     *
     * @param posTags
     *            the positive tags to set, may be null
     */
    public void setPosTags(List<String> posTags) {
        this.posTags = trimTags(posTags);
    }

    /**
     * @return the negative tags, or null if none were set
     */
    public List<String> getNegTags() {
        return negTags;
    }

    /**
     * Stores a trimmed copy of the given tags; the caller's list is not retained.
     *
     * @param negTags
     *            the negative tags to set, may be null
     */
    public void setNegTags(List<String> negTags) {
        this.negTags = trimTags(negTags);
    }

    /**
     * Builds a new list in which every tag has been passed through {@link CommonUtils#trimTag(String)}.
     *
     * @param tags
     *            tags to normalize, may be null
     * @return a new list with the normalized tags in the same order, or null if {@code tags} is null
     */
    private List<String> trimTags(List<String> tags) {
        if(tags == null) {
            return null;
        }
        List<String> trimmedTags = new ArrayList<String>(tags.size());
        for(String tag: tags) {
            trimmedTags.add(CommonUtils.trimTag(tag));
        }
        return trimmedTags;
    }

    /**
     * @return the metaColumnNameFile
     */
    public String getMetaColumnNameFile() {
        return metaColumnNameFile;
    }

    /**
     * @param metaColumnNameFile
     *            the metaColumnNameFile to set
     */
    public void setMetaColumnNameFile(String metaColumnNameFile) {
        this.metaColumnNameFile = metaColumnNameFile;
    }

    /**
     * Creates an independent copy of this configuration. All fields are copied, including
     * {@link #validationDataPath}, {@link #autoType} and {@link #autoTypeThreshold}. List-valued fields are copied
     * into new lists so that later changes to one instance's lists do not show up in the other; null lists stay
     * null.
     *
     * @return a new {@link RawSourceData} carrying the same settings
     */
    @Override
    public RawSourceData clone() {
        RawSourceData copy = new RawSourceData();
        copy.setSource(source);
        copy.setDataPath(dataPath);
        copy.setValidationDataPath(validationDataPath);
        copy.setDataDelimiter(dataDelimiter);
        copy.setHeaderPath(headerPath);
        copy.setHeaderDelimiter(headerDelimiter);
        copy.setFilterExpressions(filterExpressions);
        copy.setWeightColumnName(weightColumnName);
        copy.setTargetColumnName(targetColumnName);
        // The tag setters already build a fresh list and pass null through, so unset tags no longer cause a
        // NullPointerException here.
        copy.setPosTags(posTags);
        copy.setNegTags(negTags);
        copy.setMissingOrInvalidValues(
                missingOrInvalidValues == null ? null : new ArrayList<String>(missingOrInvalidValues));
        copy.setAutoType(autoType);
        copy.setAutoTypeThreshold(autoTypeThreshold);
        copy.setMetaColumnNameFile(metaColumnNameFile);
        return copy;
    }

    /**
     * @return the missingOrInvalidValues
     */
    public List<String> getMissingOrInvalidValues() {
        return missingOrInvalidValues;
    }

    /**
     * @param missingOrInvalidValues
     *            the missingOrInvalidValues to set
     */
    public void setMissingOrInvalidValues(List<String> missingOrInvalidValues) {
        this.missingOrInvalidValues = missingOrInvalidValues;
    }

    /**
     * Getter is marked {@code @JsonIgnore} while the setter is marked {@code @JsonProperty}, so the value can be
     * read from JSON but is not written back on serialization.
     *
     * @return the autoType
     */
    @JsonIgnore
    public Boolean getAutoType() {
        return autoType;
    }

    /**
     * @param autoType
     *            the autoType to set
     */
    @JsonProperty
    public void setAutoType(Boolean autoType) {
        this.autoType = autoType;
    }
}