package ml.shifu.shifu.combo; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; /** * Created by zhanhu on 12/9/16. */ public class ColumnFile { private static Logger LOG = LoggerFactory.getLogger(ColumnFile.class); /** * file location */ private String filePath; /** * file type, CSV format or PigStorage file */ private FileType fileType; /** * file delimiter */ private String delimiter; /** * variables list that will be selected */ private String[] selectedVars; /** * variable mapping, from variable name to output variable name */ private Map<String, String> varsMapping; public ColumnFile(String filePath, FileType fileType, String delimiter, String[] selectedVars, Map<String, String> varsMapping) { this.filePath = filePath; this.fileType = fileType; this.delimiter = delimiter; this.selectedVars = selectedVars; this.varsMapping = varsMapping; } public Map<String, String> getVarsMapping() { return varsMapping; } public String getFilePath() { return filePath; } public FileType getFileType() { return fileType; } public String getDelimiter() { return delimiter; } public String[] getSelectedVars() { return selectedVars; } /* * generate output variables after mapping */ public List<String> getOutputVarNames() { List<String> outputVarNames = new ArrayList<String>(); for(String var: selectedVars) { if(this.varsMapping.containsKey(var)) { outputVarNames.add(this.varsMapping.get(var)); } else { outputVarNames.add(var); } } return outputVarNames; } /* * generate the fields projector for selected variables */ public String genFieldSelector() { List<String> fields = new ArrayList<String>(); for(String var: selectedVars) { if(this.varsMapping.containsKey(var)) { fields.add(var + " as " + this.varsMapping.get(var)); } else { fields.add(var + " as " + var); } } return StringUtils.join(fields, ","); } /* * Load data into memory, only selected data. * The output format is (key, selected-variables) * */ /** * Load data into memory, only selected data. * The output format is (key, selected-variables) * * @param keyName * the key name * @return map results */ public Map<String, List<String>> loadSelectedData(String keyName) { LOG.info("Load data from {}:{} by key {}.", fileType, filePath, keyName); Map<String, List<String>> selectedData = new HashMap<String, List<String>>(); if(FileType.CSV.equals(fileType)) { CsvFile cvsFile = new CsvFile(filePath, delimiter); for(Map<String, String> records: cvsFile) { String key = records.get(keyName); if(key != null) { List<String> vars = new ArrayList<String>(); for(String varName: selectedVars) { vars.add(records.get(varName)); } selectedData.put(key, vars); } } } return selectedData; } /* * Check the selected variables contain some variable */ public boolean hasSelectedVar(String varName) { return ArrayUtils.contains(selectedVars, varName); } public static enum FileType { CSV, PIGSTORAGE } }