package com.taobao.tddl.rule.utils.sample; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Calendar; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; /** * 一个描点集合的抽象。支持线程不安全的笛卡尔积迭代遍历。<br/> * 一个Sample表示一个笛卡尔积抽样,将多列独立的枚举值,用特殊的遍历方法,转换为笛卡尔积抽样(即不同列值的组合) * 一个Sample用一个Map<String, Object>表示,key包含了各列,value对应每列的一个取值 * * <pre> * 算法介绍: * 1. 构建多个列的Iterator * 2. 每个Iterator先取第一个值,此时处于最后一个列 * 3. 遍历开始 * a. 从当前Iterator进行遍历,该列所有值遍历完成后,往前退到上一个列 * b. 上一列移动到下一个值,移动完成后进入下一个列进行遍历,如果无法移动,继续a的操作 * </pre> * * @author linxuan */ public class Samples implements Iterable<Map<String/* 列名 */, Object/* 列值 */>>, Iterator<Map<String, Object>> { private final Map<String, Set<Object>> columnEnumerates; private final String[] subColums; // 使用哪几列,便于做sub private final Set<String> subColumSet; // 与subColums保持一致,便于判,只读 public Samples(Map<String, Set<Object>> columnEnumerates){ this.columnEnumerates = columnEnumerates; this.subColums = columnEnumerates.keySet().toArray(new String[columnEnumerates.size()]); this.subColumSet = columnEnumerates.keySet();// subColumSet只读,这样应该没问题 } public Samples(Map<String, Set<Object>> columnEnumerates, String[] subColumns){ this.columnEnumerates = columnEnumerates; this.subColums = subColumns; this.subColumSet = new HashSet<String>(); this.subColumSet.addAll(Arrays.asList(subColumns)); if (subColumSet.size() != subColums.length) { throw new IllegalArgumentException(Arrays.toString(subColumns) + " has duplicate columm"); } } public Samples(Set<String> columnNames){ this.columnEnumerates = new HashMap<String, Set<Object>>(); for (String name : columnNames) { this.columnEnumerates.put(name, new HashSet<Object>(1)); } this.subColums = columnNames.toArray(new String[columnEnumerates.size()]); this.subColumSet = Collections.unmodifiableSet(columnNames);// subColumSet只读 } /** * @param columns 如果columns包含本对象columnEnumerates中不存在的key,后果不可预期 */ public Samples subSamples(String[] columns) { if (columns.length == this.subColums.length) { return this; // 这里就不判读columns是否都和this一致了,有一定风险 } return new Samples(this.columnEnumerates, columns);// 可能会使第三层sub由小变大,但是不影响使用。也没有判读一致性 } /** * @return 如果subColums和columnEnumerates相同,则直接返回,否则抽取 */ public Map<String, Set<Object>> getColumnEnumerates() { if (this.columnEnumerates.size() == subColums.length) { return this.columnEnumerates; } else { Map<String, Set<Object>> res = new HashMap<String, Set<Object>>(subColums.length); for (String column : subColums) { res.put(column, this.columnEnumerates.get(column)); } return res; } } /** * @return 列个数 */ public int size() { return this.subColums.length; } // ======================== 笛卡尔积迭代遍历的相关操作方法 =========================== /** * 下面是笛卡尔积迭代遍历的实现 */ private Map<String, Object> currentCartesianSample; // currentCartesianProduct当前的笛卡尔值 private Iterator<Object>[] iterators; // 这种方式尾端iterator要反复重新打开,KeyIterator对象会创建比较多。考虑用Object[]加游标 private int cursor; /** * 向一个列添加枚举值 */ public void addEnumerates(String name, Set<Object> values) { if (columnEnumerates.containsKey(name)) { columnEnumerates.get(name).addAll(values); } else { throw new IllegalArgumentException(Arrays.toString(subColums) + ", Samples not contain key:" + name); } } /** * 添加一个Sample组合。若某个列名不在本Samples中,则直接抛空指针 */ public void addSample(Map<String, Object> aCartesianSample) { for (Map.Entry<String, Object> e : aCartesianSample.entrySet()) { columnEnumerates.get(e.getKey()).add(e.getValue()); } } @SuppressWarnings("unchecked") public Iterator<Map<String, Object>> iterator() { // 每次迭代前清空上次迭代状态 currentCartesianSample = new HashMap<String, Object>(subColums.length); iterators = new Iterator[subColums.length]; int i = cursor = 0; for (String name : subColums) { iterators[i++] = columnEnumerates.get(name).iterator(); } return this; } public boolean hasNext() { for (Iterator<Object> it : iterators) { if (it.hasNext()) { return true; } } return false; } /** * 返回结果只能读取。如若修改后果不可预期。 columnSamples每个列的枚举值集合必须至少有一个元素。 */ public Map<String, Object> next() { for (;;) { if (iterators[cursor].hasNext()) { currentCartesianSample.put(subColums[cursor], iterators[cursor].next()); if (cursor == subColums.length - 1) { break; } else { cursor++; } } else { if (cursor == 0) { break; // 全部结束了 } else { // 重新打开当前的iterator备下一轮用 iterators[cursor] = columnEnumerates.get(subColums[cursor]).iterator(); cursor--; } } } return currentCartesianSample; } public void remove() { throw new UnsupportedOperationException(getClass().getName() + ".remove()"); } public Set<Object> getColumnEnumerates(String name) { return columnEnumerates.get(name); } public Set<String> getSubColumSet() { return subColumSet; } public String toString() { SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd"); StringBuilder sb = new StringBuilder("Samples{"); for (String column : this.subColumSet) { sb.append(column).append("=["); for (Object value : this.columnEnumerates.get(column)) { if (value instanceof Calendar) { sb.append(df.format(((Calendar) value).getTime())).append(","); } else { sb.append(value).append(","); } } sb.append("]"); } sb.append("}"); return sb.toString(); } }