package com.guokr.hebo.tap; import java.io.IOException; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import cascading.flow.FlowProcess; import cascading.flow.planner.Scope; import cascading.scheme.Scheme; import cascading.scheme.SinkCall; import cascading.scheme.SourceCall; import cascading.tap.SinkMode; import cascading.tap.SinkTap; import cascading.tap.Tap; import cascading.tap.TapException; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntrySchemeCollector; import cascading.tuple.TupleException; public abstract class BaseTemplateTap<Config, Output> extends SinkTap<Config, Output> { /** Field OPEN_FILES_THRESHOLD_DEFAULT */ protected static final int OPEN_TAPS_THRESHOLD_DEFAULT = 300; private class TemplateCollector extends TupleEntryCollector { private final FlowProcess<Config> flowProcess; private final Config conf; private final Fields parentFields; private final Fields pathFields; public TemplateCollector(FlowProcess<Config> flowProcess) { super(Fields.asDeclaration(getSinkFields())); this.flowProcess = flowProcess; this.conf = flowProcess.getConfigCopy(); this.parentFields = parent.getSinkFields(); this.pathFields = ((TemplateScheme) getScheme()).pathFields; } private TupleEntryCollector getCollector(String path) { TupleEntryCollector collector = collectors.get(path); if (collector != null) return collector; try { collector = createTupleEntrySchemeCollector(flowProcess, parent, path); flowProcess.increment(Counters.Paths_Opened, 1); } catch (IOException exception) { throw new TapException("unable to open template path: " + path, exception); } if (collectors.size() > openTapsThreshold) purgeCollectors(); collectors.put(path, collector); return collector; } private void purgeCollectors() { int numToClose = Math.max(1, (int) (openTapsThreshold * .10)); Set<String> removeKeys = new HashSet<String>(); Set<String> keys = collectors.keySet(); for (String key : keys) { if (numToClose-- == 0) break; removeKeys.add(key); } for (String removeKey : removeKeys) closeCollector(collectors.remove(removeKey)); flowProcess.increment(Counters.Path_Purges, 1); } @Override public void close() { super.close(); try { for (TupleEntryCollector collector : collectors.values()) closeCollector(collector); } finally { collectors.clear(); } } private void closeCollector(TupleEntryCollector collector) { if (collector == null) return; try { collector.close(); flowProcess.increment(Counters.Paths_Closed, 1); } catch (Exception exception) { // do nothing } } public void add(TupleEntry tupleEntry) { Fields expectedFields = this.tupleEntry.getFields(); this.tupleEntry = tupleEntry; try { collect(tupleEntry); } catch (IOException exception) { throw new TupleException("unable to collect tuple", exception); } } @SuppressWarnings("serial") protected void collect(TupleEntry tupleEntry) throws IOException { System.out.println("collect invoked"); if (pathFields != null) { String datetime = tupleEntry.getString(0); // 0位置为datetime DateTimeFormatter formatter = DateTimeFormat .forPattern("yyyy-MM-dd'T'HH:mm:ssZ"); final DateTime dt = formatter.parseDateTime(datetime); Fields fields = tupleEntry.getFields(); Fields[] timeFields = new Fields[] { new Fields("?year"), new Fields("?month"), new Fields("?day"), new Fields("?hour"), new Fields("?minute") }; fields = fields.append(timeFields); Tuple tuple = tupleEntry.getTuple(); tuple = tuple.append(new Tuple(dt.toString(DateTimeFormat.forPattern("yyyy"))), new Tuple(dt.toString(DateTimeFormat.forPattern("MM"))), new Tuple(dt.toString(DateTimeFormat.forPattern("dd"))), new Tuple(dt.toString(DateTimeFormat.forPattern("HH"))), new Tuple(dt.toString(DateTimeFormat.forPattern("mm")))); TupleEntry heboTupleEntry = new TupleEntry(fields, tuple); Tuple pathValues = heboTupleEntry.selectTuple(pathFields); String path = pathValues.format(pathTemplate); getCollector(path) .add(heboTupleEntry.selectTuple(parentFields)); } else { String path = tupleEntry.getTuple().format(pathTemplate); getCollector(path).add(tupleEntry); } } } /** Field parent */ protected Tap parent; /** Field pathTemplate */ protected String pathTemplate; /** Field keepParentOnDelete */ protected boolean keepParentOnDelete = false; /** Field openTapsThreshold */ protected int openTapsThreshold = OPEN_TAPS_THRESHOLD_DEFAULT; private Granularity input; private Granularity output; /** Field collectors */ private final Map<String, TupleEntryCollector> collectors = new LinkedHashMap<String, TupleEntryCollector>( 1000, .75f, true); protected abstract TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<Config> flowProcess, Tap parent, String path) throws IOException; protected static Fields getPathFields(Granularity input, Granularity output) { Fields pathFields = null; int level = output.getValue() - input.getValue(); switch (level) { case 1: pathFields = new Fields(output.toField()); break; case 2: pathFields = new Fields(output.getPrevious().toField(), output.toField()); break; case 3: pathFields = new Fields(output.getPrevious().getPrevious() .toField(), output.getPrevious().toField(), output.toField()); break; } return pathFields; } protected static String getPathTemplate(Granularity input, Granularity output) { String pathTemplate = null; int level = output.getValue() - input.getValue(); switch (level) { case 1: pathTemplate = "%s"; break; case 2: pathTemplate = "%s/%s"; break; case 3: pathTemplate = "%s/%s/%s"; break; } return pathTemplate; } /** * Method getParent returns the parent Tap of this TemplateTap object. * * @return the parent (type Tap) of this TemplateTap object. */ public Tap getParent() { return parent; } /** * Method getPathTemplate returns the pathTemplate * {@link java.util.Formatter} format String of this TemplateTap object. * * @return the pathTemplate (type String) of this TemplateTap object. */ public String getPathTemplate() { return pathTemplate; } @Override public String getIdentifier() { return parent.getIdentifier(); } /** * Method getOpenTapsThreshold returns the openTapsThreshold of this * TemplateTap object. * * @return the openTapsThreshold (type int) of this TemplateTap object. */ public int getOpenTapsThreshold() { return openTapsThreshold; } @Override public TupleEntryCollector openForWrite(FlowProcess<Config> flowProcess, Output output) throws IOException { return new TemplateCollector(flowProcess); } /** @see cascading.tap.Tap#createResource(Object) */ public boolean createResource(Config conf) throws IOException { return parent.createResource(conf); } /** @see cascading.tap.Tap#deleteResource(Object) */ public boolean deleteResource(Config conf) throws IOException { return keepParentOnDelete || parent.deleteResource(conf); } @Override public boolean commitResource(Config conf) throws IOException { return parent.commitResource(conf); } @Override public boolean rollbackResource(Config conf) throws IOException { return parent.rollbackResource(conf); } /** @see cascading.tap.Tap#resourceExists(Object) */ public boolean resourceExists(Config conf) throws IOException { return parent.resourceExists(conf); } /** @see cascading.tap.Tap#getModifiedTime(Object) */ @Override public long getModifiedTime(Config conf) throws IOException { return parent.getModifiedTime(conf); } @Override public Scope outgoingScopeFor(Set<Scope> incomingScopes) { return new Scope(getSinkFields()); } @Override public boolean equals(Object object) { if (this == object) return true; if (object == null || getClass() != object.getClass()) return false; if (!super.equals(object)) return false; BaseTemplateTap that = (BaseTemplateTap) object; if (parent != null ? !parent.equals(that.parent) : that.parent != null) return false; if (pathTemplate != null ? !pathTemplate.equals(that.pathTemplate) : that.pathTemplate != null) return false; return true; } @Override public int hashCode() { int result = super.hashCode(); result = 31 * result + (parent != null ? parent.hashCode() : 0); result = 31 * result + (pathTemplate != null ? pathTemplate.hashCode() : 0); return result; } @Override public String toString() { return getClass().getSimpleName() + "[\"" + parent + "\"]" + "[\"" + pathTemplate + "\"]"; } public enum Counters { Paths_Opened, Paths_Closed, Path_Purges } protected BaseTemplateTap(Tap parent, Granularity input, Granularity output, int openTapsThreshold) { super(new TemplateScheme(parent.getScheme(), getPathFields(input, output))); this.parent = parent; this.pathTemplate = getPathTemplate(input, output); this.openTapsThreshold = openTapsThreshold; this.input = input; this.output = output; } protected BaseTemplateTap(Tap parent, Granularity input, Granularity output, SinkMode sinkMode) { super(new TemplateScheme(parent.getScheme(), getPathFields(input, output)), sinkMode); this.parent = parent; this.pathTemplate = getPathTemplate(input, output); } protected BaseTemplateTap(Tap parent, Granularity input, Granularity output, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold) { super(new TemplateScheme(parent.getScheme(), getPathFields(input, output)), sinkMode); this.parent = parent; this.pathTemplate = getPathTemplate(input, output); this.keepParentOnDelete = keepParentOnDelete; this.openTapsThreshold = openTapsThreshold; } public static class TemplateScheme<Config, Output> extends Scheme<Config, Void, Output, Void, Void> { private final Scheme scheme; private final Fields pathFields; public TemplateScheme(Scheme scheme) { this.scheme = scheme; this.pathFields = null; } public TemplateScheme(Scheme scheme, Fields pathFields) { this.scheme = scheme; if (pathFields == null || pathFields.isAll()) this.pathFields = null; else if (pathFields.isDefined()) this.pathFields = pathFields; else throw new IllegalArgumentException( "pathFields must be defined or the ALL substitution, got: " + pathFields.printVerbose()); } public Fields getSinkFields() { if (pathFields == null || scheme.getSinkFields().isAll()) return scheme.getSinkFields(); return Fields.merge(scheme.getSinkFields(), pathFields); } public void setSinkFields(Fields sinkFields) { scheme.setSinkFields(sinkFields); } public Fields getSourceFields() { return scheme.getSourceFields(); } public void setSourceFields(Fields sourceFields) { scheme.setSourceFields(sourceFields); } public int getNumSinkParts() { return scheme.getNumSinkParts(); } public void setNumSinkParts(int numSinkParts) { scheme.setNumSinkParts(numSinkParts); } @Override public void sourceConfInit(FlowProcess<Config> flowProcess, Tap<Config, Void, Output> tap, Config conf) { scheme.sourceConfInit(flowProcess, tap, conf); } @Override public void sourcePrepare(FlowProcess<Config> flowProcess, SourceCall<Void, Void> sourceCall) throws IOException { scheme.sourcePrepare(flowProcess, sourceCall); } @Override public boolean source(FlowProcess<Config> flowProcess, SourceCall<Void, Void> sourceCall) throws IOException { throw new UnsupportedOperationException("not supported"); } @Override public void sourceCleanup(FlowProcess<Config> flowProcess, SourceCall<Void, Void> sourceCall) throws IOException { scheme.sourceCleanup(flowProcess, sourceCall); } @Override public void sinkConfInit(FlowProcess<Config> flowProcess, Tap<Config, Void, Output> tap, Config conf) { scheme.sinkConfInit(flowProcess, tap, conf); } @Override public void sinkPrepare(FlowProcess<Config> flowProcess, SinkCall<Void, Output> sinkCall) throws IOException { scheme.sinkPrepare(flowProcess, sinkCall); } @Override public void sink(FlowProcess<Config> flowProcess, SinkCall<Void, Output> sinkCall) throws IOException { throw new UnsupportedOperationException("should never be called"); } @Override public void sinkCleanup(FlowProcess<Config> flowProcess, SinkCall<Void, Output> sinkCall) throws IOException { scheme.sinkCleanup(flowProcess, sinkCall); } } }