/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.drill.exec.store.easy.text; import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.Map; import org.apache.drill.common.exceptions.ExecutionSetupException; import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.common.logical.FormatPluginConfig; import org.apache.drill.common.logical.StoragePluginConfig; import org.apache.drill.exec.ExecConstants; import org.apache.drill.exec.ops.FragmentContext; import org.apache.drill.exec.physical.base.AbstractGroupScan; import org.apache.drill.exec.physical.base.ScanStats; import org.apache.drill.exec.physical.base.ScanStats.GroupScanProperty; import org.apache.drill.exec.planner.physical.PlannerSettings; import org.apache.drill.exec.proto.ExecProtos.FragmentHandle; import org.apache.drill.exec.proto.UserBitShared.CoreOperatorType; import org.apache.drill.exec.server.DrillbitContext; import org.apache.drill.exec.store.RecordReader; import org.apache.drill.exec.store.RecordWriter; import org.apache.drill.exec.store.dfs.DrillFileSystem; import org.apache.drill.exec.store.dfs.FileSelection; import org.apache.drill.exec.store.dfs.FileSystemConfig; import 
org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
import org.apache.drill.exec.store.dfs.easy.EasyGroupScan;
import org.apache.drill.exec.store.dfs.easy.EasyWriter;
import org.apache.drill.exec.store.dfs.easy.FileWork;
import org.apache.drill.exec.store.easy.text.compliant.CompliantTextRecordReader;
import org.apache.drill.exec.store.easy.text.compliant.TextParsingSettings;
import org.apache.drill.exec.store.schedule.CompleteFileWork;
import org.apache.drill.exec.store.text.DrillTextRecordReader;
import org.apache.drill.exec.store.text.DrillTextRecordWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;

/**
 * Format plugin for delimited text files, built on {@link EasyFormatPlugin}.
 *
 * <p>It creates the record readers and writers for text data and carries the text-specific
 * settings (delimiters, quote/escape/comment characters, header handling) in the nested
 * {@link TextFormatConfig}.
 */
public class TextFormatPlugin extends EasyFormatPlugin<TextFormatPlugin.TextFormatConfig> {
  private final static String DEFAULT_NAME = "text";

  /**
   * Creates the plugin with a default {@link TextFormatConfig} and no file extensions.
   *
   * @param name          plugin name, forwarded to the superclass
   * @param context       drillbit context, forwarded to the superclass
   * @param fsConf        file system configuration, forwarded to the superclass
   * @param storageConfig configuration of the owning storage plugin
   */
  public TextFormatPlugin(String name, DrillbitContext context, Configuration fsConf,
      StoragePluginConfig storageConfig) {
    super(name, context, fsConf, storageConfig, new TextFormatConfig(), true, false, true, true,
        Collections.<String>emptyList(), DEFAULT_NAME);
  }

  /**
   * Creates the plugin with an explicit format configuration; the file extensions handed to the
   * superclass are taken from that configuration.
   *
   * @param name               plugin name, forwarded to the superclass
   * @param context            drillbit context, forwarded to the superclass
   * @param fsConf             file system configuration, forwarded to the superclass
   * @param config             configuration of the owning storage plugin
   * @param formatPluginConfig text format settings to use
   */
  public TextFormatPlugin(String name, DrillbitContext context, Configuration fsConf,
      StoragePluginConfig config, TextFormatConfig formatPluginConfig) {
    super(name, context, fsConf, config, formatPluginConfig, true, false, true, true,
        formatPluginConfig.getExtensions(), DEFAULT_NAME);
  }

  /**
   * Builds a reader for one unit of file work.
   *
   * <p>The session/system option {@link ExecConstants#ENABLE_NEW_TEXT_READER_KEY} selects between
   * the {@link CompliantTextRecordReader} (configured from the full {@link TextFormatConfig}) and
   * the older {@link DrillTextRecordReader} (which only receives the field delimiter).
   *
   * @param context  fragment context; also the source of the option lookup
   * @param dfs      file system used to qualify the path and handed to the reader
   * @param fileWork path, start offset and length of the data to read
   * @param columns  columns requested by the query
   * @param userName not used by this implementation
   * @return the reader for the given split
   * @throws ExecutionSetupException declared by the overridden method
   */
  @Override
  public RecordReader getRecordReader(FragmentContext context, DrillFileSystem dfs, FileWork fileWork,
      List<SchemaPath> columns, String userName) throws ExecutionSetupException {
    Path path = dfs.makeQualified(new Path(fileWork.getPath()));
    // A single empty host name is passed as the split's location list.
    FileSplit split = new FileSplit(path, fileWork.getStart(), fileWork.getLength(), new String[]{""});
    TextFormatConfig textConfig = (TextFormatConfig) formatConfig;

    if (context.getOptions().getOption(ExecConstants.ENABLE_NEW_TEXT_READER_KEY).bool_val) {
      TextParsingSettings settings = new TextParsingSettings();
      settings.set(textConfig);
      return new CompliantTextRecordReader(split, dfs, context, settings, columns);
    }

    char delim = textConfig.getFieldDelimiter();
    return new DrillTextRecordReader(split, dfs.getConf(), context, delim, columns);
  }

  /**
   * Creates the group scan for a file selection, rooted at the selection's {@code selectionRoot}.
   *
   * @param userName  user on whose behalf the scan runs
   * @param selection files to scan
   * @param columns   columns requested by the query
   * @return a new {@link EasyGroupScan} bound to this plugin
   * @throws IOException if the group scan cannot be created
   */
  @Override
  public AbstractGroupScan getGroupScan(String userName, FileSelection selection, List<SchemaPath> columns)
      throws IOException {
    return new EasyGroupScan(userName, selection, this, columns, selection.selectionRoot);
  }

  /**
   * Estimates scan statistics from file sizes, since text files carry no row count.
   *
   * <p>The row count is the total number of bytes across all file work units divided by the
   * {@link ExecConstants#TEXT_ESTIMATED_ROW_SIZE} option, truncated to a {@code long}. The result
   * is flagged as {@link GroupScanProperty#NO_EXACT_ROW_COUNT}.
   *
   * @param settings planner settings providing the estimated row size option
   * @param scan     group scan whose work units are summed
   * @return the estimated statistics
   */
  @Override
  protected ScanStats getScanStats(final PlannerSettings settings, final EasyGroupScan scan) {
    long data = 0;
    for (final CompleteFileWork work : scan.getWorkIterable()) {
      data += work.getTotalBytes();
    }
    final double estimatedRowSize = settings.getOptions().getOption(ExecConstants.TEXT_ESTIMATED_ROW_SIZE);
    final double estRowCount = data / estimatedRowSize;
    return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, (long) estRowCount, 1, data);
  }

  /**
   * Creates and initializes a text record writer for the given fragment.
   *
   * <p>The writer receives its settings as a string map: output {@code location}, a file
   * {@code prefix} of the form {@code <majorFragmentId>_<minorFragmentId>}, the field
   * {@code separator}, the file system connection, and the file {@code extension}, which is the
   * first extension declared in the format configuration.
   *
   * @param context fragment context providing the fragment handle and allocator
   * @param writer  writer definition providing location, storage config and storage strategy
   * @return the initialized writer
   * @throws IOException if the format configuration declares no extensions, or if the writer
   *                     fails to initialize
   */
  @Override
  public RecordWriter getRecordWriter(final FragmentContext context, final EasyWriter writer) throws IOException {
    final TextFormatConfig textConfig = (TextFormatConfig) getConfig();
    final Map<String, String> options = Maps.newHashMap();

    options.put("location", writer.getLocation());

    FragmentHandle handle = context.getHandle();
    String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());
    options.put("prefix", fragmentId);

    options.put("separator", textConfig.getFieldDelimiterAsString());
    options.put(FileSystem.FS_DEFAULT_NAME_KEY, ((FileSystemConfig) writer.getStorageConfig()).connection);

    // The extension list defaults to empty (and may be null after deserialization), so indexing
    // it blindly would surface as a bare IndexOutOfBoundsException/NullPointerException. Fail
    // with an actionable message instead.
    final List<String> extensions = textConfig.getExtensions();
    if (extensions == null || extensions.isEmpty()) {
      throw new IOException("Unable to create a text record writer: the text format configuration declares no"
          + " file extensions, so the extension of the output files cannot be determined.");
    }
    options.put("extension", extensions.get(0));

    RecordWriter recordWriter = new DrillTextRecordWriter(context.getAllocator(), writer.getStorageStrategy());
    recordWriter.init(options);
    return recordWriter;
  }

  /**
   * JSON-serializable settings of the text format, registered under the type name {@code "text"}.
   *
   * <p>Because of {@code @JsonInclude(Include.NON_DEFAULT)}, the field initializers below also act
   * as the reference values for serialization, so changing a default changes the emitted JSON.
   */
  @JsonTypeName("text")
  @JsonInclude(Include.NON_DEFAULT)
  public static class TextFormatConfig implements FormatPluginConfig {

    /** File extensions handled by this format; empty by default. */
    public List<String> extensions = ImmutableList.of();
    /** Line delimiter; defaults to a newline. */
    public String lineDelimiter = "\n";
    // NOTE(review): the default field delimiter is a newline, i.e. the same as the line
    // delimiter. Presumably configurations are expected to override it - confirm before changing.
    /** Field delimiter; defaults to a newline character. */
    public char fieldDelimiter = '\n';
    /** Quote character; defaults to a double quote. */
    public char quote = '"';
    /** Escape character; defaults to a double quote. */
    public char escape = '"';
    /** Comment character; defaults to {@code #}. */
    public char comment = '#';
    /** Whether the first line is skipped; off by default. */
    public boolean skipFirstLine = false;
    /** Whether a header is extracted; off by default. */
    public boolean extractHeader = false;

    /** @return the configured file extensions */
    public List<String> getExtensions() {
      return extensions;
    }

    /** @return the quote character */
    public char getQuote() {
      return quote;
    }

    /** @return the escape character */
    public char getEscape() {
      return escape;
    }

    /** @return the comment character */
    public char getComment() {
      return comment;
    }

    /** @return the line delimiter */
    public String getLineDelimiter() {
      return lineDelimiter;
    }

    /** @return the field delimiter */
    public char getFieldDelimiter() {
      return fieldDelimiter;
    }

    /** @return the value of {@link #extractHeader}; excluded from JSON as a property */
    @JsonIgnore
    public boolean isHeaderExtractionEnabled() {
      return extractHeader;
    }

    /** @return the field delimiter as a one-character string; excluded from JSON as a property */
    @JsonIgnore
    public String getFieldDelimiterAsString(){
      return new String(new char[]{fieldDelimiter});
    }

    /**
     * Sets the field delimiter from the JSON property {@code "delimiter"}.
     *
     * @param delimiter the new field delimiter
     * @deprecated marked deprecated in the original source; the replacement is not visible here.
     */
    @Deprecated
    @JsonProperty("delimiter")
    public void setFieldDelimiter(char delimiter){
      this.fieldDelimiter = delimiter;
    }

    /** @return whether the first line is skipped */
    public boolean isSkipFirstLine() {
      return skipFirstLine;
    }

    /** Combines all eight settings; {@code null} extensions or line delimiter contribute 0. */
    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + comment;
      result = prime * result + escape;
      result = prime * result + ((extensions == null) ? 0 : extensions.hashCode());
      result = prime * result + fieldDelimiter;
      result = prime * result + ((lineDelimiter == null) ? 0 : lineDelimiter.hashCode());
      result = prime * result + quote;
      result = prime * result + (skipFirstLine ? 1231 : 1237);
      result = prime * result + (extractHeader ? 1231 : 1237);
      return result;
    }

    /**
     * Two configs are equal when they are of exactly the same class and all eight settings match;
     * {@code null} extensions or line delimiter only equal {@code null}.
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (obj == null || getClass() != obj.getClass()) {
        return false;
      }
      final TextFormatConfig other = (TextFormatConfig) obj;
      return comment == other.comment
          && escape == other.escape
          && fieldDelimiter == other.fieldDelimiter
          && quote == other.quote
          && skipFirstLine == other.skipFirstLine
          && extractHeader == other.extractHeader
          && (extensions == null ? other.extensions == null : extensions.equals(other.extensions))
          && (lineDelimiter == null ? other.lineDelimiter == null : lineDelimiter.equals(other.lineDelimiter));
    }
  }

  /** @return the operator type id reported for text scans */
  @Override
  public int getReaderOperatorType() {
    return CoreOperatorType.TEXT_SUB_SCAN_VALUE;
  }

  /** @return the operator type id reported for text writers */
  @Override
  public int getWriterOperatorType() {
    return CoreOperatorType.TEXT_WRITER_VALUE;
  }

  /** @return always {@code true} */
  @Override
  public boolean supportsPushDown() {
    return true;
  }
}