package com.aliyun.odps.udf.example.text;

import com.aliyun.odps.Column;
import com.aliyun.odps.data.ArrayRecord;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.io.InputStreamSet;
import com.aliyun.odps.udf.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Text extractor that extracts schematized records from formatted plain text (csv, tsv, etc.).
 **/
public class TextExtractor extends Extractor {
  private InputStreamSet inputs;
  private String columnDelimiter;
  private DataAttributes attributes;
  private BufferedReader currentReader;
  private boolean firstRead = true;

  public TextExtractor() {
    // default to ",", this can be overwritten if a specific delimiter is provided (via DataAttributes)
    this.columnDelimiter = ",";
  }

  // no particular usage for the execution context in this example
  @Override
  public void setup(ExecutionContext ctx, InputStreamSet inputs, DataAttributes attributes) {
    this.inputs = inputs;
    this.attributes = attributes;
    // check if a "delimiter" attribute is supplied via the SQL query
    String columnDelimiter = this.attributes.getValueByKey("delimiter");
    if (columnDelimiter != null) {
      this.columnDelimiter = columnDelimiter;
    }
    System.out.println("TextExtractor using delimiter [" + this.columnDelimiter + "].");
    // note: more properties can be initialized from attributes if needed
  }

  @Override
  public Record extract() throws IOException {
    String line = readNextLine();
    if (line == null) {
      return null;
    }
    return textLineToRecord(line);
  }

  @Override
  public void close() {
    // no-op
  }

  private Record textLineToRecord(String line) throws IllegalArgumentException {
    Column[] outputColumns = this.attributes.getRecordColumns();
    ArrayRecord record = new ArrayRecord(outputColumns);
    if (this.attributes.getRecordColumns().length != 0) {
      // string copies are needed, not the most efficient approach, but it suffices as an example here
      String[] parts = line.split(columnDelimiter);
      int[] outputIndexes = this.attributes.getNeededIndexes();
      if (outputIndexes == null) {
        throw new IllegalArgumentException("No outputIndexes supplied.");
      }
      if (outputIndexes.length != outputColumns.length) {
        throw new IllegalArgumentException("Mismatched output schema: expecting "
            + outputColumns.length + " columns but got " + outputIndexes.length);
      }
      int index = 0;
      for (int i = 0; i < parts.length; i++) {
        // only parse data in columns indexed by output indexes
        if (index < outputIndexes.length && i == outputIndexes[index]) {
          switch (outputColumns[index].getType()) {
            case STRING:
              record.setString(index, parts[i]);
              break;
            case BIGINT:
              record.setBigint(index, Long.parseLong(parts[i]));
              break;
            case BOOLEAN:
              record.setBoolean(index, Boolean.parseBoolean(parts[i]));
              break;
            case DOUBLE:
              record.setDouble(index, Double.parseDouble(parts[i]));
              break;
            case DATETIME:
            case DECIMAL:
            case ARRAY:
            case MAP:
            default:
              throw new IllegalArgumentException("Type " + outputColumns[index].getType()
                  + " not supported for now.");
          }
          index++;
        }
      }
    }
    return record;
  }

  /**
   * Read the next line from the underlying input streams.
   * @return The next line as a String object. If all contents of the input
   * streams have been read, return null.
   */
  private String readNextLine() throws IOException {
    if (firstRead) {
      firstRead = false;
      // the first read, initialize things
      currentReader = moveToNextStream();
      if (currentReader == null) {
        // empty input stream set
        return null;
      }
    }
    while (currentReader != null) {
      String line = currentReader.readLine();
      if (line != null) {
        return line;
      }
      currentReader = moveToNextStream();
    }
    return null;
  }

  private BufferedReader moveToNextStream() throws IOException {
    InputStream stream = inputs.next();
    if (stream == null) {
      return null;
    } else {
      return new BufferedReader(new InputStreamReader(stream));
    }
  }
}
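
/*
 * For context: a minimal sketch of how an extractor like the one above is usually
 * wired into the external-table framework, via a storage handler. This assumes the
 * SDK's OdpsStorageHandler base class with getExtractorClass()/getOutputerClass()
 * hooks; the output side is deliberately left unimplemented. In a real project this
 * class would live in its own file as a public class; it is kept package-private
 * here only so the example stays a single compilable unit.
 */
class TextStorageHandler extends OdpsStorageHandler {

  @Override
  public Class<? extends Extractor> getExtractorClass() {
    // point the framework at the extractor defined above
    return TextExtractor.class;
  }

  @Override
  public Class<? extends Outputer> getOutputerClass() {
    // writing records back out is not covered by this example; a real handler
    // would return a companion Outputer implementation here
    throw new UnsupportedOperationException("Output is not supported by this example handler.");
  }
}

// Usage note (assumed DDL shape, to be checked against the MaxCompute docs): the handler is
// referenced from a CREATE EXTERNAL TABLE statement via STORED BY, and attributes such as
// 'delimiter' are passed through SERDEPROPERTIES, which is where
// attributes.getValueByKey("delimiter") in setup() reads them from.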