package com.klarna.hiverunner.data; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.lang3.ObjectUtils; import org.apache.hive.hcatalog.data.schema.HCatSchema; import com.google.common.base.Splitter; /** * A {@link FileParser} for parsing data out of a TSV file. */ public class TsvFileParser implements FileParser { private static final String DEFAULT_DELIMITER = "\t"; private static final String DEFAULT_NULL_VALUE = ""; private Splitter splitter; private Object nullValue; private Charset charset; private boolean hasHeader; public TsvFileParser() { withDelimiter(DEFAULT_DELIMITER); withNullValue(DEFAULT_NULL_VALUE); withCharset(StandardCharsets.UTF_8); withoutHeader(); } /** * Use the provided delimiter. The default is a tab. */ public TsvFileParser withDelimiter(String delimiter) { splitter = Splitter.on(delimiter); return this; } /** * Use the provided null value. When a column's value equals the null value it will be replaced with null. The default * is an empty string. */ public TsvFileParser withNullValue(Object nullValue) { this.nullValue = nullValue; return this; } /** * Use the provided {@link Charset}. The default is UTF-8. */ public TsvFileParser withCharset(Charset charset) { this.charset = charset; return this; } /** * Enable if TSV file has header row. Default is false. */ public TsvFileParser withHeader() { this.hasHeader = true; return this; } /** * Enable if TSV file has header row. Default is false. */ public TsvFileParser withoutHeader() { this.hasHeader = false; return this; } @Override public List<Object[]> parse(File file, HCatSchema schema, List<String> names) { try { List<String> lines = Files.readAllLines(file.toPath(), charset); if (this.hasHeader) { lines = lines.subList(1, lines.size()); } List<Object[]> records = new ArrayList<>(lines.size()); for (String line : lines) { records.add(parseRow(line, names.size())); } return records; } catch (IOException e) { throw new RuntimeException("Error while reading file", e); } } @Override public boolean hasColumnNames() { return this.hasHeader; } @Override public List<String> getColumnNames(File file) { try { String firstLine = Files.newBufferedReader(file.toPath(), charset).readLine(); List<String> columns = new ArrayList<>(); Iterator<String> iterator = splitter.split(firstLine).iterator(); while (iterator.hasNext()) { String column = iterator.next(); columns.add(column); } return columns; } catch(IOException e) { throw new RuntimeException("Error while reading file", e); } } private Object[] parseRow(String line, int size) { List<Object> row = new ArrayList<>(size); Iterator<String> iterator = splitter.split(line).iterator(); for (int i = 0; i < size; i++) { if (iterator.hasNext()) { String column = iterator.next(); if (ObjectUtils.equals(nullValue, column)) { row.add(null); } else { row.add(column); } } else { throw new IllegalStateException("Not enough columns. Require " + size + " columns, got " + i); } } return row.toArray(new Object[size]); } }