// CSVRowParser.java example
//
// Explorer
// RecordBreaker-master
// - src
/*
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.util.List;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;

import au.com.bytecode.opencsv.CSVParser;

/***********************************************************************
 * <code>CSVRowParser</code> converts a single row of a CSV file into an
 * avro object with a given schema.  If the input line is null, empty
 * (after trimming), or is the header row, the parser returns a null object.
 *
 * @author "Michael Cafarella"
 ***********************************************************************/
public class CSVRowParser {
  CSVParser parser;
  Schema schema;
  List<Schema.Field> curFields;
  String headerHash;

  /**
   * Creates a parser that emits records of the given schema.
   *
   * @param schema the record schema; its fields are matched to CSV columns
   *        by position
   * @param headerHash the decimal string form of the header row's
   *        <code>String.hashCode()</code>; rows whose hash matches are
   *        skipped.  May be null, in which case no row is treated as a header.
   */
  public CSVRowParser(Schema schema, String headerHash) {
    this.parser = new CSVParser();
    this.schema = schema;
    this.curFields = schema.getFields();
    this.headerHash = headerHash;
  }

  /**
   * <code>parseRow</code> returns a GenericData.Record that matches the
   * init'ed Schema and corresponds to the given row of text.
   *
   * Returns null if the row is null or empty, if it is the header row, if it
   * cannot be parsed, or if it has more columns than the schema has fields.
   * A row with fewer columns than the schema yields a record in which only
   * the leading fields have been set.
   *
   * @param row one line of CSV text
   * @return the populated record, or null as described above
   */
  public GenericData.Record parseRow(String row) {
    if (row == null || row.trim().isEmpty()) {
      return null;
    }
    // The header is identified by comparing the row's hash (as a decimal
    // string) with the stored header hash; equals() tolerates a null hash.
    if (String.valueOf(row.hashCode()).equals(headerHash)) {
      return null;
    }
    try {
      String parts[] = parser.parseLine(row);
      if (parts == null || parts.length == 0) {
        return null;
      }
      // More columns than schema fields cannot match the schema; bail out
      // instead of running off the end of the field list.
      if (parts.length > curFields.size()) {
        return null;
      }

      GenericData.Record cur = new GenericData.Record(schema);
      for (int i = 0; i < parts.length; i++) {
        Schema.Field curField = curFields.get(i);
        String rawFieldValue = cleanField(parts[i]);
        cur.put(curField.name(), parseField(rawFieldValue, curField.schema().getType()));
      }
      return cur;
    } catch (IOException iex) {
      iex.printStackTrace();
      return null;
    } catch (NumberFormatException nfe) {
      nfe.printStackTrace();
      return null;
    }
  }

  /**
   * Normalizes one raw column value: drops a single leading comma, trims
   * whitespace, and strips one pair of surrounding double quotes (trimming
   * again afterwards).
   */
  private static String cleanField(String rawFieldValue) {
    String value = rawFieldValue;
    if (value.startsWith(",")) {
      value = value.substring(1);
    }
    value = value.trim();
    // Require at least two characters so that a lone '"' is not treated as
    // both the opening and the closing quote.
    if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
      value = value.substring(1, value.length() - 1).trim();
    }
    return value;
  }

  /**
   * Parse a single CSV-separated field with the given type.
   *
   * INT values that cannot be parsed are reported via a printed stack trace
   * and replaced with 0.  DOUBLE values that cannot be parsed propagate a
   * NumberFormatException.  STRING values are returned unchanged.
   *
   * @param rawFieldValue the cleaned text of the field
   * @param fieldType the avro type the field should be converted to
   * @return the converted value (Integer, Double or String)
   * @throws IOException if fieldType is not INT, DOUBLE or STRING
   * @throws NumberFormatException if a DOUBLE field is not a valid number
   */
  Object parseField(String rawFieldValue, Schema.Type fieldType) throws IOException {
    Object fieldValue = null;
    if (fieldType == Schema.Type.INT) {
      try {
        fieldValue = Integer.parseInt(rawFieldValue);
      } catch (NumberFormatException nfe) {
        // Deliberately lenient: a malformed integer becomes 0.
        nfe.printStackTrace();
        fieldValue = 0;
      }
    } else if (fieldType == Schema.Type.DOUBLE) {
      fieldValue = Double.parseDouble(rawFieldValue);
    } else if (fieldType == Schema.Type.STRING) {
      fieldValue = rawFieldValue;
    } else {
      throw new IOException("Unexpected field-level schema type: " + fieldType);
    }
    return fieldValue;
  }
}