/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California. For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.imp.csv;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Scanner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import au.com.bytecode.opencsv.CSVReader;
import edu.isi.karma.rep.HNode;
import edu.isi.karma.rep.HTable;
import edu.isi.karma.rep.RepFactory;
import edu.isi.karma.rep.Row;
import edu.isi.karma.rep.Table;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.rep.Workspace;
import edu.isi.karma.webserver.KarmaException;
/**
 * Imports a delimited text file into a {@link Worksheet}.
 *
 * <p>The worksheet is created in the constructor (named after the file) and
 * populated by {@link #generateWorksheet()}. Row indexes are 1-based: a
 * {@code headerRowIndex} of 0 means the file has no header row, in which case
 * columns are named {@code Column_1}, {@code Column_2}, ... based on the
 * number of values found in the first data row.
 *
 * <p>Each physical line of the file is parsed on its own, so quoted values
 * that span several lines are not supported by this importer.
 */
public class CSVFileImport {
    /** 1-based index of the header row; 0 when the file has no header row. */
    private final int headerRowIndex;
    /** 1-based index of the first row that contains data. */
    private final int dataStartRowIndex;
    private final char delimiter;
    private final char quoteCharacter;
    private final char escapeCharacter = '\\';
    private final File csvFile;
    private final RepFactory factory;
    private final Worksheet worksheet;
    private static final Logger logger = LoggerFactory.getLogger(CSVFileImport.class);

    /**
     * Creates the importer and an empty worksheet named after {@code csvFile}.
     *
     * @param headerRowIndex    1-based index of the header row, or 0 if none
     * @param dataStartRowIndex 1-based index of the first data row
     * @param delimiter         character separating values on a line
     * @param quoteCharacter    character used to quote values
     * @param csvFile           file to import
     * @param factory           factory used to create the worksheet, nodes and rows
     * @param workspace         workspace the new worksheet is created in
     */
    public CSVFileImport(int headerRowIndex, int dataStartRowIndex,
            char delimiter, char quoteCharacter, File csvFile,
            RepFactory factory, Workspace workspace) {
        super();
        this.headerRowIndex = headerRowIndex;
        this.dataStartRowIndex = dataStartRowIndex;
        this.delimiter = delimiter;
        this.quoteCharacter = quoteCharacter;
        this.csvFile = csvFile;
        this.factory = factory;
        this.worksheet = factory.createWorksheet(csvFile.getName(), workspace);
    }

    /**
     * Reads the file line by line and fills the worksheet with headers and
     * data rows.
     *
     * @return the populated worksheet
     * @throws IOException    if the file cannot be opened or read
     * @throws KarmaException if the file has no header row and no columns
     *                        could be derived from the first data row
     */
    public Worksheet generateWorksheet() throws IOException, KarmaException {
        Table dataTable = worksheet.getDataTable();

        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm this is intended for the files being imported.
        BufferedReader br = new BufferedReader(new FileReader(csvFile));
        try {
            // Zero-based count of the lines consumed so far
            int rowCount = 0;
            ArrayList<String> hNodeIdList = new ArrayList<String>();

            // No header row in the file: synthesize column names up front
            if (headerRowIndex == 0) {
                hNodeIdList = addEmptyHeaders(worksheet, factory);
                if (hNodeIdList == null || hNodeIdList.size() == 0) {
                    throw new KarmaException("Error occured while counting header " +
                            "nodes for the worksheet!");
                }
            }

            String line = null;
            while ((line = br.readLine()) != null) {
                if (rowCount + 1 == headerRowIndex) {
                    // The header row takes precedence over the data check
                    hNodeIdList = addHeaders(worksheet, factory, line);
                } else if (rowCount + 1 >= dataStartRowIndex) {
                    addRow(worksheet, factory, line, hNodeIdList, dataTable);
                }
                // Any other line (before the data start) is skipped
                rowCount++;
            }
            return worksheet;
        } finally {
            // Release the file handle on every path, including exceptions
            br.close();
        }
    }

    /**
     * Parses the header line and adds one HNode per value to the worksheet
     * headers. Falls back to {@link #addEmptyHeaders} when the line yields no
     * values.
     *
     * <p>Only called from {@link #generateWorksheet()} for the line whose
     * 1-based index equals {@code headerRowIndex}, so a header row is known
     * to exist here.
     *
     * @return ids of the created HNodes, in column order
     * @throws IOException if the line cannot be parsed
     */
    private ArrayList<String> addHeaders(Worksheet worksheet, RepFactory fac,
            String line) throws IOException {
        HTable headers = worksheet.getHeaders();
        ArrayList<String> headersList = new ArrayList<String>();

        CSVReader reader = new CSVReader(new StringReader(line), delimiter,
                quoteCharacter, escapeCharacter);
        try {
            String[] rowValues = reader.readNext();
            if (rowValues == null || rowValues.length == 0) {
                return addEmptyHeaders(worksheet, fac);
            }
            for (int i = 0; i < rowValues.length; i++) {
                HNode hNode = headers.addHNode(rowValues[i], worksheet, fac);
                headersList.add(hNode.getId());
            }
            return headersList;
        } finally {
            reader.close();
        }
    }

    /**
     * Parses one data line and appends it to {@code dataTable} as a new row.
     * Lines that yield no values are ignored. Values beyond the number of
     * known headers are dropped and reported through the logger.
     *
     * @param hNodeIdList ids of the header nodes, in column order
     * @throws IOException if the line cannot be parsed
     */
    private void addRow(Worksheet worksheet, RepFactory fac, String line,
            ArrayList<String> hNodeIdList, Table dataTable) throws IOException {
        CSVReader reader = new CSVReader(new StringReader(line), delimiter,
                quoteCharacter, escapeCharacter);
        try {
            String[] rowValues = reader.readNext();
            if (rowValues == null || rowValues.length == 0) {
                return;
            }

            Row row = dataTable.addRow(fac);
            for (int i = 0; i < rowValues.length; i++) {
                if (i < hNodeIdList.size()) {
                    row.setValue(hNodeIdList.get(i), rowValues[i], fac);
                } else {
                    // TODO Our model does not allow a value to be added to a row
                    // without its associated HNode. In CSVs, there could be case
                    // where values in rows are greater than number of column names.
                    logger.error("More data elements detected in the row than number of headers!");
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Creates generic headers ({@code Column_1}, {@code Column_2}, ...) by
     * counting the values on the first data row of the file.
     *
     * @return ids of the created HNodes; empty if the file has no line at
     *         {@code dataStartRowIndex} or that line could not be parsed
     * @throws IOException if the file cannot be opened
     */
    private ArrayList<String> addEmptyHeaders(Worksheet worksheet,
            RepFactory fac) throws IOException {
        HTable headers = worksheet.getHeaders();
        ArrayList<String> headersList = new ArrayList<String>();

        Scanner scanner = new Scanner(csvFile);
        try {
            // Count physical lines so the indexing agrees with generateWorksheet()
            int rowCount = 0;
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                if (rowCount + 1 != dataStartRowIndex) {
                    rowCount++;
                    continue;
                }

                // Use the first data row to count the number of columns to add
                CSVReader reader = new CSVReader(new StringReader(line),
                        delimiter, quoteCharacter, escapeCharacter);
                try {
                    String[] rowValues = null;
                    try {
                        rowValues = reader.readNext();
                    } catch (IOException e) {
                        logger.error("Error reading Line:" + line, e);
                    }
                    // rowValues stays null when parsing failed or the line
                    // produced nothing; leave the header list empty and let
                    // the caller decide how to react.
                    if (rowValues != null) {
                        for (int i = 0; i < rowValues.length; i++) {
                            HNode hNode = headers.addHNode("Column_" + (i + 1),
                                    worksheet, fac);
                            headersList.add(hNode.getId());
                        }
                    }
                } finally {
                    reader.close();
                }
                break;
            }
        } finally {
            // Scanner owns the underlying file stream; close it to free the handle
            scanner.close();
        }
        return headersList;
    }
}