package context.core.task.csvparser; import au.com.bytecode.opencsv.CSVReader; import context.core.entity.CorpusData; import context.core.entity.FileData; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; /** * @author julianchin * */ public class CsvParserBody { private int csvSeparatorIndex; private String csvSeparatorCustomText; private char csvSeparatorChar; private char csvQuoteChar; private List<String> csvColumnList; private int groupByColumnIndex; private int textColumnIndex; private int groupThreshold; private CsvParserTaskInstance instance; private CorpusData input; private CorpusData output; private List<String[]> csvLines; private HashMap<String, List<String[]>> fileCSVLines; /** * * @param instance */ public CsvParserBody(CsvParserTaskInstance instance){ super(); this.instance = instance; this.csvSeparatorIndex=instance.getCsvSeparatorIndex(); this.csvSeparatorCustomText=instance.getCsvSeparatorCustomText(); this.csvSeparatorChar=instance.getCsvSeparatorChar(); this.csvQuoteChar = instance.getCsvQuoteChar(); this.csvColumnList=instance.getCsvColumnList(); this.groupByColumnIndex=instance.getGroupByColumnIndex(); this.textColumnIndex=instance.getTextColumnIndex(); this.input=(CorpusData)instance.getInput(); this.output=(CorpusData)instance.getTextOutput(); this.csvLines=new ArrayList<String[]>(); this.fileCSVLines = new HashMap<String, List<String[]>>(); this.groupThreshold=instance.getGroupThreshold(); } /** * Read the CSV files from a given filelist based on the grouping schema. * @author Shubhanshu */ public void readCsvFiles(){ List<FileData> files=input.getFiles(); try{ for(FileData f:files){ File file=f.getFile(); String fileName=file.getPath(); CSVReader reader=new CSVReader(new FileReader(fileName),csvSeparatorChar,csvQuoteChar,1); if(groupByColumnIndex==0){ fileCSVLines.put(FilenameUtils.getBaseName(fileName), reader.readAll()); } else { csvLines.addAll(reader.readAll()); } reader.close(); } }catch(Exception e){ e.printStackTrace(); } } /** * Parse data from each of the CSV files and store them in the required data structure based on the grouping schema. * @author Shubhanshu */ public void parseCsvToTextData(){ if(csvLines!=null){ String groupByName = ""; if(groupByColumnIndex==0){ for(Entry<String, List<String[]>> entry: fileCSVLines.entrySet()){ groupByName = entry.getKey(); for(int i = 0; i < entry.getValue().size(); i++){ saveTextFile(groupByName+"_"+i,entry.getValue().get(i)[textColumnIndex]); } } } else { HashMap<String, ArrayList<String>> groupsMap = getGroupByList(csvLines); List<String> defaultContents=new ArrayList<String>(); for( Entry<String, ArrayList<String>> entry: groupsMap.entrySet()){ if(entry.getValue().size()<groupThreshold){ defaultContents.addAll(entry.getValue()); } else{ groupByName = entry.getKey(); String fileContents = StringUtils.join(entry.getValue(), "\n"); saveTextFile(groupByName,fileContents); } //Save no-group items in the default.txt file if(defaultContents.size()>0){ String defaultFileContents=StringUtils.join(defaultContents,"\n"); saveTextFile("default",defaultFileContents); } } } } } /** * Return the grouped values based on the grouping column. Currently only single level grouping is supported. * @author Shubhanshu * @param csvLines - lines collected from ALL the files. * @return - HashMap of groupName with the corresponding List of text values in that group. */ public HashMap<String, ArrayList<String>> getGroupByList(List<String[]> csvLines){ HashMap<String, ArrayList<String>> groupsMap = new HashMap<String, ArrayList<String>>(); String currentGroup=""; if(groupByColumnIndex==0){ System.out.println("Don't use groupby in this case."); return null; } for(String[] csvNextLine: csvLines){ currentGroup=csvNextLine[groupByColumnIndex-1]; if(!groupsMap.containsKey(currentGroup)){ groupsMap.put(currentGroup, new ArrayList<String>()); } groupsMap.get(currentGroup).add(csvNextLine[textColumnIndex]); } return groupsMap; } /** * Write the data to the given file name. * @author Shubhanshu * @param fileName - Name of the file * @param fileContents - Contents of the file in text format. */ public void saveTextFile(String fileName, String fileContents){ String outputFileName=fileName+".txt"; int index=output.addFile(outputFileName); output.writeFile(index, fileContents); System.err.println("Finished Writing the Modified File: "+fileName); } }