package edu.ucsb.stko; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.util.Hashtable; import java.util.Iterator; import java.util.Vector; import javafx.geometry.Point2D; import org.json.JSONObject; import au.com.bytecode.opencsv.CSVReader; class DataPreProcessor { public JSONObject preprocessData(JSONObject parameterObject) { System.out.println("Preprocessing the input data..."); try { double duplicateDistance = parameterObject.getDouble("eps"); String inputFilePath = parameterObject.getString("dataPath"); File inputFile = new File(inputFilePath); FileReader inputFileReader = new FileReader(inputFile); CSVReader inputCSVReader = new CSVReader(inputFileReader); String newLineSymbol = System.getProperty("line.separator"); String tempFileName = parameterObject.getString("tempPath")+"/"+inputFile.getName().replace(".csv", "")+"_processed.csv"; File outputFile = new File(tempFileName); if(outputFile.exists()) { outputFile.delete(); outputFile.createNewFile(); } FileWriter outputFileWriter = new FileWriter(outputFile,true); // get the indexes int recordIDIndex = parameterObject.getInt("recordIDIndex"); int userIDIndex = parameterObject.getInt("userIDIndex"); int lngIndex = parameterObject.getInt("lngIndex"); int latIndex = parameterObject.getInt("latIndex"); /*boolean removeDuplicates = parameterObject.getBoolean("removeDuplicates"); if(removeDuplicates && (userIDIndex == -1)) { System.out.println("You have indicated to remove duplicated records, but didn't provide the column number of user id; please check the configuration file."); inputCSVReader.close(); outputFileWriter.close(); return null; }*/ Hashtable<String, Vector<Point2D>> existingDataHashtable = new Hashtable<>(1000); long totalProcessedDataRecord = 0; String[] thisInputLine = inputCSVReader.readNext(); while((thisInputLine = inputCSVReader.readNext()) != null) { String recordId = null; String ownerString = null; double latString = 0; double lngString = 0; try { recordId = thisInputLine[recordIDIndex]; if(userIDIndex != -1) ownerString = thisInputLine[userIDIndex]; latString = Double.parseDouble(thisInputLine[latIndex]); lngString = Double.parseDouble(thisInputLine[lngIndex]); } catch (Exception e) { continue; } if(userIDIndex != -1) { boolean isDuplicated = false; if(existingDataHashtable.containsKey(ownerString)) { Vector<Point2D> peopleCoordsVector = existingDataHashtable.get(ownerString); Iterator<Point2D> coordIterator = peopleCoordsVector.iterator(); while(coordIterator.hasNext()) { Point2D coordPoint = coordIterator.next(); double thisDistance = Math.sqrt((coordPoint.getX() - lngString)*(coordPoint.getX() - lngString) + (coordPoint.getY() - latString) * (coordPoint.getY() - latString)); if(thisDistance<= duplicateDistance) { isDuplicated = true; break; } } } else { Vector<Point2D> peopleCoordsVector = new Vector<>(10); existingDataHashtable.put(ownerString, peopleCoordsVector); } if(!isDuplicated) { // add this point to existing table Vector<Point2D> peopleCoordsVector = existingDataHashtable.get(ownerString); peopleCoordsVector.add(new Point2D(lngString, latString)); existingDataHashtable.put(ownerString, peopleCoordsVector); outputFileWriter.append(recordId+","+ownerString+","+latString+","+lngString+newLineSymbol); totalProcessedDataRecord++; } } else { /*if(userIDIndex != -1) { existingDataHashtable.put(ownerString, new Vector<Point2D>(1)); outputFileWriter.append(recordId+","+ownerString+","+latString+","+lngString+newLineSymbol); } else {*/ outputFileWriter.append(recordId+","+latString+","+lngString+newLineSymbol); //} totalProcessedDataRecord++; } } inputCSVReader.close(); outputFileWriter.close(); JSONObject resultObject = new JSONObject(); resultObject.put("file", tempFileName); if(userIDIndex != -1) resultObject.put("userCount", existingDataHashtable.size()); resultObject.put("recordCount", totalProcessedDataRecord); if(userIDIndex != -1) System.out.println("After pre-processing, there are "+ totalProcessedDataRecord+" records and "+ existingDataHashtable.size()+" users in the data."); else System.out.println("After pre-processing, there are "+ totalProcessedDataRecord+" records in the data."); System.out.println("---------------------------------------------------"); return resultObject; } catch (Exception e) { System.out.println("An error happened in the data preprocessing; The program has been canceled"); return null; } } }