/* * Copyright (c) 2014-2015 Giving.com, trading as JustGiving or its affiliates. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"). * You may not use this file except in compliance with the License. * A copy of the License is located in the "license" file accompanying this file. * * This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for * the specific language governing permissions and limitations under the License. * * @author Richard Freeman * */ package com.justgiving.raven.kissmetrics.utils; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.lang.reflect.Field; import java.net.URLDecoder; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import com.justgiving.raven.kissmetrics.KissmetricsConstants.TRACKING_COUNTER; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.codec.binary.StringUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; public class KissmetricsRowParser { static final Logger logger = Logger.getLogger(KissmetricsRowParser.class); private static final int max_property_value_size = 1500; static DateFormat dateFormatter = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss"); // %Y-%m-%d %H:%M:%S private static JSONParser jsonParser = new JSONParser(); private static String id = ""; private static String emailaddress = ""; private static String tsvRow = ""; private static String propertyValue = ""; private static String timestampValueOutput = ""; private static String mobileTimestampValueOutput = ""; private static String tTimestampValue = ""; private static String serverTimestampValue = ""; private static String p = ""; private static String p2 = ""; private static String s = ""; private static String event = ""; private static String decodedStrRaw = ""; private static String decodedStrParsed = ""; public static String getDefaultCharEncoding() { byte[] bArray = { 'w' }; InputStream is = new ByteArrayInputStream(bArray); InputStreamReader reader = new InputStreamReader(is); String defaultCharacterEncoding = reader.getEncoding(); return defaultCharacterEncoding; } /*** * This method is used to replace any Octal encoded character when the * existing decoding is not working. * Source: http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=oct&unicodeinhtml=dec * * U+0000 to U+00FF - basic latin * * @param input * @return Octal replaced String */ public static String replaceOctalUft8Char(String input) { String output = input.replace("\\302\\241", "�") .replace("\\302\\242", "�").replace("\\302\\243", "�") .replace("\\302\\244", "�").replace("\\302\\245", "�") .replace("\\302\\246", "�").replace("\\302\\247", "�") .replace("\\302\\250", "�").replace("\\302\\251", "�") .replace("\\302\\252", "�").replace("\\302\\253", "�") .replace("\\302\\254", "�").replace("\\302\\255", "�") .replace("\\302\\256", "�").replace("\\302\\257", "�") .replace("\\302\\260", "�").replace("\\302\\261", "�") .replace("\\302\\262", "�").replace("\\302\\263", "�") .replace("\\302\\264", "�").replace("\\302\\265", "�") .replace("\\302\\266", "�").replace("\\302\\267", "�") .replace("\\302\\270", "�").replace("\\302\\271", "�") .replace("\\302\\272", "�").replace("\\302\\273", "�") .replace("\\302\\274", "�").replace("\\302\\275", "�") .replace("\\302\\276", "�").replace("\\302\\277", "�") .replace("\\303\\200", "�").replace("\\303\\201", "�") .replace("\\303\\202", "�").replace("\\303\\203", "�") .replace("\\303\\204", "�").replace("\\303\\205", "�") .replace("\\303\\206", "�").replace("\\303\\207", "�") .replace("\\303\\210", "�").replace("\\303\\211", "�") .replace("\\303\\212", "�").replace("\\303\\213", "�") .replace("\\303\\214", "�").replace("\\303\\215", "�") .replace("\\303\\216", "�").replace("\\303\\217", "�") .replace("\\303\\220", "�").replace("\\303\\221", "�") .replace("\\303\\222", "�").replace("\\303\\223", "�") .replace("\\303\\224", "�").replace("\\303\\225", "�") .replace("\\303\\226", "�").replace("\\303\\227", "�") .replace("\\303\\230", "�").replace("\\303\\231", "�") .replace("\\303\\232", "�").replace("\\303\\233", "�") .replace("\\303\\234", "�").replace("\\303\\235", "�") .replace("\\303\\236", "�").replace("\\303\\237", "�") .replace("\\303\\240", "�").replace("\\303\\241", "�") .replace("\\303\\242", "�").replace("\\303\\243", "�") .replace("\\303\\244", "�").replace("\\303\\245", "�") .replace("\\303\\246", "�").replace("\\303\\247", "�") .replace("\\303\\250", "�").replace("\\303\\251", "�") .replace("\\303\\252", "�").replace("\\303\\253", "�") .replace("\\303\\254", "�").replace("\\303\\255", "�") .replace("\\303\\256", "�").replace("\\303\\257", "�") .replace("\\303\\260", "�").replace("\\303\\261", "�") .replace("\\303\\262", "�").replace("\\303\\263", "�") .replace("\\303\\264", "�").replace("\\303\\265", "�") .replace("\\303\\266", "�").replace("\\303\\267", "�") .replace("\\303\\270", "�").replace("\\303\\271", "�") .replace("\\303\\272", "�").replace("\\303\\273", "�") .replace("\\303\\274", "�").replace("\\303\\275", "�") .replace("\\303\\276", "�").replace("\\303\\277", "�"); return output; } /*** * Used to parse, escape and enrich Kissmetircs Json records * * @param rawJsonRow * @param fileNameInputToMapper * @return */ public static KeyRowWrapper parseJsonRowToValidJson(Text rawJsonRow, String fileNameInputToMapper, String filePath) { String jsonString = ""; boolean wasOctalParsingNeeded = false; try { System.setProperty("file.encoding", "UTF-8"); s = rawJsonRow.toString(); Charset charSet = Charset.forName("UTF-8"); byte[] encoded = s.getBytes(charSet); decodedStrRaw = new String(encoded, charSet); // Test new Apache Lang3 // decodedStr = StringEscapeUtils.unescapeJava(decodedStr); //Replace any remaining Octal encoded Characters decodedStrParsed = replaceOctalUft8Char(decodedStrRaw); if(decodedStrParsed.compareTo(decodedStrRaw) == 0){ wasOctalParsingNeeded = false; }else{ wasOctalParsingNeeded = true; } if (decodedStrParsed != null && decodedStrParsed != "") { JSONObject jsonObject = (JSONObject) jsonParser .parse(decodedStrParsed); // get email and user_id if (jsonObject.get("_p2") != null) { p2 = jsonObject.get("_p2").toString().toLowerCase(); if (p2.contains("@")) { jsonObject.put("user_email", p2); jsonObject.put("user_email_back", p2); } else if (p2 != null && p2.length() > 0) { jsonObject.put("user_km_id", p2); } } // get email and user_id if (jsonObject.get("_p") != null) { p = jsonObject.get("_p").toString().toLowerCase(); if (p.contains("@")) { jsonObject.put("user_email", p); jsonObject.put("user_email_back", p); } else if (p != null && p.length() > 0) { jsonObject.put("user_km_id", p); } } // Add Event if (jsonObject.get("_n") != null) { event = jsonObject.get("_n").toString(); if (event != null) { jsonObject.put("event", event); } } // add unix timestamp and datetime long currentDateTime = System.currentTimeMillis(); Date currentDate = new Date(currentDateTime); if (jsonObject.get("_t") == null) { return (new KeyRowWrapper(jsonString, null, TRACKING_COUNTER.INVALID_JSON_ROW, TRACKING_COUNTER.INVALID_DATE)); } long kmTimeDateMilliSeconds; long kmTimeDateMilliSecondsMobile; try{ tTimestampValue = (String) jsonObject.get("_t").toString(); //See if new record with server timestamp if (jsonObject.get("_server_timestamp") != null) { serverTimestampValue = (String) jsonObject.get("_server_timestamp").toString(); }else{ serverTimestampValue = "0"; } //Deal with mobile timedate cases if (jsonObject.get("_c") != null){ if(serverTimestampValue.equals("0")){ timestampValueOutput =tTimestampValue; kmTimeDateMilliSecondsMobile = 0; }else{ timestampValueOutput = serverTimestampValue; mobileTimestampValueOutput = tTimestampValue; jsonObject.put("km_timestamp_mobile", mobileTimestampValueOutput); kmTimeDateMilliSecondsMobile = Long.parseLong(mobileTimestampValueOutput) * 1000; } }else{//Ignore server time //TODO Need a way to resolve mobile identify events serverTimestampValue = "0"; timestampValueOutput = tTimestampValue; kmTimeDateMilliSecondsMobile = 0; } jsonObject.put("km_timestamp", timestampValueOutput); kmTimeDateMilliSeconds = Long.parseLong(timestampValueOutput) * 1000; }catch (Exception e) { return (new KeyRowWrapper(jsonString, timestampValueOutput, TRACKING_COUNTER.INVALID_JSON_ROW, TRACKING_COUNTER.INVALID_DATE)); } Calendar calendar = Calendar.getInstance(); calendar.setTimeInMillis(kmTimeDateMilliSeconds); String event_timedate = dateFormatter .format(calendar.getTime()); jsonObject.put("event_timedate", event_timedate); if(kmTimeDateMilliSecondsMobile > 0){ calendar.setTimeInMillis(kmTimeDateMilliSecondsMobile); String event_timedate_mobile = dateFormatter .format(calendar.getTime()); jsonObject.put("event_timedate_mobile", event_timedate_mobile); } // add Map Reduce json_filename jsonObject.put("filename", fileNameInputToMapper); jsonString = jsonObject.toString(); //Add bucket path jsonObject.put("bucket", filePath); jsonString = jsonObject.toString(); // TODO add the time the record was processed by Mapper: //jsonObject.put("capturedDate", capturedDate); //jsonString = jsonObject.toString(); return (new KeyRowWrapper(jsonString, timestampValueOutput, TRACKING_COUNTER.VALID_JSON_ROW, wasOctalParsingNeeded ? null : TRACKING_COUNTER.OCTAL_PARSING_NEEDED )); } } catch (Exception e) { // System.err.println(e.getMessage()); // e.printStackTrace(); StringWriter errors = new StringWriter(); e.printStackTrace(new PrintWriter(errors)); logger.error(errors.toString()); logger.error("log - file " + fileNameInputToMapper); System.out.println("file " + fileNameInputToMapper); logger.error("log - row content: " + rawJsonRow.toString().replace("\t", "")); System.err.println("row content: " + rawJsonRow.toString().replace("\t", "")); System.err.println("Error skipping row"); logger.error("Log - Error skipping row"); } return null; } public static String runOnStringJson(Text rawJsonRow, String output_filename) throws FileNotFoundException { String fileNameInputToMapper = "pathtocurrentfile"; //String capturedDate = getCurrentDate(); KeyRowWrapper newValidJson = KissmetricsRowParser .parseJsonRowToValidJson(rawJsonRow, fileNameInputToMapper, output_filename); //logger.info(newValidJson.jsonrow); return newValidJson.jsonrow; } //static DateFormat dateFormatter = new SimpleDateFormat( // "yyyy-MM-dd HH:mm:ss"); // %Y-%m-%d %H:%M:%S public static String getCurrentDate(){ Calendar calendar = Calendar.getInstance(); String event_timedate = dateFormatter.format(calendar.getTime()); return event_timedate; } public static void runonfileValidJson(String input_filename, String output_filename) throws IOException { InputStream fis; BufferedReader bufferdReader; String line; try { File file = new File(output_filename); if (file.createNewFile()) { logger.warn("File has been created"); }//else { // logger.info("File already exists."); //} // if (!file.getParentFile().mkdirs()) // throw new IOException("Unable to create " + // file.getParentFile()); FileWriter fileWriter = new FileWriter(output_filename, false); BufferedWriter bufferedWriter = new BufferedWriter(fileWriter); String parsedLine; fis = new FileInputStream(input_filename); bufferdReader = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8"))); while ((line = bufferdReader.readLine()) != null) { parsedLine = runOnStringJson(new Text(line), output_filename) + "\n"; bufferedWriter.write(parsedLine); } bufferedWriter.close(); bufferdReader.close(); } catch (IOException e) { logger.error("Error writing to file '" + output_filename + "'"); e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } logger.info("Ouput written to " + output_filename); } private static void processFolder(String inputFolder, String outputFolder) throws IOException { File file = new File(outputFolder); if (!file.exists()) { if (file.mkdirs()) logger.info("Directory successfully created"); else logger.error("Failed to create directory"); } File folder = new File(inputFolder); File[] listOfFiles = folder.listFiles(); for( File currentFile : listOfFiles){ if(currentFile.isFile()){ logger.info("File " + currentFile.getName()); runonfileValidJson(Paths.get(inputFolder, currentFile.getName()).toString(), Paths.get(outputFolder, currentFile.getName()).toString()); }else if (currentFile.isDirectory()){ //System.out.println("Directory " + currentFile.getName()); } } } public static void main(String[] args) throws FileNotFoundException, IOException { for (String s : args) { System.out.println(s); } String inputFile ="D:\\datasets\\kissmetrics\\input\\2250.json"; String outputFile ="D:\\datasets\\kissmetrics\\output\\2250.json"; // String inputFile ="D:\\datasets\\kissmetrics\\input\\"; //String inputFile = "D:\\datasets\\kissmetrics\\input5\\"; //String outputFile = "D:\\datasets\\kissmetrics\\output5\\"; if (args.length == 2) { try { inputFile = args[0]; outputFile = args[1]; } catch (Exception e) { System.err.println("Error unable to extract arguments, valid arguments are inputFilePath inputFilePath"); System.exit(1); } } else if (args == null || args.length == 0){ logger.info("using defaul values for inputFile=" + inputFile + " outputFile=" + outputFile); } String logConfigPath = Paths.get(System.getProperty("user.dir"), "log4j.properties").toString(); File f = new File(logConfigPath); if(f.exists() && !f.isDirectory()) { System.out.println("log config file used: " + logConfigPath); PropertyConfigurator.configure(logConfigPath); logger.info("log config file used: " + logConfigPath); }else{ System.out.println("no log file detected, please copy the log4j.properties to the same folder as the JAR"); } if (inputFile.endsWith("\\")) { logger.info("Detected folder"); processFolder(inputFile, outputFile); } else { logger.info("Detected file"); runonfileValidJson(inputFile, outputFile); } } }