package storm.applications.spout.parser;

import com.google.common.collect.ImmutableList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.applications.util.data.DateUtils;
import storm.applications.util.stream.StreamValues;

/**
 * Parses web-server access log lines of the form
 * {@code host ident user [dd/MMM/yyyy:HH:mm:ss Z] "request" status bytes ...}
 * into {@link StreamValues} tuples.
 *
 * @author Maycon Viana Bordin <mayconbordin@gmail.com>
 */
public class CommonLogParser extends Parser {
    private static final Logger LOG = LoggerFactory.getLogger(CommonLogParser.class);

    // The locale is pinned to English so that the month abbreviation ("MMM", e.g. "Oct")
    // parses the same way regardless of the JVM's default locale.
    private static final DateTimeFormatter dtFormatter =
            DateTimeFormat.forPattern("dd/MMM/yyyy:HH:mm:ss Z").withLocale(Locale.ENGLISH);

    public static final String IP = "ip";
    public static final String TIMESTAMP = "timestamp";
    public static final String REQUEST = "request";
    public static final String RESPONSE = "response";
    public static final String BYTE_SIZE = "byte_size";

    /** Number of capturing groups expected in {@link #LOG_ENTRY_PATTERN}. */
    private static final int NUM_FIELDS = 8;

    // Compiled once; Pattern instances are immutable and safe to share.
    // Groups: 1=host, 2 and 3=unused fields, 4=timestamp, 5=request, 6=status, 7=bytes, 8=remainder.
    private static final Pattern LOG_ENTRY_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)(.*?)");

    /**
     * Parses one log line into a single-element list of stream values holding, in order:
     * ip, timestamp, minute (as computed by {@code DateUtils.getMinuteForTime}), request,
     * response code and byte size. The message id is the hash code of {@code "ip:timestamp"}.
     *
     * @param str the raw log line
     * @return a one-element immutable list, or {@code null} if the line cannot be parsed
     */
    @Override
    public List<StreamValues> parse(String str) {
        Map<String, Object> entry = parseLine(str);

        if (entry == null) {
            LOG.warn("Unable to parse log: {}", str);
            return null;
        }

        long minute = DateUtils.getMinuteForTime((Date) entry.get(TIMESTAMP));
        int msgId = String.format("%s:%s", entry.get(IP), entry.get(TIMESTAMP)).hashCode();

        StreamValues values = new StreamValues(entry.get(IP), entry.get(TIMESTAMP),
                minute, entry.get(REQUEST), entry.get(RESPONSE), entry.get(BYTE_SIZE));
        values.setMessageId(msgId);

        return ImmutableList.of(values);
    }

    /**
     * Splits a log line into its fields.
     *
     * <p>The returned map contains {@link #IP} (String), {@link #TIMESTAMP}
     * ({@link Date}), {@link #REQUEST} (String), {@link #RESPONSE} (Integer) and
     * {@link #BYTE_SIZE} (Integer; a value of {@code "-"} is stored as 0).
     *
     * @param logLine the raw log line; may be {@code null}
     * @return the parsed fields, or {@code null} if the line is {@code null}, does not
     *         match the expected layout, or has a timestamp or byte size that cannot be
     *         converted (including byte sizes that do not fit in an {@code int})
     */
    public static Map<String, Object> parseLine(String logLine) {
        if (logLine == null) {
            return null;
        }

        Matcher matcher = LOG_ENTRY_PATTERN.matcher(logLine);
        if (!matcher.matches() || NUM_FIELDS != matcher.groupCount()) {
            return null;
        }

        try {
            Map<String, Object> entry = new HashMap<>();
            entry.put(IP, matcher.group(1));
            entry.put(TIMESTAMP, dtFormatter.parseDateTime(matcher.group(4)).toDate());
            entry.put(REQUEST, matcher.group(5));
            entry.put(RESPONSE, Integer.parseInt(matcher.group(6)));

            String byteSize = matcher.group(7);
            if (byteSize.equals("-")) {
                entry.put(BYTE_SIZE, 0);
            } else {
                entry.put(BYTE_SIZE, Integer.parseInt(byteSize));
            }

            return entry;
        } catch (IllegalArgumentException e) {
            // Covers an invalid timestamp as well as NumberFormatException (a subclass)
            // from a non-numeric or overflowing byte size: treat the line as unparsable
            // instead of letting the exception escape to the caller.
            LOG.debug("Malformed field in log line: {}", logLine, e);
            return null;
        }
    }
}