import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.log4j.Logger; public class PlayByPlayMapper extends Mapper<LongWritable, Text, Text, Text> { Logger logger = Logger.getLogger(PlayByPlayMapper.class); private static final char OUTPUT_SEPARATOR = '\t'; /** (14:56) E.Manning pass incomplete deep left to H.Nicks. */ Pattern incompletePass = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*pass.*incomplete.*(to ([A-Za-z]*\\.?\\s?[A-Za-z]*))?"); /** * (11:28) (Shotgun) J.Cutler pass short right intended for M.Forte * INTERCEPTED by J.Freeman at CHI 4. J.Freeman for 4 yards TOUCHDOWN. */ Pattern interception = Pattern .compile("([A-Za-z]*\\.\\s?[A-Za-z]*).*intended for.*INTERCEPTED by ([A-Za-z]*\\.?\\s?[A-Za-z]*)"); /** * (14:49) E.Manning pass short middle to V.Cruz to NYG 21 for 5 yards * (S.Lee) [J.Hatcher]. */ Pattern completePass = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*pass.*to ([A-Z]*\\.\\s?[A-Za-z]*).*\\(?([A-Z]*\\.\\s?[A-Za-z]*)?\\)?\\s?\\[?([A-Z]*\\.\\s?[A-Za-z]*)?\\]?"); /** * (13:58) S.Weatherford punts 56 yards to DAL 23 Center-Z.DeOssie. D.Bryant * to DAL 24 for 1 yard (Z.DeOssie). */ Pattern punt = Pattern .compile("([A-Z]*\\.\\s?[A-Za-z]*)\\s*punts.*to.*\\.\\s?([A-Z]*\\.\\s?[A-Za-z]*)?"); /** (13:44) D.Murray left guard to DAL 27 for 3 yards (C.Blackburn). */ Pattern run = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*.*[to|for].*\\(?([A-Z]*\\.\\s?[A-Za-z]*)?\\)?\\s?\\[?([A-Z]*\\.\\s?[A-Za-z]*)?\\]?"); /** * D.Bailey kicks 69 yards from DAL 35 to NYG -4. D.Wilson to NYG 16 for 20 * yards (A.Holmes). */ Pattern kickoff = Pattern .compile("([A-Z]*\\.\\s?[A-Za-z]*)\\s*kicks.*from.*\\.?\\s?([A-Z]*\\.\\s?[A-Za-z]*)?"); /** (:17) (No Huddle) M.Stafford spiked the ball to stop the clock. */ Pattern spike = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*spiked the ball"); /** * (9:14) L.Tynes 22 yard field goal is GOOD Center-Z.DeOssie * Holder-S.Weatherford. */ Pattern fieldGoal = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*).*field goal"); /** D.Bailey extra point is GOOD Center-L.Ladouceur Holder-C.Jones. */ Pattern extraPoint = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*).*extra point"); /** * (9:36) PENALTY on NYG-V.Cruz False Start 5 yards enforced at DAL 47 - No * Play. */ Pattern penalty = Pattern.compile(".*PENALTY.*"); /** * (12:19) (Shotgun) R.Tannehill FUMBLES (Aborted) at MIA 49 recovered by * MIA-D.Thomas at HST 49. D.Thomas to HST 49 for no gain (B.Cushing). */ Pattern fumble = Pattern.compile(".*FUMBLES.*"); /** (3:42) J.Flacco sacked at BLT 15 for -5 yards (T.Hali). */ Pattern sack = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*.*sacked.*\\(?([A-Z]*\\.\\s?[A-Za-z]*)\\)?\\s?\\[?([A-Z]*\\.\\s?[A-Za-z]*)?\\]?"); /** (1:18) J.Flacco kneels to BLT 40 for -1 yards. */ Pattern kneel = Pattern.compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*kneels"); /** *** play under review *** */ Pattern review = Pattern.compile("play under review"); /** (5:42) Alex Smith scrambles right end to CLV 20 for 3 yards (J.Haden). */ Pattern scramble = Pattern .compile("([A-Za-z]*\\.?\\s?[A-Za-z]*)\\s*scrambles"); /** END QUARTER 3 */ Pattern endQuarter = Pattern.compile("END [QUARTER|GAME]"); /** 20120909_STL@DET */ Pattern gameString = Pattern.compile("(\\d*)_([A-Z]*)@([A-Z]*)"); Pattern[] allPatterns = { incompletePass, interception, completePass, punt, kickoff, spike, fieldGoal, extraPoint, sack, kneel, review, scramble, endQuarter, run }; String idPrefix = null; int id = 0; @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] pieces = line.split(",", -1); if (pieces.length == 0) { // Skip lines that are only commas // ,,,,,,,,,,,, return; } String qb = "", offensivePlayer = "", defensivePlayer1 = "", defensivePlayer2 = ""; boolean hasPenalty = false, hasFumble = false, hasIncomplete = false, isGoalGood = false; String playType = ""; boolean found = false; int piecesIndex = -1; String playDesc = null; // Sometimes the play description is in a different field if (pieces[9].length() > 7) { playDesc = pieces[9]; piecesIndex = 9; } else if (pieces[11].length() > 7) { playDesc = pieces[11]; piecesIndex = 11; } else { logger.warn("Line is null \"" + line + "\""); return; } if (pieces.length < piecesIndex + 2) { logger.warn("Line is not big enough \"" + line + "\""); return; } for (Pattern pattern : allPatterns) { Matcher matcher = pattern.matcher(playDesc); if (matcher.find()) { found = true; if (pattern == incompletePass) { qb = matcher.group(1); offensivePlayer = matcher.group(3); hasIncomplete = true; playType = "PASS"; } else if (pattern == interception) { qb = matcher.group(1); defensivePlayer1 = matcher.group(2); playType = "INTERCEPTION"; } else if (pattern == completePass) { qb = matcher.group(1); offensivePlayer = matcher.group(2); defensivePlayer1 = matcher.group(3); defensivePlayer2 = matcher.group(4); playType = "PASS"; } else if (pattern == punt) { qb = matcher.group(1); defensivePlayer1 = matcher.group(2); playType = "PUNT"; } else if (pattern == kickoff) { offensivePlayer = matcher.group(1); defensivePlayer1 = matcher.group(2); playType = "KICKOFF"; } else if (pattern == spike) { qb = matcher.group(1); playType = "SPIKE"; } else if (pattern == fieldGoal) { qb = matcher.group(1); isGoalGood = playDesc.toLowerCase().indexOf("no good") == -1 && playDesc.toLowerCase().indexOf("missed") == -1; playType = "FIELDGOAL"; } else if (pattern == extraPoint) { qb = matcher.group(1); isGoalGood = playDesc.toLowerCase().indexOf("no good") == -1 && playDesc.toLowerCase().indexOf("missed") == -1; playType = "EXTRAPOINT"; } else if (pattern == sack) { offensivePlayer = matcher.group(1); defensivePlayer1 = matcher.group(2); defensivePlayer2 = matcher.group(3); // Workaround regex bug if (defensivePlayer2 != null && defensivePlayer2.equals(".")) { defensivePlayer2 = ""; } playType = "SACK"; } else if (pattern == kneel) { qb = matcher.group(1); playType = "KNEEL"; } else if (pattern == review) { playType = "REVIEW"; } else if (pattern == scramble) { qb = matcher.group(1); playType = "SCRAMBLE"; } else if (pattern == endQuarter) { playType = "END"; } else if (pattern == run) { offensivePlayer = matcher.group(1); defensivePlayer1 = matcher.group(2); defensivePlayer2 = matcher.group(3); // Workaround regex bug if (defensivePlayer2 != null && defensivePlayer2.equals(".")) { defensivePlayer2 = ""; } playType = "RUN"; } break; } } // Always check for penalties and fumbles Matcher matcher = penalty.matcher(playDesc); if (matcher.find()) { hasPenalty = true; } matcher = fumble.matcher(playDesc); if (matcher.find()) { hasFumble = true; } if (found == false) { context.getCounter("inc", "notfound").increment(1); logger.warn("Did not match \"" + line + "\""); return; } StringBuilder output = new StringBuilder(); // Process the game output Matcher gameMatcher = gameString.matcher(pieces[0]); // Process the game output if (gameMatcher.find()) { // Check that offense and defense is filled in if (pieces[4].trim().length() == 0) { pieces[4] = gameMatcher.group(3).equals(pieces[5]) ? gameMatcher .group(2) : gameMatcher.group(3); logger.warn("Replacing offense to be " + pieces[4] + " Off:" + pieces[5]); } if (pieces[5].trim().length() == 0) { pieces[4] = gameMatcher.group(3).equals(pieces[4]) ? gameMatcher .group(3) : gameMatcher.group(2); logger.warn("Replacing offense to be " + pieces[5] + " Def:" + pieces[4]); } } else { context.getCounter("inc", "gamenotfound").increment(1); logger.warn("Game did not match \"" + line + "\""); return; } // Add all of the pieces for (int i = 0; i < pieces.length; i++) { // Normalize output across all seasons by removing extra info if (piecesIndex == 11) { if (i == 9 || i == 10 || i == 12 || i == 13 || i == 14) { continue; } output.append(pieces[i]).append(OUTPUT_SEPARATOR); } else { output.append(pieces[i]).append(OUTPUT_SEPARATOR); } } // Check that extracted data isn't from missing groups if (qb == null) { qb = ""; } if (offensivePlayer == null) { offensivePlayer = ""; } if (defensivePlayer1 == null) { defensivePlayer1 = ""; } if (defensivePlayer2 == null) { defensivePlayer2 = ""; } // Process the play by play data output.append(qb).append(OUTPUT_SEPARATOR); output.append(offensivePlayer).append(OUTPUT_SEPARATOR); output.append(defensivePlayer1).append(OUTPUT_SEPARATOR); output.append(defensivePlayer2).append(OUTPUT_SEPARATOR); output.append(hasPenalty).append(OUTPUT_SEPARATOR); output.append(hasFumble).append(OUTPUT_SEPARATOR); output.append(hasIncomplete).append(OUTPUT_SEPARATOR); output.append(isGoalGood).append(OUTPUT_SEPARATOR); output.append(playType).append(OUTPUT_SEPARATOR); // Process the game output output.append(gameMatcher.group(3)).append(OUTPUT_SEPARATOR); output.append(gameMatcher.group(2)).append(OUTPUT_SEPARATOR); output.append(gameMatcher.group(1)).append(OUTPUT_SEPARATOR); // Output the unique id of the play output.append(idPrefix).append("_").append(StringUtils.leftPad(String.valueOf(id), 8, "0")); id++; context.write(new Text(pieces[0]), new Text(output.toString())); } @Override public void setup(Context context) { id = 0; FileSplit fileSplit = (FileSplit) context.getInputSplit(); idPrefix = fileSplit.getPath().getName(); // Crop until the first underscore idPrefix = idPrefix.substring(0, idPrefix.indexOf("_")); } }