#! /usr/bin/perl # This script is made freely available for non-commerical use by Mike Fast # August 2007 # http://fastballs.wordpress.com/ # Attribution is appreciated but not required. # # This script uses portions of Joseph Adler's code from hack_28_parser.pl # as published by O'Reilly Media in the book Baseball Hacks, copyright 2006 # ISBN 0-596-00942-9, available at http://www.oreilly.com/catalog/baseballhks/ # used under the terms set forth in the book on Page xvi, as follows: # "In general, you may use the code in this book in your programs and documentation. # You do not need to contact us for permission unless you're reproducing a significant # portion of the code. For example, writing a program that uses several chunks of code # from this book does not require permission." # # Code lines 26-85, 121-122, 207-217, 230-231, and 294-301 are largely by Joseph Adler # and the rest of the code is largely or completely by Mike Fast # MySQL database connection statement use DBI; $dbh = DBI->connect("DBI:mysql:database=pbp;host=localhost", 'username', 'password') or die $DBI::errstr; # Set base directory for XML game data download URL $year = 2007; $basedir = "./games/year_$year"; # Define XML objects use XML::Simple; $boxparser= new XML::Simple(ForceArray => 1, KeepRoot => 1, KeyAttr => 'boxscore'); $inningparser= new XML::Simple(ForceArray => 1, KeepRoot => 1, KeyAttr => 'inning'); $hitsparser= new XML::Simple(ForceArray => 1, KeepRoot => 1, KeyAttr => 'hitchart'); $playerparser= new XML::Simple(ForceArray => 1, KeepRoot => 1, KeyAttr => 'game'); sub extract_date($) { my($in) = @_; my $gmyr = substr($in,0,4); my $gmmn = substr($in,5,2); my $gmdy = substr($in,8,2); my $gamedate = '\'' . $gmyr . '-' . $gmmn . '-' . $gmdy . '\''; return $gamedate; } sub extract_info($) { # This subroutine parses game information from the boxscore.xml file my ($box) = @_; my $home = $box->{boxscore}->[0]->{home_team_code}; my $away = $box->{boxscore}->[0]->{away_team_code}; my $gameid = "'" . $box->{boxscore}->[0]->{game_id} . "'"; my $gamedate = extract_date($box->{boxscore}->[0]->{game_id}); my $gameinfo = "'" . $box->{boxscore}->[0]->{game_info}->[0] . "'"; return ($home, $away, $gameid, $gamedate, $gameinfo); } # Get the list of months from the base year directory opendir MDIR, $basedir; @monthdirs = readdir MDIR; closedir MDIR; foreach $mondir (@monthdirs) { if ($mondir =~ /month/) { opendir DDIR, "$basedir/$mondir"; my @daydirs = readdir DDIR; closedir DDIR; foreach $daydir (@daydirs) { if ($daydir =~ /day/) { opendir GDIR, "$basedir/$mondir/$daydir"; my @gamedirs = readdir GDIR; closedir GDIR; foreach $gamedir (@gamedirs) { if ($gamedir =~ /gid_/ and (-e "$basedir/$mondir/$daydir/$gamedir/inning/inning_hit.xml")) { $fulldir = "$basedir/$mondir/$daydir/$gamedir"; $box = $boxparser->XMLin( "$fulldir/boxscore.xml"); my ($home, $away, $gameid, $gamedate, $gameinfo) = extract_info($box); # Game number = 1, unless the 2nd game of a doubleheader when game number = 2 $game_number = substr($gameid, -2, 1); if ($gameinfo =~ /Weather<\/b>: (\d+) degrees,.*Wind<\/b>: (\d+) mph, ([\w\s]+)./) { $temperature = $1; $wind = $2; $wind_dir = "'" . $3 . "'"; } else { # Domed stadiums may list wind speed as "Indoors" $gameinfo =~ /Weather<\/b>: (\d+) degrees,.*Wind<\/b>: Indoors./; $temperature = $1; $wind = 0; $wind_dir = "'Indoors'"; } $home = $dbh->quote($home); $away = $dbh->quote($away); # Input the game info into the database $no_duplicate_query = 'SELECT game_id FROM games WHERE (date = ' . $gamedate . ' AND home = ' . $home . ' AND away = ' . $away . ' AND game = ' . $game_number . ')'; $sth= $dbh->prepare($no_duplicate_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; $sth->finish(); if ($numRows) { # don't insert duplicate game entry into games table } else { $game_query = 'INSERT INTO games (date, home, away, game, wind, wind_dir, temp) ' . 'VALUES (' . $gamedate . ', '. $home . ', ' . $away . ', ' . $game_number . ', ' . $wind . ', ' . $wind_dir . ', ' . $temperature . ')'; $sth= $dbh->prepare($game_query) or die $DBI::errstr; $sth->execute(); $sth->finish(); } # Check for new players in the players.xml file and input them into the database $players = $playerparser->XMLin( "$fulldir/players.xml"); foreach $team (@{$players->{game}->[0]->{team}}) { foreach $player (@{$team->{player}}) { $id = $player->{id}; $first = $dbh->quote($player->{first}); $last = $dbh->quote($player->{last}); $no_duplicate_query = 'SELECT eliasid FROM players WHERE eliasid = ' . $id; $sth= $dbh->prepare($no_duplicate_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; $sth->finish(); if ($numRows) { # don't insert duplicate player entry into players table } else { $player_query = 'INSERT INTO players (eliasid, first, last) ' . 'VALUES (' . $id . ', '. $first . ', ' . $last . ')'; $sth= $dbh->prepare($player_query) or die $DBI::errstr; $sth->execute(); $sth->finish(); } } } # Check if game info has been input before inputting umpire, at bat, and pitch info $game_id_query = 'SELECT game_id FROM games WHERE (date = ' . $gamedate . ' AND home = ' . $home . ' AND away = ' . $away . ' AND game = ' . $game_number . ')'; $sth= $dbh->prepare($game_id_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; if (1==$numRows) { $select_game_id = $sth->fetchrow_array(); print "\nParsing game number $select_game_id.\n"; } else { die "duplicate game entry $select_game_id in database or game not found.\n"; } $sth->finish(); # Find the home plate umpire and input him into the database foreach $umpire (@{$players->{game}->[0]->{umpires}->[0]->{umpire}}) { $umpire_name = $umpire->{name}; ($umpire_first, $umpire_last) = split(/\s/, $umpire_name); $umpire_first = $dbh->quote($umpire_first); $umpire_last = $dbh->quote($umpire_last); $position = $umpire->{position}; if ('home' eq $position) { $no_duplicate_query = 'SELECT ump_id FROM umpires WHERE first = ' . $umpire_first . ' AND last = ' . $umpire_last; $sth= $dbh->prepare($no_duplicate_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; if ($numRows) { # don't insert duplicate umpire entry into umpires table # get umpire id $select_ump_id = $sth->fetchrow_array(); $sth->finish(); } else { $sth->finish(); $umpire_query = 'INSERT INTO umpires (first, last) ' . 'VALUES (' . $umpire_first . ', ' . $umpire_last . ')'; $sth= $dbh->prepare($umpire_query) or die $DBI::errstr; $sth->execute(); $sth->finish(); # get umpire id $umpire_id_query = 'SELECT ump_id FROM umpires WHERE first = ' . $umpire_first . ' AND last = ' . $umpire_last; $sth= $dbh->prepare($umpire_id_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; if (1==$numRows) { $select_ump_id = $sth->fetchrow_array(); $sth->finish(); } else { die "numrows=$numRows, duplicate umpire entry $umpire_first $umpire_last in database or umpire not found.\n"; } } } else { # ignore base umpires } } # update game record with umpire id $umpire_update_query = 'UPDATE games SET umpire = ' . $select_ump_id. ' WHERE game_id = ' . $select_game_id; $sth= $dbh->prepare($umpire_update_query) or die $DBI::errstr; $sth->execute(); # Parse the at bats and pitches from each inning_?.xml file opendir IDIR, "$fulldir/inning"; my @inningfiles = readdir IDIR; closedir IDIR; my @innings = (); foreach $inningfn (@inningfiles) { if ($inningfn =~ /inning_(\d+)\.xml/) { $inning_num = $1; # Pre-process the inning_?.xml file $inning = $inningparser->XMLin( "$fulldir/inning/$inningfn"); @innings[$inning_num] = $inning; # Parse the at-bat and pitch data for the top and bottom halves of each inning foreach $atbat (@{$inning->{inning}->[0]->{top}->[0]->{atbat}}) { parse_at_bats_and_pitches($atbat, $dbh, $select_game_id, $inning_num); } foreach $atbat (@{$inning->{inning}->[0]->{bottom}->[0]->{atbat}}) { parse_at_bats_and_pitches($atbat, $dbh, $select_game_id, $inning_num); } } } $hits = $hitsparser->XMLin( "$fulldir/inning/inning_hit.xml"); # When a ball in play and an error are recorded on the same play, # the error may be the first play listed in inning_hit.xml or the second play. # Currently the first play is recorded in the database, and # the second play is not recorded in the database but is saved to a text file # for later manual review. Some cases of batting around in one inning may # also be saved to the text file. # This section of code could be improved by automating the manual review process. open (HITRECORD, ">> hit_record.txt") || die "sorry, system can't open hitrecord"; foreach $hip (@{$hits->{hitchart}->[0]->{hip}}) { $hit_des = $hip->{des}; $hit_x = $hip->{x}; $hit_y = $hip->{y}; $hit_type = $dbh->quote($hip->{type}); $hit_batter = $hip->{batter}; $hit_pitcher = $hip->{pitcher}; $hit_inning = $hip->{inning}; # find the at bat that matches the ball in play $find_ab_id_query = 'SELECT ab_id, hit_x, event FROM atbats WHERE (game_id = ' . $select_game_id . ' AND inning = ' . $hit_inning . ' AND batter = ' . $hit_batter . ' AND pitcher = ' . $hit_pitcher . ')'; $sth= $dbh->prepare($find_ab_id_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; if (1==$numRows) { # for one matching at bat, check if hit data already entered in database ($select_ab_id, $select_hit_x, $select_event) = $sth->fetchrow_array(); # update atbats table with hit info for each matching at_bat if (0<$select_hit_x) { # already entered into database print HITRECORD "game $select_game_id:1.1 This hit $hit_batter - $hit_pitcher - $hit_inning already recorded in database.\n"; } else { update_hit_info($hit_x, $hit_y, $hit_type, $select_ab_id); } } elsif (2==$numRows) { # if the batter has batted twice in the inning against the same pitcher ($select_ab_id, $select_hit_x, $select_event) = $sth->fetchrow_array(); # if the first ball in play is already recorded, don't update it if ($hit_x==$select_hit_x && $select_event eq $hit_des) { print HITRECORD "game $select_game_id:2.1 This hit $hit_batter - $hit_pitcher - $hit_inning already recorded in database.\n"; } elsif (0<$select_hit_x) { # select the info for the second ball in play from the database ($select_ab_id, $select_hit_x, $select_event) = $sth->fetchrow_array(); # if the second ball in play is already recorded, don't update it if ($hit_x==$select_hit_x && $select_event eq $hit_des) { print HITRECORD "game $select_game_id:2.2 This hit $hit_batter - $hit_pitcher - $hit_inning already recorded in database.\n"; } else { # if the second ball in play hasn't been recorded, update the db update_hit_info($hit_x, $hit_y, $hit_type, $select_ab_id); } } else { # if the first ball in play hasn't been recorded, update the db update_hit_info($hit_x, $hit_y, $hit_type, $select_ab_id); } } else { die "numrows=$numRows, no matching at bat found for hit $hit_batter - $hit_pitcher - $hit_inning.\n"; } } close HITRECORD; # This is a debug section if you want to look at contents of the XML file # in an easier-to-read format # use Data::Dumper; # open (OUTFILE, "> debug_parser_innings.txt") || die "sorry, system can't open outfile"; # print OUTFILE Dumper($hits); # print OUTFILE Dumper($players); # print OUTFILE Dumper($names); # print OUTFILE Dumper($box); # print OUTFILE Dumper(@innings); # close OUTFILE; } } } } } } sub update_hit_info($hit_x, $hit_y, $hit_type, $select_ab_id) { # update at bat record with hit info $hit_query = 'UPDATE atbats SET hit_x = ' . $hit_x . ', hit_y = ' . $hit_y . ', hit_type = ' . $hit_type . ' WHERE ab_id = ' . $select_ab_id; $sth= $dbh->prepare($hit_query) or die $DBI::errstr; $sth->execute(); $sth->finish(); } sub parse_at_bats_and_pitches($atbat, $dbh, $select_game_id, $inning_num) { $event = $dbh->quote($atbat->{event}); $event_num = $atbat->{num}; $ball = $atbat->{b}; $strike = $atbat->{s}; $out = $atbat->{o}; $pitcher_id = $atbat->{pitcher}; $batter_id = $atbat->{batter}; $stand = $dbh->quote($atbat->{stand}); $des = $dbh->quote($atbat->{des}); $no_duplicate_query = 'SELECT ab_id FROM atbats WHERE (game_id = ' . $select_game_id . ' AND num = ' . $event_num . ')'; $sth= $dbh->prepare($no_duplicate_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; $sth->finish(); if ($numRows) { # don't insert duplicate at bat entry into atbats table print "$select_game_id, $inning_num, $event_num: That's a duplicate at bat to one in the database already.\n"; } else { # insert a new record in the database for this at bat $ab_query = 'INSERT INTO atbats (game_id, inning, num, ball, strike, outs,' . ' batter, pitcher, stand, des, event) ' . 'VALUES (' . $select_game_id . ', ' . $inning_num . ', ' . $event_num . ', ' . $ball . ', ' . $strike . ', ' . $out . ', ' . $batter_id . ', ' . $pitcher_id . ', ' . $stand . ', ' . $des . ', ' . $event . ')'; $sth= $dbh->prepare($ab_query) or die $DBI::errstr; $sth->execute(); $sth->finish(); } # get the at bat id from the database to use when inputting the pitch data $ab_id_query = 'SELECT ab_id FROM atbats WHERE (game_id = ' . $select_game_id . ' AND num = ' . $event_num . ')'; $sth= $dbh->prepare($ab_id_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; if (1==$numRows) { $select_ab_id = $sth->fetchrow_array(); print " ab#$select_ab_id,"; $sth->finish(); } else { die "numrows=$numRows, duplicate at bat entry $select_ab_id in database or game not found.\n"; } foreach $pitch (@{$atbat->{pitch}}) { # these fields are common to pitch-f/x and non-pfx data $pitch_des = $dbh->quote($pitch->{des}); $pitch_id = $pitch->{id}; $pitch_type = $dbh->quote($pitch->{type}); $pitch_x = $pitch->{x}; $pitch_y = $pitch->{y}; $start_speed = $pitch->{start_speed}; # determine if the data for this pitch includes pitch-f/x fields $pitchfx = 0; if (0 < $start_speed) { $pitchfx = 1; $end_speed = $pitch->{end_speed}; $sz_top = $pitch->{sz_top}; $sz_bot = $pitch->{sz_bot}; $pfx_x = $pitch->{pfx_x}; $pfx_z = $pitch->{pfx_z}; $px = $pitch->{px}; $pz = $pitch->{pz}; $x0 = $pitch->{x0}; $y0 = $pitch->{y0}; $z0 = $pitch->{z0}; $vx0 = $pitch->{vx0}; $vy0 = $pitch->{vy0}; $vz0 = $pitch->{vz0}; $ax = $pitch->{ax}; $ay = $pitch->{ay}; $az = $pitch->{az}; $break_y = $pitch->{break_y}; $break_angle = $pitch->{break_angle}; $break_length = $pitch->{break_length}; } $no_duplicate_query = 'SELECT pitch_id FROM pitches WHERE (ab_id = ' . $select_ab_id . ' AND id = ' . $pitch_id . ')'; $sth= $dbh->prepare($no_duplicate_query) or die $DBI::errstr; $sth->execute(); my $numRows = $sth->rows; $sth->finish(); if ($numRows) { # don't insert duplicate pitch entry into pitches table print "$select_ab_id, $pitch_id: That's a duplicate pitch to one in the database already.\n"; } else { # insert a new record in the database for this pitch if ($pitchfx) { $pitch_query = 'INSERT INTO pitches (ab_id, des, type, id, x, y, start_speed,' . ' end_speed, sz_top, sz_bot, pfx_x, pfx_z, px, pz, x0, y0, z0, vx0, vy0,' . ' vz0, ax, ay, az, break_y, break_angle, break_length) VALUES (' . join(', ', ($select_ab_id, $pitch_des, $pitch_type, $pitch_id, $pitch_x, $pitch_y, $start_speed, $end_speed, $sz_top, $sz_bot, $pfx_x, $pfx_z, $px, $pz, $x0, $y0, $z0, $vx0, $vy0, $vz0, $ax, $ay, $az, $break_y, $break_angle, $break_length)) . ')'; } else { $pitch_query = 'INSERT INTO pitches (ab_id, des, type, id, x, y) VALUES (' . join(', ', ($select_ab_id, $pitch_des, $pitch_type, $pitch_id, $pitch_x, $pitch_y)) . ')'; } # print "SQL: $pitch_query\n"; $sth= $dbh->prepare($pitch_query) or die $DBI::errstr; $sth->execute(); } } }