Earlier this spring, I wanted to pull down spring training rosters and found that Wikipedia was the friendliest source. Below is the approach on which I landed. This code assumes you are using the scraped rosters in conjunction with the Lahman database.
The Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_spring_training_rosters(year, team_name_map): | |
""" | |
Finding MLB spring training rosters can be difficult. They can luckily be scraped from Wikipedia. Once the season | |
starts, you can easily get rosters from other sources. Therefore, this is really only useful for getting 40-man | |
rosters during spring training. | |
Scrapes Wikipedia's tables of MLB spring training rosters and returns a pandas dataframe with three columns: | |
yearID, teamID, and player_name. The first two columns are named as such to be consistent with the Lahman database. | |
Likewise, the team name map will ensure the teamID is consistent with the Lahman database. Additional wrangling | |
of the player_name will be needed to make this data consistent with something like the Lahman database. | |
:param year: current year | |
:type year: int | |
:param team_name_map: mapping of Wikipedia's team names to the Lahman database's teamID | |
:type team_name_map: dict | |
:returns: pandas dataframe | |
""" | |
url = "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_team_rosters" | |
res = requests.get(url).text | |
soup = BeautifulSoup(res, 'html.parser') | |
players = [] | |
teams = [] | |
for table in soup.find_all('table'): | |
for tr in table.find_all('tr'): | |
team_players = [] | |
for a in tr.find_all('a', href=True): | |
team_players.append(a.string) | |
players.append(team_players) | |
for th in soup.find_all('th'): | |
teams.append(th.text) | |
teams = [x for x in teams if x not in ['40-man roster\n', 'Non-roster invitees\n', 'Coaches/Other\n', | |
'vteMajor League Baseball team rosters and affiliated Minor League Baseball team rosters', | |
'MLB', 'MiLB', 'Triple-A', 'Double-A', 'High-A', 'Low-A', 'Rookie', | |
'Offseason']] | |
teams = [x.replace(f' {year} spring training rostervte\n', '') for x in teams] | |
teams = [x.rstrip() for x in teams] | |
players = [l for l in players if l != []] | |
players = [l for l in players if l != ['v', 't', 'e']] | |
players = players[:len(players) - 15] | |
teams_df = pd.DataFrame() | |
for index, team in enumerate(teams): | |
if index <= 29: | |
players[index] = [x for x in players[index] if x != None] | |
temp_df = pd.DataFrame({team: players[index]}) | |
teams_df = pd.concat([teams_df, temp_df], axis=1) | |
teams_df = teams_df.head(40) | |
final_df = pd.DataFrame() | |
teams = list(teams_df) | |
for team in teams: | |
temp_df = teams_df[[team]] | |
temp_df.rename(columns={team: 'player_name'}, inplace=True) | |
temp_df['teamID'] = team | |
final_df = final_df.append(temp_df) | |
final_df['yearID'] = year | |
final_df['teamID'] = final_df['teamID'].map(team_name_map) | |
final_df['player_name'] = final_df['player_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').\ | |
str.decode('utf-8') | |
final_df['player_name'] = final_df['player_name'].str.replace(' Jr.', '') | |
final_df['player_name'] = final_df['player_name'].str.replace('A. J.', 'AJ') | |
return final_df | |
Dictionary Mappings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
TEAM_NAME_MAP = { | |
'Baltimore Orioles': 'BAL', | |
'Boston Red Sox': 'BOS', | |
'New York Yankees': 'NYA', | |
'Tampa Bay Rays': 'TBA', | |
'Toronto Blue Jays': 'TOR', | |
'Chicago White Sox': 'CHA', | |
'Cleveland Indians': 'CLE', | |
'Detroit Tigers': 'DET', | |
'Kansas City Royals': 'KCA', | |
'Minnesota Twins': 'MIN', | |
'Houston Astros': 'HOU', | |
'Los Angeles Angels': 'LAA', | |
'Oakland Athletics': 'OAK', | |
'Seattle Mariners': 'SEA', | |
'Texas Rangers': 'TEX', | |
'Atlanta Braves': 'ATL', | |
'Miami Marlins': 'MIA', | |
'New York Mets': 'NYN', | |
'Philadelphia Phillies': 'PHI', | |
'Washington Nationals': 'WAS', | |
'Chicago Cubs': 'CHN', | |
'Cincinnati Reds': 'CIN', | |
'Milwaukee Brewers': 'MIL', | |
'Pittsburgh Pirates': 'PIT', | |
'St. Louis Cardinals': 'SLN', | |
'Arizona Diamondbacks': 'ARI', | |
'Colorado Rockies': 'COL', | |
'Los Angeles Dodgers': 'LAN', | |
'San Diego Padres': 'SDN', | |
'San Francisco Giants': 'SFN', | |
} | |
NAME_ID_MAPPING = { | |
'Adam Eaton': 'eatonad02', | |
'Austin Adams': 'adamsau02', | |
'Brian Anderson': 'anderbr06', | |
'Carlos Hernandez': 'hernaca04', | |
'Carlos Martinez': 'martica04', | |
'Edwin Diaz': 'diazed04', | |
'Fernando Tatis': 'tatisfe02', | |
'Javy Guerra': 'guerrja02', | |
'Jose Garcia': 'garcijo02', | |
'Jose Ramirez': 'ramirjo01', | |
'Josh Bell': 'belljo02', | |
'Luis Castillo': 'castilu02', | |
'Luis Gonzalez': 'gonzalu03', | |
'Vladimir Guerrero': 'guerrvl02', | |
} | |
NAME_ID_TEAM_MAPPING = { | |
'Luis Garcia': { | |
'HOU': 'garcilu05', | |
'WAS': 'garcilu04' | |
}, | |
'Will Smith': { | |
'LAN': 'smithwi05', | |
'ATL': 'smithwi04' | |
} | |
} |
Helper Functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def map_id_to_player_name(player_df, roster_df, name_id_mapping, name_id_team_mapping): | |
""" | |
The Wikipedia roster scraper only returns a player's name, not a playerID we can use with the Lahman database. This | |
function maps those names to the correct player ID. | |
:param player_df: Lahman database people data | |
(https://github.com/chadwickbureau/baseballdatabank/blob/master/core/People.csv?raw=true) | |
:type player_df: pandas dataframe | |
:param roster_df: spring training rosters scraped from Wikipedia | |
:type roster_df: pandas dataframe | |
:param name_id_mapping: for a player_name that appears multiples times, a mapping of the correct name to ID | |
:type name_id_mapping: dict | |
:param name_id_team_mapping: for player_name that appears on multiple teams, a mapping of the correct name to ID | |
:type name_id_team_mapping: dict | |
""" | |
player_df = player_df.loc[player_df['birthYear'] >= 1975] | |
player_df['player_name'] = player_df['nameFirst'] + ' ' + player_df['nameLast'] | |
player_df = player_df[['player_name', 'playerID']] | |
roster_df = pd.merge(roster_df, player_df, how='left', on='player_name') | |
for player_name, player_id in name_id_mapping.items(): | |
roster_df.loc[roster_df['player_name'] == player_name, 'playerID'] = player_id | |
for player_name, player_mapping in name_id_team_mapping.items(): | |
for team, player_id in player_mapping.items(): | |
roster_df.loc[(roster_df['player_name'] == player_name) & (roster_df['teamID'] == team), | |
'playerID'] = player_id | |
roster_df.drop_duplicates(inplace=True) | |
roster_df.drop('player_name', 1, inplace=True) | |
return roster_df | |