import os
import urllib
import webbrowser
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup


url = 'https://www.basketball-reference.com/awards/mvp.html'
webbrowser.open_new_tab(url)

True


# get the html
html = urllib.request.urlopen(url)

# create the BeautifulSoup object
soup = BeautifulSoup(html, "lxml")


# Extract the necessary values for the column headers from the table
# and store them as a list
column_headers = [th.getText() for th in soup.findAll('th', limit=30)]
column_headers = [s for s in column_headers if len(s) != 0]
column_headers = column_headers[1:]
print(column_headers)
column_headers = [e for e in column_headers if e not in ('Shooting', 'Advanced')][:-7]
print(column_headers)

['Shooting', 'Advanced', 'Season', 'Lg', 'Player', 'Voting', 'Age', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12']
['Season', 'Lg', 'Player', 'Voting', 'Age', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']


len(column_headers)

18


# The data is found within the table rows
# We want the elements from the 3rd row and on
table_rows = soup.find_all("tr")[2:]
print(type(table_rows))
table_rows[0] # take a look at the first row

<class 'list'>

<tr><th class="left " data-stat="season" scope="row"><a href="/leagues/NBA_2018.html">2017-18</a></th><td class="left " data-stat="lg_id"><a href="/leagues/NBA_2018.html">NBA</a></td><td class="left " csk="Harden,James" data-append-csv="hardeja01" data-stat="player"><a href="/players/h/hardeja01.html">James Harden</a></td><td class="center " data-stat="voting"> (<a href="/awards/awards_2018.html#mvp">V</a>)</td><td class="right " data-stat="age">28</td><td class="left " data-stat="team_id"><a href="/teams/HOU/2018.html">HOU</a></td><td class="right " data-stat="g">72</td><td class="right " data-stat="mp_per_g">35.4</td><td class="right " data-stat="pts_per_g">30.4</td><td class="right " data-stat="trb_per_g">5.4</td><td class="right " data-stat="ast_per_g">8.8</td><td class="right " data-stat="stl_per_g">1.8</td><td class="right " data-stat="blk_per_g">0.7</td><td class="right " data-stat="fg_pct">.449</td><td class="right " data-stat="fg3_pct">.367</td><td class="right " data-stat="ft_pct">.858</td><td class="right " data-stat="ws">15.4</td><td class="right " data-stat="ws_per_48">.289</td></tr>


def extract_mvp_data(table_rows):
    """
    Extract and return the the desired information from the td elements within the table rows.
    :param: table_rows:  list of soup `tr` elements
    :return: list of player-year MVP observation
    """
    # create the empty list to store the player data
    player_data = []
    
    for row in table_rows:  # for each row do the following

        # Get the text for each table data (td) element in the row
        player_year = [td.get_text() for td in row.find_all("th")]
        player_list = [td.get_text() for td in row.find_all("td")]
              
        # there are some empty table rows, which are the repeated 
        # column headers in the table
        # we skip over those rows and and continue the for loop
        if not player_list:
            continue
            
        # Now append the data to list of data
        
        player_info = player_year+player_list
        player_data.append(player_info)
        
    return player_data


# extract the data we want
mvp_data = extract_mvp_data(table_rows)
# and then store it in a DataFrame
mvp_data_df  = pd.DataFrame(mvp_data)


mvp_data_df[0:1]


mvp_data_df.columns = column_headers
print("the MVP dataframe has {} rows (player-year observations) and  {} columns".format(mvp_data_df.shape[0],
                                                                                        mvp_data_df.shape[1]))
mvp_data_df.head()

the MVP dataframe has 113 rows (player-year observations) and  18 columns


url_2018_mvp_finalist = 'https://www.basketball-reference.com/awards/awards_2018.html#mvp'


# get the html
# examine webpage
html_finalist = urllib.request.urlopen(url_2018_mvp_finalist)
webbrowser.open_new_tab(url)

# create the BeautifulSoup object
soup_finalist = BeautifulSoup(html_finalist, "lxml") 

# Extract the necessary values for the column headers from the table and store them as a list
column_headers_finalist = [th.getText() for th in soup_finalist.findAll('th', limit=30)]
column_headers_finalist = [s for s in column_headers_finalist if len(s) != 0]
column_headers_finalist = column_headers_finalist[1:]

print("raw column names in finalist table: {}".format(column_headers_finalist))
column_headers_finalist = [e for e in column_headers_finalist if e not in ('Shooting', 'Advanced', 'Per Game')][:-4]
print("formatted column names in finalist table: {}".format(column_headers_finalist))
print("{} columns in finalist table".format(len(column_headers_finalist)))

# The data is found within the `tr` elements of the first `tbody` element
# We want the elements from the 3rd row and on
table_rows_finalist = soup_finalist.find("tbody").find_all("tr")
print(type(table_rows_finalist))
table_rows_finalist[-1] # take a look at the last row

raw column names in finalist table: ['Per Game', 'Shooting', 'Advanced', 'Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', '1', '2', '3', '4']
formatted column names in finalist table: ['Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']
20 columns in finalist table
<class 'bs4.element.ResultSet'>

<tr><th class="right " data-stat="rank" scope="row">13</th><td class="left " csk="Oladipo,Victor" data-append-csv="oladivi01" data-stat="player"><a href="/players/o/oladivi01.html">Victor Oladipo</a></td><td class="right " data-stat="age">25</td><td class="left " data-stat="team_id"><a href="/teams/IND/2018.html">IND</a></td><td class="right " data-stat="votes_first">0.0</td><td class="right " data-stat="points_won">2.0</td><td class="right " data-stat="points_max">1010</td><td class="right " data-stat="award_share">0.002</td><td class="right " data-stat="g">75</td><td class="right " data-stat="mp_per_g">34.0</td><td class="right " data-stat="pts_per_g">23.1</td><td class="right " data-stat="trb_per_g">5.2</td><td class="right " data-stat="ast_per_g">4.3</td><td class="right " data-stat="stl_per_g">2.4</td><td class="right " data-stat="blk_per_g">0.8</td><td class="right " data-stat="fg_pct">.477</td><td class="right " data-stat="fg3_pct">.371</td><td class="right " data-stat="ft_pct">.799</td><td class="right " data-stat="ws">8.2</td><td class="right " data-stat="ws_per_48">.155</td></tr>


def extract_player_data(table_rows):
    """
    Extract and return the the desired information from the td elements within the table rows.
    :param: table_rows:  list of soup `tr` elements
    :return: list of player-year MVP finalist observations
    """
    # create the empty list to store the player data
    player_data = []
    
    for row in table_rows:  # for each row do the following

        # Get the text for each table data (td) element in the row
        player_rank = [td.get_text() for td in row.find_all("th")]
        player_list = [td.get_text() for td in row.find_all("td")]
        
        
        # there are some empty table rows, which are the repeated 
        # column headers in the table
        # we skip over those rows and and continue the for loop
        if not player_list:
            continue
            
        # Now append the data to list of data
        
        player_info = player_rank+player_list
        player_data.append(player_info)
        
    return player_data


# extract the data we want
data = extract_player_data(table_rows_finalist)

# and then store it in a DataFrame
example_player_df  = pd.DataFrame(data)
example_player_df.columns = column_headers_finalist
print("the MVP finalist dataframe has {} rows (player-year observations) and  {} columns".format(example_player_df.shape[0],
                                                                                        example_player_df.shape[1]))
example_player_df.tail(6)

the MVP finalist dataframe has 13 rows (player-year observations) and  20 columns


# Create an empty list that will contain all the dataframes
# (one dataframe for all finalist dataframes)
mvp_finalist_list = []

# a list to store any errors that may come up while scraping
errors_list = []


loop_start = datetime.now()
print(loop_start)

2019-05-25 15:10:57.567450


# The url template that we pass in the finalist year info
url_template = "https://www.basketball-reference.com/awards/awards_{year}.html#mvp"

# for each year from 1956 to (and including) 2018
for year in range(1956, 2019): 
    
    # Use try/except block to catch and inspect any urls that cause an error
    try:
        # get the MVP finalist data url
        url = url_template.format(year=year)

        # get the html
        html = urllib.request.urlopen(url)

        # create the BeautifulSoup object
        soup = BeautifulSoup(html, "lxml") 

        # get the column headers
        column_headers = [th.getText() for th in soup.findAll('th', limit=30)]
        column_headers = [s for s in column_headers if len(s) != 0]
        column_headers = column_headers[1:]
        column_headers = [e for e in column_headers if e not in ('Shooting', 'Advanced', 'Per Game')][:-4]
       
        # select the data from the table 
        table_rows = soup.find_all("tr")[2:]

        # extract the player data from the table rows
        player_data = extract_player_data(table_rows)
        
        # create the dataframe for the current year's mvp finalist data
        # subset to only include MVP finalists
        year_df = pd.DataFrame(player_data)
        year_df.columns = column_headers
        year_df = year_df.loc[year_df["Pts Max"]==year_df["Pts Max"].unique()[0]]

        # add the year of the MVP finalist data to the dataframe
        year_df.insert(0, "Year", year)
 
        # append the current dataframe to the list of dataframes
        mvp_finalist_list.append(year_df)
        
    except Exception as e:
        # Store the url and the error it causes in a list
        error =[url, e] 
        # then append it to the list of errors
        errors_list.append(error)


loop_stop = datetime.now()
print(loop_stop)
print("the loop took {}".format(loop_stop-loop_start))

2019-05-25 15:12:03.441050
the loop took 0:01:05.873600


print(len(errors_list))
errors_list

0

[]


print(type(mvp_finalist_list))
print(len(mvp_finalist_list))

<class 'list'>
63


mvp_finalist_list[0:1]

[   Year Rank          Player Age   Tm First Pts Won Pts Max  Share   G  ...   \
 0  1956    1      Bob Pettit  23  STL  33.0    33.0      80  0.413  72  ...    
 1  1956    2     Paul Arizin  27  PHW  21.0    21.0      80  0.263  72  ...    
 2  1956    3       Bob Cousy  27  BOS  11.0    11.0      80  0.138  72  ...    
 3  1956    4    Mel Hutchins  27  FTW   9.0     9.0      80  0.113  66  ...    
 4  1956   5T   Dolph Schayes  27  SYR   2.0     2.0      80  0.025  72  ...    
 5  1956   5T    Bill Sharman  29  BOS   2.0     2.0      80  0.025  72  ...    
 6  1956   7T        Tom Gola  23  PHW   1.0     1.0      80  0.013  68  ...    
 7  1956   7T  Maurice Stokes  22  ROC   1.0     1.0      80  0.013  67  ...    
 
     PTS   TRB  AST STL BLK   FG% 3P%   FT%    WS WS/48  
 0  25.7  16.2  2.6          .429      .736  13.8  .236  
 1  24.2   7.5  2.6          .448      .810  12.2  .214  
 2  18.8   6.8  8.9          .360      .844   6.8  .119  
 3  12.0   7.5  2.7          .425      .643   4.4  .095  
 4  20.4  12.4  2.8          .387      .858  11.8  .225  
 5  19.9   3.6  4.7          .438      .867   8.8  .157  
 6  10.8   9.1  5.9          .412      .733   6.5  .132  
 7  16.8  16.3  4.9          .354      .714   6.0  .125  
 
 [8 rows x 21 columns]]


column_headers_finalist.insert(0, "Year")
print(column_headers_finalist)
print(len(column_headers_finalist))

['Year', 'Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']
21


# store all finalist data in one DataFrame
mvp_finalist_df = pd.concat(mvp_finalist_list, axis=0)
print(mvp_finalist_df.shape)

(972, 21)


# Take a look at the first row
mvp_finalist_df.iloc[0]

Year             1956
Rank                1
Player     Bob Pettit
Age                23
Tm                STL
First            33.0
Pts Won          33.0
Pts Max            80
Share           0.413
G                  72
MP               38.8
PTS              25.7
TRB              16.2
AST               2.6
STL                  
BLK                  
FG%              .429
3P%                  
FT%              .736
WS               13.8
WS/48            .236
Name: 0, dtype: object


os.makedirs('../data/raw_data', exist_ok=True)
os.makedirs('../data/clean_data', exist_ok=True)


mvp_finalist_df.head()


# Write out the  MVP data and MVP finalist data to the raw_data folder in the data folder
mvp_data_df.to_csv("../data/raw_data/mvp_data_df_raw.csv", index=False)
mvp_finalist_df.to_csv("../data/raw_data/mvp_finalist_df_raw.csv", index=False)


mvp_data_df_clean = pd.read_csv("../data/raw_data/mvp_data_df_raw.csv", encoding = "Latin-1")
mvp_finalist_df_clean = pd.read_csv("../data/raw_data/mvp_finalist_df_raw.csv", encoding = "Latin-1")


mvp_data_df_clean.head()


mvp_data_columns_dict = {'Season':'season', 'Lg':'league', 'Player':'player', 'Voting': 'voting', 'Tm': 'team', 'Age': 'age',
                         'G': 'games_played', 'MP': 'avg_minutes', 'PTS': 'avg_points',  'TRB': 'avg_rebounds', 
                         'AST': 'avg_assists', 'STL': 'avg_steals', 'BLK': 'avg_blocks', 'FG%': 'field_goal_pct',
                         '3P%': 'three_pt_pct', 'FT%': 'free_throw_pct', 'WS': 'win_shares',
                         'WS/48': 'win_shares_per_48'
                        }
mvp_finalist_columns_dict = {'Year':'year', 'Player':'player', 'Rank': 'rank', 'Tm': 'team', 'Age': 'age',
                             'First': 'first_place_votes', 'Pts Won': 'points_won', 'Pts Max': 'points_max',
                             'Share':'vote_share', 'G': 'games_played', 'MP': 'avg_minutes', 'PTS': 'avg_points',
                             'TRB': 'avg_rebounds', 'AST': 'avg_assists', 'STL': 'avg_steals', 'BLK': 'avg_blocks',
                             'FG%': 'field_goal_pct',  '3P%': 'three_pt_pct', 'FT%': 'free_throw_pct', 'WS': 'win_shares',
                             'WS/48': 'win_shares_per_48'
                              }


mvp_data_df_clean.rename(index=str,columns=mvp_data_columns_dict, inplace=True)
mvp_data_df_clean.head()


mvp_finalist_df_clean.rename(index=str,columns=mvp_finalist_columns_dict, inplace=True)
mvp_finalist_df_clean.head()


# convert the data to proper numeric types
mvp_data_df_clean = mvp_data_df_clean.apply(pd.to_numeric, errors="ignore")
mvp_data_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113 entries, 0 to 112
Data columns (total 18 columns):
season               113 non-null object
league               113 non-null object
player               113 non-null object
voting               73 non-null object
age                  73 non-null float64
team                 73 non-null object
games_played         73 non-null float64
avg_minutes          73 non-null float64
avg_points           73 non-null float64
avg_rebounds         73 non-null float64
avg_assists          73 non-null float64
avg_steals           50 non-null float64
avg_blocks           50 non-null float64
field_goal_pct       73 non-null float64
three_pt_pct         48 non-null float64
free_throw_pct       73 non-null float64
win_shares           73 non-null float64
win_shares_per_48    73 non-null float64
dtypes: float64(13), object(5)
memory usage: 16.8+ KB


# convert the data to proper numeric types
mvp_finalist_df_clean = mvp_finalist_df_clean.apply(pd.to_numeric, errors="ignore")
mvp_finalist_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 972 entries, 0 to 971
Data columns (total 21 columns):
year                 972 non-null int64
rank                 972 non-null object
player               972 non-null object
age                  972 non-null int64
team                 972 non-null object
first_place_votes    972 non-null float64
points_won           972 non-null float64
points_max           972 non-null int64
vote_share           972 non-null float64
games_played         972 non-null int64
avg_minutes          972 non-null float64
avg_points           972 non-null float64
avg_rebounds         972 non-null float64
avg_assists          972 non-null float64
avg_steals           756 non-null float64
avg_blocks           756 non-null float64
field_goal_pct       972 non-null float64
three_pt_pct         621 non-null float64
free_throw_pct       972 non-null float64
win_shares           972 non-null float64
win_shares_per_48    972 non-null float64
dtypes: float64(14), int64(4), object(3)
memory usage: 167.1+ KB


# Get the column names for the numeric columns
num_cols_mvp = mvp_data_df_clean.columns[mvp_data_df_clean.dtypes != object]
num_cols_finalist = mvp_finalist_df_clean.columns[mvp_finalist_df_clean.dtypes != object]

# Replace all NaNs with 0
mvp_data_df_clean.loc[:, num_cols_mvp] = mvp_data_df_clean.loc[:, num_cols_mvp].fillna(0)
mvp_finalist_df_clean.loc[:, num_cols_finalist] = mvp_finalist_df_clean.loc[:, num_cols_finalist].fillna(0)


mvp_finalist_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 972 entries, 0 to 971
Data columns (total 21 columns):
year                 972 non-null int64
rank                 972 non-null object
player               972 non-null object
age                  972 non-null int64
team                 972 non-null object
first_place_votes    972 non-null float64
points_won           972 non-null float64
points_max           972 non-null int64
vote_share           972 non-null float64
games_played         972 non-null int64
avg_minutes          972 non-null float64
avg_points           972 non-null float64
avg_rebounds         972 non-null float64
avg_assists          972 non-null float64
avg_steals           972 non-null float64
avg_blocks           972 non-null float64
field_goal_pct       972 non-null float64
three_pt_pct         972 non-null float64
free_throw_pct       972 non-null float64
win_shares           972 non-null float64
win_shares_per_48    972 non-null float64
dtypes: float64(14), int64(4), object(3)
memory usage: 167.1+ KB


mvp_data_df_clean = mvp_data_df_clean.loc[mvp_data_df_clean["league"]=="NBA"]
mvp_data_df_clean = mvp_data_df_clean[pd.notnull(mvp_data_df_clean['team'])]


mvp_data_df_clean = mvp_data_df_clean[pd.notnull(mvp_data_df_clean['player'])]
mvp_data_df_clean.sort_values(['season'], ascending=False, axis=0, inplace=True)


mvp_finalist_df_clean = mvp_finalist_df_clean[pd.notnull(mvp_finalist_df_clean['player'])]
mvp_finalist_df_clean.sort_values(['year', 'points_won'], ascending=False, axis=0, inplace=True)


mvp_data_df_clean.to_csv("../data/clean_data/mvp_data_df_clean.csv", index=False)
print(mvp_data_df_clean.shape)
mvp_data_df_clean.head()

(63, 18)


mvp_finalist_df_clean.to_csv("../data/clean_data/mvp_finalist_df_clean.csv", index=False)
print(mvp_finalist_df_clean.shape)
mvp_finalist_df_clean.head()

(972, 21)


import sys
import bs4
print(f'last updated: {datetime.now().strftime("%Y-%m-%d %H:%M")} \n')
print(f"System and module version information: \n")
print(f"Python version: {sys.version_info}")
print(f"urllib.request version: {urllib.request.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"Beautiful Soup version: {bs4.__version__}")

last updated: 2019-05-27 22:35 

System and module version information: 

Python version: sys.version_info(major=3, minor=7, micro=1, releaselevel='final', serial=0)
urllib.request version: 3.7
pandas version: 0.23.4
Beautiful Soup version: 4.6.3

	Season	Lg	Player	Voting	Age	Tm	G	MP	PTS	TRB	AST	STL	BLK	FG%	3P%	FT%	WS	WS/48
0	2017-18	NBA	James Harden	(V)	28.0	HOU	72.0	35.4	30.4	5.4	8.8	1.8	0.7	0.449	0.367	0.858	15.4	0.289
1	2016-17	NBA	Russell Westbrook	(V)	28.0	OKC	81.0	34.6	31.6	10.7	10.4	1.6	0.4	0.425	0.343	0.845	13.1	0.224
2	2015-16	NBA	Stephen Curry	(V)	27.0	GSW	79.0	34.2	30.1	5.4	6.7	2.1	0.2	0.504	0.454	0.908	17.9	0.318
3	2014-15	NBA	Stephen Curry	(V)	26.0	GSW	80.0	32.7	23.8	4.3	7.7	2.0	0.2	0.487	0.443	0.914	15.7	0.288
4	2013-14	NBA	Kevin Durant	(V)	25.0	OKC	81.0	38.5	32.0	7.4	5.5	1.3	0.7	0.503	0.391	0.873	19.2	0.295

	season	league	player	voting	age	team	games_played	avg_minutes	avg_points	avg_rebounds	avg_assists	avg_steals	avg_blocks	field_goal_pct	three_pt_pct	free_throw_pct	win_shares	win_shares_per_48
0	2017-18	NBA	James Harden	(V)	28.0	HOU	72.0	35.4	30.4	5.4	8.8	1.8	0.7	0.449	0.367	0.858	15.4	0.289
1	2016-17	NBA	Russell Westbrook	(V)	28.0	OKC	81.0	34.6	31.6	10.7	10.4	1.6	0.4	0.425	0.343	0.845	13.1	0.224
2	2015-16	NBA	Stephen Curry	(V)	27.0	GSW	79.0	34.2	30.1	5.4	6.7	2.1	0.2	0.504	0.454	0.908	17.9	0.318
3	2014-15	NBA	Stephen Curry	(V)	26.0	GSW	80.0	32.7	23.8	4.3	7.7	2.0	0.2	0.487	0.443	0.914	15.7	0.288
4	2013-14	NBA	Kevin Durant	(V)	25.0	OKC	81.0	38.5	32.0	7.4	5.5	1.3	0.7	0.503	0.391	0.873	19.2	0.295

	season	league	player	voting	age	team	games_played	avg_minutes	avg_points	avg_rebounds	avg_assists	avg_steals	avg_blocks	field_goal_pct	three_pt_pct	free_throw_pct	win_shares	win_shares_per_48
0	2017-18	NBA	James Harden	(V)	28.0	HOU	72.0	35.4	30.4	5.4	8.8	1.8	0.7	0.449	0.367	0.858	15.4	0.289
1	2016-17	NBA	Russell Westbrook	(V)	28.0	OKC	81.0	34.6	31.6	10.7	10.4	1.6	0.4	0.425	0.343	0.845	13.1	0.224
2	2015-16	NBA	Stephen Curry	(V)	27.0	GSW	79.0	34.2	30.1	5.4	6.7	2.1	0.2	0.504	0.454	0.908	17.9	0.318
3	2014-15	NBA	Stephen Curry	(V)	26.0	GSW	80.0	32.7	23.8	4.3	7.7	2.0	0.2	0.487	0.443	0.914	15.7	0.288
4	2013-14	NBA	Kevin Durant	(V)	25.0	OKC	81.0	38.5	32.0	7.4	5.5	1.3	0.7	0.503	0.391	0.873	19.2	0.295

	year	rank	player	age	team	first_place_votes	points_won	points_max	vote_share	games_played	...	avg_points	avg_rebounds	avg_assists	avg_steals	avg_blocks	field_goal_pct	three_pt_pct	free_throw_pct	win_shares	win_shares_per_48
959	2018	1	James Harden	28	HOU	86.0	965.0	1010	0.955	72	...	30.4	5.4	8.8	1.8	0.7	0.449	0.367	0.858	15.4	0.289
960	2018	2	LeBron James	33	CLE	15.0	738.0	1010	0.731	82	...	27.5	8.6	9.1	1.4	0.9	0.542	0.367	0.731	14.0	0.221
961	2018	3	Anthony Davis	24	NOP	0.0	445.0	1010	0.441	75	...	28.1	11.1	2.3	1.5	2.6	0.534	0.340	0.828	13.7	0.241
962	2018	4	Damian Lillard	27	POR	0.0	207.0	1010	0.205	73	...	26.9	4.5	6.6	1.1	0.4	0.439	0.361	0.916	12.6	0.227
963	2018	5	Russell Westbrook	29	OKC	0.0	76.0	1010	0.075	80	...	25.4	10.1	10.3	1.8	0.3	0.449	0.298	0.737	10.1	0.166

NBA MVP Comparisons - Part 1

NBA MVP Comparisons¶

Part 1¶

Outline¶

Scraping the Column Headers¶

Scraping the Data¶

Scraping the Data for All MVP Finalists Since 1956¶

Cleaning the Data¶

Cleaning Up the Rest of the Data¶

Review¶

	Rank	Player	Age	Tm	Pts Won	Pts Max	Share	G	MP	PTS	TRB	AST	STL	BLK	FG%	3P%	FT%	WS	WS/48
7	8	DeMar DeRozan	28	TOR	32.0	1010	0.032	80	33.9	23.0	3.9	5.2	1.1	0.3	.456	.310	.825	9.6	.170
8	9	LaMarcus Aldridge	32	SAS	6.0	1010	0.006	75	33.5	23.1	8.5	2.0	0.6	1.2	.510	.293	.837	10.9	.209
9	10T	Jimmy Butler	28	MIN	5.0	1010	0.005	59	36.7	22.2	5.3	4.9	2.0	0.4	.474	.350	.854	8.9	.198
10	10T	Stephen Curry	29	GSW	5.0	1010	0.005	51	32.0	26.4	5.1	6.1	1.6	0.2	.495	.423	.921	9.1	.267
11	12	Joel Embiid	23	PHI	4.0	1010	0.004	63	30.3	22.9	11.0	3.2	0.6	1.8	.483	.308	.769	6.2	.155
12	13	Victor Oladipo	25	IND	2.0	1010	0.002	75	34.0	23.1	5.2	4.3	2.4	0.8	.477	.371	.799	8.2	.155

	Year	Rank	Player	Age	Tm	First	Pts Won	Pts Max	Share	G	...	PTS	TRB	AST	FG%	FT%	WS	WS/48
0	1956	1	Bob Pettit	23	STL	33.0	33.0	80	0.413	72	...	25.7	16.2	2.6	.429	.736	13.8	.236
1	1956	2	Paul Arizin	27	PHW	21.0	21.0	80	0.263	72	...	24.2	7.5	2.6	.448	.810	12.2	.214
2	1956	3	Bob Cousy	27	BOS	11.0	11.0	80	0.138	72	...	18.8	6.8	8.9	.360	.844	6.8	.119
3	1956	4	Mel Hutchins	27	FTW	9.0	9.0	80	0.113	66	...	12.0	7.5	2.7	.425	.643	4.4	.095
4	1956	5T	Dolph Schayes	27	SYR	2.0	2.0	80	0.025	72	...	20.4	12.4	2.8	.387	.858	11.8	.225