# 探索各个 NBA 球员

Python、pandas 和少量 R 代码

## 创建一个统一的数据帧（警告：前路艰辛！）

#### 清单 1.设置 Jupyter Notebook 并加载数据帧

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

#### 清单 2.修改贡献值数据帧的列中的错误数据

# Rename the name column so this frame shares the "PLAYER" key used elsewhere.
plus_minus_df.rename(columns={"NAME": "PLAYER"}, inplace=True)
# Keep only the text before the first comma of each entry; whatever follows
# the comma is discarded. Indexing [0] (rather than two-name unpacking) also
# tolerates entries that contain no comma or more than one.
players = [player.split(",")[0] for player in plus_minus_df["PLAYER"]]
# Replace the raw column with the cleaned names.
plus_minus_df.drop(["PLAYER"], inplace=True, axis=1)
plus_minus_df["PLAYER"] = players

#### 清单 3.重命名并合并 basketball reference 数据帧

# Build the combined frame from br_stats_df without modifying it: rename the
# key columns, drop G / GS / TEAM, then inner-join the plus/minus data on PLAYER.
nba_players_df = (
    br_stats_df
    .rename(columns={'Player': 'PLAYER', 'Pos': 'POSITION', 'Tm': "TEAM", 'Age': 'AGE'})
    .drop(["G", "GS", "TEAM"], axis=1)
    .merge(plus_minus_df, how="inner", on="PLAYER")
)

#### 清单 4.清理并合并 PIE 字段

# Take only the join key plus the PIE and PACE columns from pie_df (as a copy),
# then inner-join them onto the combined player frame.
pie_df_subset = pie_df.loc[:, ["PLAYER", "PIE", "PACE"]].copy()
nba_players_df = pd.merge(nba_players_df, pie_df_subset, how="inner", on="PLAYER")

#### 清单 5.清理薪资数据

# Both operations mutate salary_df in place: rename the key column to PLAYER,
# then remove the POSITION and TEAM columns.
salary_df.rename(columns={"NAME": "PLAYER"}, inplace=True)
salary_df.drop(columns=["POSITION", "TEAM"], inplace=True)

#### 清单 6.寻找缺失记录并合并它们

# Players present in the stats frame but absent from the salary frame.
missing_salary = set(nba_players_df["PLAYER"].tolist()) - set(salary_df["PLAYER"].tolist())
diff = list(missing_salary)
len(diff)
# Out[45]:  111

# DataFrame.merge defaults to an inner join on the column names the two frames
# share, so the players counted above do not appear in the merged frame.
nba_players_with_salary_df = nba_players_df.merge(salary_df)

#### 清单 7.使用 Seaborn lmplot 绘制薪资与 WINS_RPM 之间的关系

# Scatter plot with a fitted regression line: salary (x) against WINS_RPM (y).
sns.lmplot(
    data=nba_players_with_salary_df,
    x="SALARY_MILLIONS",
    y="WINS_RPM",
)

#### 清单 8.对获胜和得分数据执行回归

# Ordinary least squares fit of W on POINTS using the statsmodels formula API,
# followed by the printed regression summary.
model = smf.ols('W ~POINTS', data=nba_players_with_salary_df)
results = model.fit()
print(results.summary())

#### 清单 9.Python ggplot

from ggplot import *

# Points plotted against WINS_RPM, coloured by salary.
point_aes = aes(x="POINTS", y="WINS_RPM", color="SALARY_MILLIONS")
p = ggplot(nba_players_with_salary_df, point_aes) + geom_point(size=200)
# The final expression is left bare so a notebook renders the finished plot.
(
    p
    + xlab("POINTS/GAME")
    + ylab("WINS/RPM")
    + ggtitle("NBA Players 2016-2017:  POINTS/GAME, WINS REAL PLUS MINUS and SALARY")
)

#### 抓取 NBA 球员的 Wikipedia 页面浏览量数据

1. 确定如何从 Wikipedia（或某个网站）获取数据
2. 确定如何以编程方式生成 Wikipedia 句柄
3. 将数据写入一个数据帧中并将它与剩余数据合并

#### 清单 10.Wikipedia，第 1 部分

"""
Example Route To Construct:

https://wikimedia.org/api/rest_v1/ +
metrics/pageviews/per-article/ +
en.wikipedia/all-access/user/ +
LeBron_James/daily/2015070100/2017070500 +

"""
import requests
import pandas as pd
import time
import wikipedia

# Common prefix of every per-article pageviews request issued below.
BASE_URL = (
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
    "en.wikipedia/all-access/user"
)


def construct_url(handle, period, start, end):
    """Constructs a URL based on arguments.

    Joins BASE_URL and the four arguments with "/" separators, e.g.
    BASE_URL + "/LeBron_James/daily/2015070100/2017070500".

    Args:
        handle: article handle, e.g. "LeBron_James".
        period: granularity segment, e.g. "daily".
        start: start timestamp string, e.g. "2015070100".
        end: end timestamp string, e.g. "2017070500".

    Returns:
        The full request URL as a string.
    """
    return "/".join([BASE_URL, handle, period, start, end])

def query_wikipedia_pageviews(url):
    """Issue an HTTP GET for ``url`` and return the response decoded as JSON.

    Args:
        url: fully constructed request URL.

    Returns:
        Whatever ``Response.json()`` produces for the reply body.
    """
    res = requests.get(url)
    return res.json()

def wikipedia_pageviews(handle, period, start, end):
    """Returns JSON.

    Builds the request URL from the arguments with construct_url and returns
    the decoded response from query_wikipedia_pageviews.
    """
    constructed_url = construct_url(handle, period, start, end)
    return query_wikipedia_pageviews(url=constructed_url)

#### 清单 10.Wikipedia，第 2 部分

def wikipedia_2016(handle, sleep=0):
    """Retrieve pageviews for 2016.

    Args:
        handle: article handle to query.
        sleep: seconds to pause before issuing the request.

    Returns:
        The decoded response (daily granularity, 2016010100 to 2016123100),
        or None when the response carries no "items" key.
    """
    print("SLEEP: {sleep}".format(sleep=sleep))
    time.sleep(sleep)
    pageviews = wikipedia_pageviews(
        handle=handle, period="daily", start="2016010100", end="2016123100"
    )
    if "items" not in pageviews:
        print("NO PAGEVIEWS: {handle}".format(handle=handle))
        return None
    return pageviews

def create_wikipedia_df(handles):
    """Creates a Dataframe of Pageviews.

    Args:
        handles: dict mapping a display name to its article handle.

    Returns:
        A DataFrame with one row per entry of each response's "items" list and
        the columns names, wikipedia_handles, pageviews and timestamps.
        Handles for which wikipedia_2016 returns None contribute no rows.
    """
    columns = ["names", "wikipedia_handles", "pageviews", "timestamps"]
    rows = []
    for name, handle in handles.items():
        pageviews_record = wikipedia_2016(handle)
        if pageviews_record is None:
            continue
        # Name and handle are repeated on every row so all columns stay the
        # same length.
        for record in pageviews_record["items"]:
            rows.append((name, handle, record["views"], record["timestamp"]))
    return pd.DataFrame(rows, columns=columns)

def create_wikipedia_handle(raw_handle):
    """Takes a raw handle and converts it to a wikipedia handle.

    Every space in ``raw_handle`` is replaced by an underscore; all other
    characters are left untouched.
    """
    return raw_handle.replace(" ", "_")

def create_wikipedia_nba_handle(name):
    """Return ``name`` followed by a " (basketball)" suffix.

    NOTE(review): the listing returned ``url`` without ever assigning it, so
    any call raised NameError. The assignment below is a reconstruction based
    on how the result is used (a second page-title attempt after the plain
    name fails to resolve) -- confirm against the original source.
    """
    url = " ".join([name, "(basketball)"])
    return url

#### 清单 10.Wikipedia，第 3 部分

def wikipedia_current_nba_roster():
    """Gets all links on wikipedia current roster page.

    Returns:
        dict mapping each link title on the roster page to its handle form
        (spaces replaced by underscores).

    NOTE(review): the listing fetched the page but returned nothing; the
    return value is reconstructed from the docstring -- confirm.
    """
    nba = wikipedia.page("List_of_current_NBA_team_rosters")
    return {link: create_wikipedia_handle(link) for link in nba.links}


def guess_wikipedia_nba_handle(data="data/nba_2017_br.csv"):
    """Attempt to get the correct wikipedia handle.

    Args:
        data: path of a CSV file with a "Player" column.

    Returns:
        (verified, guesses): ``verified`` maps players whose name appears
        among the roster-page links to that link's handle; ``guesses`` maps
        every other player to a handle derived from the name alone.

    NOTE(review): the listing used ``nba`` without defining it, never read
    ``data``, and had an ``else`` with no matching ``if``. The CSV load and
    the roster-membership test are reconstructed -- confirm.
    """
    links = wikipedia_current_nba_roster()
    nba = pd.read_csv(data)
    count = 0
    verified = {}
    guesses = {}
    for player in nba["Player"].values:
        if player in links:
            print(count)
            count += 1
            verified[player] = links[player]
        else:
            print("NO MATCH: {player}".format(player=player))
            guesses[player] = create_wikipedia_handle(player)
    return verified, guesses

#### 清单 10.Wikipedia，第 4 部分

def validate_wikipedia_guesses(guesses):
    """Validate guessed wikipedia accounts.

    Each guess is looked up as a page title. If that fails, the title from
    create_wikipedia_nba_handle is tried. A resolved page counts as verified
    only when its summary contains "NBA".

    Args:
        guesses: dict mapping a player name to a guessed handle.

    Returns:
        (verified, wrong): two dicts keyed by player name.

    NOTE(review): the listing had an empty ``try:``, a ``continue`` outside
    any loop, and never filled either dict. The loop, the first lookup and
    the dict writes are reconstructed -- confirm. In particular, recording
    the fallback handle (rather than the failed guess) when the fallback
    title resolves is a reviewer choice.
    """
    lookup_errors = (wikipedia.DisambiguationError, wikipedia.PageError)
    verified = {}
    wrong = {}
    for name, link in guesses.items():
        handle = link
        try:
            page = wikipedia.page(link)
        except lookup_errors as error:
            nba_handle = create_wikipedia_nba_handle(name)
            try:
                page = wikipedia.page(nba_handle)
            except lookup_errors as second_error:
                print("Second Match Failure: {error}".format(error=second_error))
                wrong[name] = link
                continue
            print("Initial wikipedia URL Failed: {error}".format(error=error))
            # Keep the handle of the page that actually resolved.
            handle = create_wikipedia_handle(nba_handle)
        if "NBA" in page.summary:
            verified[name] = handle
        else:
            print("NO GUESS MATCH: {name}".format(name=name))
            wrong[name] = handle
    return verified, wrong

def clean_wikipedia_handles(data="data/nba_2017_br.csv"):
    """Clean Handles.

    Runs the guess step on ``data``, validates the unverified guesses, prints
    the ones that failed validation, and returns a single name -> handle dict.
    On a duplicate key the entry from the validation step wins.
    """
    verified, guesses = guess_wikipedia_nba_handle(data=data)
    verified_cleaned, wrong = validate_wikipedia_guesses(guesses)
    print("WRONG Matches: {wrong}".format(wrong=wrong))
    return {**verified, **verified_cleaned}

def nba_wikipedia_dataframe(data="data/nba_2017_br.csv"):
    """Return the pageviews DataFrame for the players listed in ``data``.

    Handles come from clean_wikipedia_handles; the frame is built by
    create_wikipedia_df.
    """
    handles = clean_wikipedia_handles(data=data)
    return create_wikipedia_df(handles)

def create_wikipedia_csv(data="data/nba_2017_br.csv"):
    """Write the pageviews DataFrame for ``data`` to data/wikipedia_nba.csv."""
    df = nba_wikipedia_dataframe(data=data)
    df.to_csv("data/wikipedia_nba.csv")

if __name__ == "__main__":
    # Script entry point: build the CSV using the default input path.
    create_wikipedia_csv()

## 抓取 NBA 球员的 Twitter 互动数据

#### 清单 11.Twitter 提取元数据，第 1 部分

"""

df = stats_df(user="KingJames")
In [34]: df.describe()
Out[34]:
favorite_count  retweet_count
count      200.000000     200.000000
mean     11680.670000    4970.585000
std      20694.982228    9230.301069
min          0.000000      39.000000
25%       1589.500000     419.750000
50%       4659.500000    1157.500000
75%      13217.750000    4881.000000
max     128614.000000   70601.000000

In [35]: df.corr()
Out[35]:
favorite_count  retweet_count
favorite_count        1.000000       0.904623
retweet_count         0.904623       1.000000

"""

import time

from . import config
import pandas as pd
import numpy as np

def api_handler():

# NOTE(review): the line that opens the call these keyword arguments belong to
# (and that assigns its result to `api`) is not present in this listing, so the
# block does not parse as shown. Restore it from the original source.
# The three values below are read from attributes of the project-local
# `config` module.
consumer_secret=config.CONSUMER_SECRET,
access_token_key=config.ACCESS_TOKEN_KEY,
access_token_secret=config.ACCESS_TOKEN_SECRET)
# Returns the client object; the other functions in this listing call
# GetUserTimeline on it.
return api

def tweets_by_user(api, user, count=200):
    """Grabs the "n" number of tweets. Defaults to 200.

    Args:
        api: client object exposing ``GetUserTimeline(screen_name=, count=)``.
        user: screen name whose timeline is requested.
        count: number of tweets to request.

    Returns:
        Whatever ``api.GetUserTimeline`` returns for that request.
    """
    return api.GetUserTimeline(screen_name=user, count=count)

#### 清单 11.Twitter 提取元数据，第 2 部分

def stats_to_df(tweets):
    """Takes twitter stats and converts them to a dataframe.

    Args:
        tweets: iterable of objects with ``created_at``, ``user.screen_name``,
            ``retweet_count`` and ``favorite_count`` attributes.

    Returns:
        A DataFrame with one row per tweet and the columns created_at,
        screen_name, retweet_count and favorite_count.
    """
    records = [
        {
            "created_at": tweet.created_at,
            "screen_name": tweet.user.screen_name,
            "retweet_count": tweet.retweet_count,
            "favorite_count": tweet.favorite_count,
        }
        for tweet in tweets
    ]
    return pd.DataFrame(data=records)

def stats_df(user):
    """Returns a dataframe of stats.

    Creates an API client with api_handler, fetches the timeline for ``user``
    with tweets_by_user (default count), and converts it with stats_to_df.
    """
    api = api_handler()
    tweets = tweets_by_user(api, user)
    return stats_to_df(tweets)

"""yield handles"""

# NOTE(review): this is the body of a generator whose `def` line, the loop
# header that binds `handle`, and the `except` clause that binds `error` are
# missing from the listing; it does not parse as shown. Restore those lines
# from the original source.
time.sleep(sleep) #Avoid throttling in twitter api
try:
# Build the per-user stats frame for the current handle.
df = stats_df(handle)
# Presumably the failure branch: report the handle and error, then yield None
# in place of a frame -- confirm once the `except` line is restored.
print("Error {handle} and error msg {error}".format(
handle=handle,error=error))
df = None
yield df

# NOTE(review): this looks like the body of `median_engagement` (called just
# below this listing), but its `def` line, the statement that creates `nba`,
# and the loop header that binds `record` are missing; it does not parse as
# shown. Restore those lines from the original source.
# One entry is appended to each list per record, so the two stay aligned.
favorite_count = []
retweet_count = []
print(record)
# A None record is stored as NaN in both lists.
if record is None:
print("NO RECORD: {record}".format(record=record))
favorite_count.append(np.nan)
retweet_count.append(np.nan)
continue
try:
# Median of each engagement column of the record.
favorite_count.append(record['favorite_count'].median())
retweet_count.append(record["retweet_count"].median())
except KeyError as error:
# A record without the expected columns is also stored as NaN.
print("No values found to append {error}".format(error=error))
favorite_count.append(np.nan)
retweet_count.append(np.nan)

print("Creating DF")
# NOTE(review): nothing visible here attaches favorite_count / retweet_count to
# `nba` before it is returned -- presumably those lines were dropped too.
return nba

nba = median_engagement(data)

## 创建高级可视化

#### 清单 12.关联度热图

# Create the figure and its axes in one call and draw on that axes. Calling
# plt.axes() afterwards adds a second axes to the same figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Endorsement, Social Power, On-Court Performance, Team Valuation Correlation Heatmap:  2016-2017 Season")
# Correlate the numeric columns only. The frame also has a PLAYER column (it
# is used as labels in the next listing), and DataFrame.corr() raises on
# non-numeric data in pandas 2.0 and later.
corr = endorsements.select_dtypes([np.number]).corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap="copper",
    ax=ax,
)

#### 清单 13.关联度热图高级版

from matplotlib.colors import LogNorm

# Create the figure and its axes in one call and draw on that axes. Calling
# plt.axes() afterwards adds a second axes to the same figure.
fig, ax = plt.subplots(figsize=(20, 15))
# Show floats with three decimals whenever pandas prints them.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Logarithmic colour scaling for the heatmap cells.
norm = LogNorm()
# Plot the numeric columns only; the PLAYER column supplies the row labels.
grid = endorsements.select_dtypes([np.number])
ax.set_title("NBA Player Endorsement, Social Power, On-Court Performance, Team Valuation Heatmap:  2016-2017 Season")
sns.heatmap(
    grid,
    annot=True,
    yticklabels=endorsements["PLAYER"],
    fmt='g',
    cmap="Accent",
    cbar=False,
    norm=norm,
    ax=ax,
)

#### 清单 14.基于 R 的高级 ggplot

# ggplot2 expression over nba_players_stats with WINS_RPM on x and PAGEVIEWS on
# y: a smoothed trend, a blue-grey-red colour gradient with midpoint 15, text
# labels for a subset of players, and text annotations.
# NOTE(review): the listing is truncated. The aes( call on the first line is
# never closed, the title string is never closed, and the final annotate( has
# no label or closing parenthesis. Restore the missing text from the original
# source before running.
ggplot(nba_players_stats, aes(x=WINS_RPM, y=PAGEVIEWS,
geom_smooth() + scale_color_gradient2(low = "blue", mid = "grey", high =
"red", midpoint = 15) + labs(y="Wikipedia Median Daily Pageviews", x="WINS
Attributed to Player( WINS_RPM)", title = "Social Power NBA 2016-2017
Season: Wikipedia Daily Median Pageviews and Wins Attributed to Player
geom_text(vjust="inward",hjust="inward",color="black",size=4,check_overlap
= TRUE, data=subset(nba_players_stats, SALARY_MILLIONS > 25 | PAGEVIEWS
> 4500 | WINS_RPM > 15), aes(WINS_RPM,label=PLAYER )) +
annotate("text", x=8, y=13000, label= "NBA Fans Value Player Skill More
Than Salary, Points, Team Wins or Another Other Factor?", size=5) +
annotate("text", x=8, y=11000, label=paste("PAGEVIEWS/WINS Correlation:
28%"),size=4) + annotate("text", x=8, y=10000,
label=paste("PAGEVIEWS/Points Correlation 44%"),size=4) + annotate("text",
x=8, y=9000, label=paste("PAGEVIEWS/WINS_RPM Correlation: 49%"),size=4,
color="red") + annotate("text", x=8, y=8000,

## 结束语

• 支付给球员的薪资不是获胜与否的最佳预测指标。
• 球迷更喜欢与高技能球员（而不是高收入球员）进行互动。
• 代言收入与归功于球员的球队胜利次数相关，所以球员可能会谨慎选择转到哪个球队。
• 现场观看比赛的观众和通过社交媒体进行互动的观众之间似乎存在区别。现场观众会比较在乎其所关注球队的场上发挥。

#### 评论

static.content.url=http://www.ibm.com/developerworks/js/artrating/
SITE_ID=10
Zone=Big data and analytics, Open source
ArticleID=1050931
ArticleTitle=社会力量对 NBA 所产生的影响和表现，第 2 部分: 探索各个 NBA 球员
publish-date=10122017