import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
#CSV File was uploaded to google drive to be directly read by python code. See references page for original location of data.
#The file id is the second-to-last path segment of the sharing link; it is spliced into a direct-download URL.
url = 'https://drive.google.com/file/d/1t1F5JVKm8I8aoEhMKRr4HMZO9thiJrCG/view?usp=sharing'
file_id = url.rsplit('/', 2)[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'
df = pd.read_csv(path)
#Show what was loaded: the column names first, then the (rows, columns) shape.
for loaded_summary in (df.columns, df.shape):
    print(loaded_summary)
#Output:
#Index(['gameId', 'blueWins', 'blueWardsPlaced', 'blueWardsDestroyed',
#       'blueFirstBlood', 'blueKills', 'blueDeaths', 'blueAssists',
#       'blueEliteMonsters', 'blueDragons', 'blueHeralds',
#       'blueTowersDestroyed', 'blueTotalGold', 'blueAvgLevel',
#       'blueTotalExperience', 'blueTotalMinionsKilled',
#       'blueTotalJungleMinionsKilled', 'blueGoldDiff', 'blueExperienceDiff',
#       'blueCSPerMin', 'blueGoldPerMin', 'redWardsPlaced', 'redWardsDestroyed',
#       'redFirstBlood', 'redKills', 'redDeaths', 'redAssists',
#       'redEliteMonsters', 'redDragons', 'redHeralds', 'redTowersDestroyed',
#       'redTotalGold', 'redAvgLevel', 'redTotalExperience',
#       'redTotalMinionsKilled', 'redTotalJungleMinionsKilled', 'redGoldDiff',
#       'redExperienceDiff', 'redCSPerMin', 'redGoldPerMin'],
#      dtype='object')
#(9879, 40)
#Creating a new column to display the winning team
#A blueWins value of 1 is labelled 'Blue'; every other value is labelled 'Red'.
#Built with a comprehension rather than appending inside a loop, so there is no
#nested block structure to get wrong and the list is created in a single pass.
teamwon = ['Blue' if blue_won == 1 else 'Red' for blue_won in df['blueWins']]
df['TeamWon'] = teamwon
#Separate datasets for the games blue won and the games blue lost
blue_won_mask = df['blueWins'] == 1
blue_lost_mask = df['blueWins'] == 0
dfwins = df[blue_won_mask]
dflosses = df[blue_lost_mask]
#Smaller random sample of 50 games (without replacement, fixed seed so it is reproducible) out of ~10k
dfsmaller = df.sample(n=50, replace=False, random_state=1)
#Split the random sample by outcome. This happens before the index reset below,
#so both halves still carry the row labels of the full dataset at this point.
dfsmallerwins = dfsmaller[dfsmaller['blueWins'] == 1]
dfsmallerlosses = dfsmaller[dfsmaller['blueWins'] == 0]
#Move the original row labels into an 'index' column to better call individual games
dfsmaller = dfsmaller.reset_index()
dfsmaller.head()
dfsmallerwins = dfsmallerwins.reset_index()
#First drop the games in which either team killed more than 1 Elite Monster
#(i.e. the same team took both Dragon and Herald); the remaining games are kept in dfOneElite.
more_than_one_elite = (df['blueEliteMonsters'] > 1) | (df['redEliteMonsters'] > 1)
dfMoreElite = df[more_than_one_elite]
indexlist = dfMoreElite.index
dfOneElite = df.drop(index=indexlist)
#Histogram of jungle minions slain in the 50-game sample, one figure per team.
#Each entry is (x-axis label, figure title, column to plot); blue is drawn first, then red.
histogram_specs = [
    ('Blue Total Jungle Minions Slain',
     "Histogram of Blue Jungle Minions Slain over 50 Games",
     'blueTotalJungleMinionsKilled'),
    ('Red Total Jungle Minions Slain',
     "Histogram of Red Jungle Minions Slain over 50 Games",
     'redTotalJungleMinionsKilled'),
]
for axis_label, figure_title, jungle_column in histogram_specs:
    plt.xlabel(xlabel=axis_label)
    plt.title(figure_title)
    #Same fixed tick positions for both teams so the two figures are directly comparable
    plt.xticks(range(0, 100, 10))
    plt.yticks(range(0, 20, 1))
    sns.histplot(data=dfsmaller, x=jungle_column)
    plt.show()
#Examining Team v Team Jungle Minion Kills
plt.title("Scatterplot of Jungle Minions Slain and Wins")
plt.xlabel(xlabel= 'Blue Total Jungle Minions Slain')
plt.ylabel(ylabel= 'Red Total Jungle Minions Slain')
plt.grid()
#Colours are tied to the winner by name. A plain list palette is matched to the hue
#levels in the order seaborn encounters them, so a sample whose first game is a Red
#win would have drawn Red wins in blue and Blue wins in red.
winner_palette = {'Blue': '#0000FF', 'Red': '#FF0000'}
sns.scatterplot(y='redTotalJungleMinionsKilled', x='blueTotalJungleMinionsKilled',data=dfsmaller, hue='TeamWon',palette=winner_palette)
plt.show()
#We can see a lot of games are centered around ~50 minions killed for both teams.
#Examining the winning team's distribution of total Jungle Minions killed within 10 minutes.
#In the games blue won the winner's count is the blue column; in the games blue lost it is the red column.
blue_winner_summary = dfsmallerwins['blueTotalJungleMinionsKilled'].describe()
red_winner_summary = dfsmallerlosses['redTotalJungleMinionsKilled'].describe()
print(blue_winner_summary)
print(red_winner_summary)
#Output:
#count    29.000000
#mean     54.068966
#std       9.728203
#min      29.000000
#25%      49.000000
#50%      55.000000
#75%      60.000000
#max      76.000000
#Name: blueTotalJungleMinionsKilled, dtype: float64
#count    21.000000
#mean     51.142857
#std       9.451379
#min      34.000000
#25%      44.000000
#50%      49.000000
#75%      57.000000
#max      69.000000
#Name: redTotalJungleMinionsKilled, dtype: float64
#Creating a 95% confidence interval for the population mean Jungle Minion Kill Score
#of winning Diamond junglers, estimated from the blue wins in the 50-game sample.
samples, col= dfsmallerwins.shape
mean =dfsmallerwins['blueTotalJungleMinionsKilled'].mean()
std =dfsmallerwins['blueTotalJungleMinionsKilled'].std()
#Standard error of the mean: sample standard deviation over sqrt(n)
stderr = std/(samples**0.5)
#Two-sided Student-t interval with n-1 degrees of freedom, centred on the sample mean.
#The confidence level stays positional because its keyword name differs between SciPy releases.
#The bounds are converted to built-in floats so the tuple prints as plain numbers;
#NumPy 2 would otherwise render each bound as np.float64(...) inside the message.
CI = tuple(float(bound) for bound in st.t.interval(0.95, df=samples-1, loc=mean, scale=stderr))
print('The average Jungle Minion Kill Score by 10 minutes is', round(mean,0))
print('The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately', round(std,0))
print(f'We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between \n {CI} by the first 10 minutes.')
#Output:
#The average Jungle Minion Kill Score by 10 minutes is 54.0
#The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately 10.0
#We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between
# (50.368555047990505, 57.76937598649226) by the first 10 minutes.
#Examine Outliers and Spread of blue's jungle minions slain across every game blue won
spread_axes = sns.boxplot(data=dfwins, x='blueTotalJungleMinionsKilled', color='y')
spread_axes.set_xlabel('Blue Total Jungle Minions Slain')
spread_axes.set_title('Blue Jungle Minions Slain Spread')
plt.show()
#Pull the eight describe() statistics out by label instead of relying on their position
jungle_summary = dfwins['blueTotalJungleMinionsKilled'].describe()
count, mean, std, mini, q1, q2, q3, maxi = (
    float(jungle_summary[statistic])
    for statistic in ('count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max')
)
iqr = q3 - q1
for caption, value in (('Q1=', q1), ('Median=', q2), ('Q3=', q3), ('IQR =', iqr)):
    print(caption, value)
#Fences at 1.5 * IQR beyond the quartiles; values outside them are treated as outliers
lower = q1 - 1.5*iqr
upper = q3 + 1.5*iqr
for caption, value in (('Lower Bound=', lower), ('Upper Bound=', upper)):
    print(caption, value)
#Output:
#Q1= 44.0
#Median= 52.0
#Q3= 59.0
#IQR = 15.0
#Lower Bound= 21.5
#Upper Bound= 81.5
#Create a copy of the blue-win dataset with the outliers removed:
#a game is an outlier when blue's jungle minions slain fall outside the [lower, upper] fences.
blue_jungle_kills = dfwins['blueTotalJungleMinionsKilled']
outside_fences = (blue_jungle_kills < lower) | (blue_jungle_kills > upper)
dfwinsOutliers = dfwins[outside_fences]
indexlist = dfwinsOutliers.index
dfwinsNoOutliers = dfwins.drop(index=indexlist)
#Creating the same 95% confidence interval, this time from every game blue won
#after the outliers have been removed (dfwinsNoOutliers), not from the 50-game sample.
samples, col= dfwinsNoOutliers.shape
mean =dfwinsNoOutliers['blueTotalJungleMinionsKilled'].mean()
std =dfwinsNoOutliers['blueTotalJungleMinionsKilled'].std()
#Standard error of the mean: sample standard deviation over sqrt(n)
stderr = std/(samples**0.5)
#Two-sided Student-t interval with n-1 degrees of freedom, centred on the sample mean.
#The confidence level stays positional because its keyword name differs between SciPy releases.
#The bounds are converted to built-in floats so the tuple prints as plain numbers;
#NumPy 2 would otherwise render each bound as np.float64(...) inside the message.
CI = tuple(float(bound) for bound in st.t.interval(0.95, df=samples-1, loc=mean, scale=stderr))
print('The size of winning games by blue team is', samples)
print('The average Jungle Minion Kill Score by 10 minutes is', round(mean,0))
print('The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately', round(std,0))
print(f'We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between \n {CI} by the first 10 minutes.')
#Output:
#The size of winning games by blue team is 4916
#The average Jungle Minion Kill Score by 10 minutes is 52.0
#The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately 10.0
#We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between
# (51.50050423620725, 52.05035011692537) by the first 10 minutes.
#Focus on Dragons killed by both teams out of the entire sample.
blueonedragon= dfOneElite.loc[dfOneElite['blueDragons'] == 1]
blueonedragoncount= blueonedragon['TeamWon'].value_counts()
redonedragon= dfOneElite.loc[(dfOneElite['redDragons'] == 1)]
redonedragoncount= redonedragon['TeamWon'].value_counts()
samples, col= dfOneElite.shape
print('Total Games', samples )
print('Wins based off of blue getting dragon first, red did not \n', blueonedragoncount)
print('Wins based off of red getting dragon first, blue did not \n',redonedragoncount)
#The pie shares are derived from the counts above instead of being typed in from a
#previous run, so they stay correct if the data or the filtering changes.
#A "win" is a game won by the team that took the dragon; a "loss" is a game it lost.
#.get(..., 0) covers the case where one outcome never occurs in a group.
dragon_wins = blueonedragoncount.get('Blue', 0) + redonedragoncount.get('Red', 0)
dragon_losses = blueonedragoncount.get('Red', 0) + redonedragoncount.get('Blue', 0)
#Every remaining game is counted as one in which neither team took the dragon
no_dragon_games = samples - dragon_wins - dragon_losses
data = [dragon_wins/samples, dragon_losses/samples, no_dragon_games/samples]
labels = ['Wins getting Dragon first','Losses getting Dragon First','Neither Team got dragon within 10 minutes']
plt.title('Wins based on Getting Dragon within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
#Output:
#Total Games 8439
#Wins based off of blue getting dragon first, red did not
# Blue    1770
#Red     1096
#Name: TeamWon, dtype: int64
#Wins based off of red getting dragon first, blue did not
# Red     2033
#Blue    1318
#Name: TeamWon, dtype: int64
#Same comparison restricted to the games in which a dragon was taken.
#The shares are derived from the value counts instead of being typed in from a previous run.
#.get(..., 0) covers the case where one outcome never occurs in a group.
dragon_wins = blueonedragoncount.get('Blue', 0) + redonedragoncount.get('Red', 0)
dragon_losses = blueonedragoncount.get('Red', 0) + redonedragoncount.get('Blue', 0)
dragon_games = dragon_wins + dragon_losses
data = [dragon_wins/dragon_games, dragon_losses/dragon_games]
labels = ['Wins getting Dragon first','Losses getting Dragon first']
plt.title('Wins based on Getting Dragon within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
#Focus on Heralds killed by both teams out of the entire sample.
blueoneherald= dfOneElite.loc[dfOneElite['blueHeralds'] == 1]
blueoneheraldcount= blueoneherald['TeamWon'].value_counts()
redoneherald= dfOneElite.loc[(dfOneElite['redHeralds'] == 1)]
redoneheraldcount= redoneherald['TeamWon'].value_counts()
samples, col= dfOneElite.shape
print('Total Games', samples )
print('Wins based off of blue getting Herald first, red did not \n', blueoneheraldcount)
print('Wins based off of red getting Herald first, blue did not \n',redoneheraldcount)
#The pie shares are derived from the counts above instead of being typed in from a
#previous run, so they stay correct if the data or the filtering changes.
#A "win" is a game won by the team that took the Herald; a "loss" is a game it lost.
#.get(..., 0) covers the case where one outcome never occurs in a group.
herald_wins = blueoneheraldcount.get('Blue', 0) + redoneheraldcount.get('Red', 0)
herald_losses = blueoneheraldcount.get('Red', 0) + redoneheraldcount.get('Blue', 0)
#Every remaining game is counted as one in which neither team took the Herald
no_herald_games = samples - herald_wins - herald_losses
data = [herald_wins/samples, herald_losses/samples, no_herald_games/samples]
labels = ['Wins getting Herald first','Losses getting Herald first','Neither Team got Herald within 10 minutes']
plt.title('Wins based on Getting Herald within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
#Output:
#Total Games 8439
#Wins based off of blue getting Herald first, red did not
# Blue    583
#Red     564
#Name: TeamWon, dtype: int64
#Wins based off of red getting Herald first, blue did not
# Red     447
#Blue    404
#Name: TeamWon, dtype: int64
#Same comparison restricted to the games in which a Herald was taken.
#The shares are derived from the value counts instead of being typed in from a previous run.
#.get(..., 0) covers the case where one outcome never occurs in a group.
herald_wins = blueoneheraldcount.get('Blue', 0) + redoneheraldcount.get('Red', 0)
herald_losses = blueoneheraldcount.get('Red', 0) + redoneheraldcount.get('Blue', 0)
herald_games = herald_wins + herald_losses
data = [herald_wins/herald_games, herald_losses/herald_games]
labels = ['Wins getting Herald first','Losses getting Herald first']
plt.title('Wins based on Getting Herald within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()