In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
In [2]:
# The CSV file was uploaded to Google Drive so it can be read directly by this
# notebook. See the references page for the original location of the data.
url = 'https://drive.google.com/file/d/1t1F5JVKm8I8aoEhMKRr4HMZO9thiJrCG/view?usp=sharing'
# The file id is the second-to-last '/'-separated segment of the sharing link;
# it is appended to the direct-download endpoint so read_csv receives the file itself.
file_id = url.split('/')[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'
df = pd.read_csv(path)
print(df.columns)
print(df.shape)
Index(['gameId', 'blueWins', 'blueWardsPlaced', 'blueWardsDestroyed',
       'blueFirstBlood', 'blueKills', 'blueDeaths', 'blueAssists',
       'blueEliteMonsters', 'blueDragons', 'blueHeralds',
       'blueTowersDestroyed', 'blueTotalGold', 'blueAvgLevel',
       'blueTotalExperience', 'blueTotalMinionsKilled',
       'blueTotalJungleMinionsKilled', 'blueGoldDiff', 'blueExperienceDiff',
       'blueCSPerMin', 'blueGoldPerMin', 'redWardsPlaced', 'redWardsDestroyed',
       'redFirstBlood', 'redKills', 'redDeaths', 'redAssists',
       'redEliteMonsters', 'redDragons', 'redHeralds', 'redTowersDestroyed',
       'redTotalGold', 'redAvgLevel', 'redTotalExperience',
       'redTotalMinionsKilled', 'redTotalJungleMinionsKilled', 'redGoldDiff',
       'redExperienceDiff', 'redCSPerMin', 'redGoldPerMin'],
      dtype='object')
(9879, 40)
In [3]:
# Add a 'TeamWon' column naming the winner of each game:
# 'Blue' where blueWins is 1, 'Red' for every other value.
teamwon = ['Blue' if blue_result == 1 else 'Red' for blue_result in df['blueWins']]
df['TeamWon'] = teamwon
In [4]:
# Split the games by outcome from the blue team's point of view:
# dfwins holds the rows with blueWins == 1, dflosses the rows with blueWins == 0.
blue_result = df['blueWins']
dfwins = df[blue_result == 1]
dflosses = df[blue_result == 0]
In [5]:
# Draw a reproducible random sample of 50 games out of the ~10k available
# (random_state is fixed so every run selects the same games).
dfsmaller = df.sample(n=50, replace=False, random_state=1)

# Split the sample by blue-side outcome. reset_index() returns a NEW frame whose
# 'index' column keeps the row label from the full dataset, so individual games
# can still be looked up. Both halves are handled the same way, and nothing is
# mutated in place on a frame that was sliced from another frame.
dfsmallerwins = dfsmaller.loc[dfsmaller['blueWins'] == 1].reset_index()
dfsmallerlosses = dfsmaller.loc[dfsmaller['blueWins'] == 0].reset_index()

# Expose the original row label as a column on the full sample as well.
dfsmaller = dfsmaller.reset_index()
In [6]:
# Exclude games in which either team killed more than one elite monster,
# i.e. drop the samples that feature one team getting both Dragon and Herald.
blue_multi_elite = df['blueEliteMonsters'] > 1
red_multi_elite = df['redEliteMonsters'] > 1

# Rows to exclude, and their index labels.
dfMoreElite = df.loc[blue_multi_elite | red_multi_elite]
indexlist = dfMoreElite.index

# Everything that remains has at most one elite monster kill per team.
dfOneElite = df.drop(index=indexlist)
In [7]:
# Histogram of blue-side jungle minion kills in the sampled games.
# Labels and tick positions are set on the current Axes before plotting,
# and the histogram is then drawn onto that same Axes.
ax = plt.gca()
ax.set_xlabel('Blue Total Jungle Minions Slain')
ax.set_title("Histogram of Blue Jungle Minions Slain over 50 Games")
ax.set_xticks(range(0, 100, 10))
ax.set_yticks(range(0, 20, 1))
sns.histplot(data=dfsmaller, x='blueTotalJungleMinionsKilled', ax=ax)
plt.show()
In [8]:
# Histogram of red-side jungle minion kills in the sampled games.
# Labels and tick positions are set on the current Axes before plotting,
# and the histogram is then drawn onto that same Axes.
ax = plt.gca()
ax.set_xlabel('Red Total Jungle Minions Slain')
ax.set_title("Histogram of Red Jungle Minions Slain over 50 Games")
ax.set_xticks(range(0, 100, 10))
ax.set_yticks(range(0, 20, 1))
sns.histplot(data=dfsmaller, x='redTotalJungleMinionsKilled', ax=ax)
plt.show()
In [9]:
# Examining team-vs-team jungle minion kills in the sampled games,
# with each point coloured by the team that won.
plt.title("Scatterplot of Jungle Minions Slain and Wins")
plt.xlabel(xlabel='Blue Total Jungle Minions Slain')
plt.ylabel(ylabel='Red Total Jungle Minions Slain')
plt.grid()
# Map every hue level to its colour explicitly. A plain list of colours is
# handed out in the order the levels first appear in the data, so with a
# different sample 'Red' wins could end up drawn in blue (and vice versa).
sns.scatterplot(
    data=dfsmaller,
    x='blueTotalJungleMinionsKilled',
    y='redTotalJungleMinionsKilled',
    hue='TeamWon',
    palette={'Blue': '#0000FF', 'Red': '#FF0000'},
)
plt.show()
# We can see a lot of games are centered around ~50 minions killed for both teams.
In [10]:
# Summary statistics of jungle minions killed within the first 10 minutes:
# blue's kills in the sampled games with blueWins == 1, then red's kills in the
# sampled games with blueWins == 0.
for frame, column in (
    (dfsmallerwins, 'blueTotalJungleMinionsKilled'),
    (dfsmallerlosses, 'redTotalJungleMinionsKilled'),
):
    print(frame[column].describe())
count    29.000000
mean     54.068966
std       9.728203
min      29.000000
25%      49.000000
50%      55.000000
75%      60.000000
max      76.000000
Name: blueTotalJungleMinionsKilled, dtype: float64
count    21.000000
mean     51.142857
std       9.451379
min      34.000000
25%      44.000000
50%      49.000000
75%      57.000000
max      69.000000
Name: redTotalJungleMinionsKilled, dtype: float64
In [11]:
# 95% t-based confidence interval for the population mean of blue-side jungle
# minion kills, estimated from the sampled games that blue won.
blue_jungle_sample = dfsmallerwins['blueTotalJungleMinionsKilled']
samples, col = dfsmallerwins.shape
mean = blue_jungle_sample.mean()
std = blue_jungle_sample.std()
# Standard error of the mean: sample standard deviation over sqrt(n).
stderr = std / samples ** 0.5
# Student-t interval with n - 1 degrees of freedom, centred on the sample mean.
CI = st.t.interval(0.95, df=samples - 1, loc=mean, scale=stderr)
print('The average Jungle Minion Kill Score by 10 minutes is', round(mean,0))
print('The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately', round(std,0))
print(f'We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between \n {CI} by the first 10 minutes.')
The average Jungle Minion Kill Score by 10 minutes is 54.0
The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately 10.0
We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between 
 (50.368555047990505, 57.76937598649226) by the first 10 minutes.
In [12]:
# Examine outliers and spread of blue-side jungle minion kills over all blue wins.
sns.boxplot(data=dfwins, x='blueTotalJungleMinionsKilled', color='y')
plt.xlabel(xlabel='Blue Total Jungle Minions Slain')
plt.title('Blue Jungle Minions Slain Spread')
plt.show()

# Request the quartiles explicitly instead of unpacking describe() by position:
# the unpacking depends on the exact layout of describe()'s result and silently
# overwrote the notebook-level `mean` and `std` with values that were never used.
q1, q2, q3 = dfwins['blueTotalJungleMinionsKilled'].quantile([0.25, 0.5, 0.75])
iqr = q3 - q1
print('Q1=',q1)
print('Median=',q2)
print('Q3=',q3)
print('IQR =',iqr)

# Fences at 1.5 * IQR beyond the quartiles; values outside [lower, upper]
# are treated as outliers.
lower = q1 - 1.5*iqr
upper = q3 + 1.5*iqr
print('Lower Bound=',lower)
print('Upper Bound=',upper)
Q1= 44.0
Median= 52.0
Q3= 59.0
IQR = 15.0
Lower Bound= 21.5
Upper Bound= 81.5
In [13]:
# Build a copy of the blue-win games without outliers: a game is an outlier when
# its blue jungle minion count lies strictly below `lower` or strictly above `upper`.
blue_win_jungle = dfwins['blueTotalJungleMinionsKilled']
outlier_mask = (blue_win_jungle > upper) | (blue_win_jungle < lower)
dfwinsOutliers = dfwins.loc[outlier_mask]
indexlist = dfwinsOutliers.index
dfwinsNoOutliers = dfwins.drop(index=indexlist)
In [14]:
# Same 95% t-based confidence interval as for the 50-game sample, but computed
# over all blue-win games after the outliers have been removed.
blue_jungle_wins = dfwinsNoOutliers['blueTotalJungleMinionsKilled']
samples, col = dfwinsNoOutliers.shape
mean = blue_jungle_wins.mean()
std = blue_jungle_wins.std()
# Standard error of the mean: sample standard deviation over sqrt(n).
stderr = std / samples ** 0.5
# Student-t interval with n - 1 degrees of freedom, centred on the sample mean.
CI = st.t.interval(0.95, df=samples - 1, loc=mean, scale=stderr)
print('The size of winning games by blue team is', samples)
print('The average Jungle Minion Kill Score by 10 minutes is', round(mean,0))
print('The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately', round(std,0))
print(f'We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between \n {CI} by the first 10 minutes.')
The size of winning games by blue team is 4916
The average Jungle Minion Kill Score by 10 minutes is 52.0
The standard deviation of Jungle Minion Kill Score by 10 minutes is approximately 10.0
We are 95% confident that the average winning Diamond Jungler Jungle Minions Kill Score is between 
 (51.50050423620725, 52.05035011692537) by the first 10 minutes.
In [15]:
# Focus on Dragons killed by either team, over the games in dfOneElite.
blueonedragon = dfOneElite.loc[dfOneElite['blueDragons'] == 1]
blueonedragoncount = blueonedragon['TeamWon'].value_counts()

redonedragon = dfOneElite.loc[dfOneElite['redDragons'] == 1]
redonedragoncount = redonedragon['TeamWon'].value_counts()

samples, col = dfOneElite.shape
print('Total Games', samples )
print('Wins based off of blue getting dragon first, red did not \n', blueonedragoncount)
print('Wins based off of red getting dragon first, blue did not \n',redonedragoncount)

# Derive the pie fractions from the counts above instead of hard-coding numbers
# copied from a previous run, so the chart stays correct if the data changes.
# .get(..., 0) covers the case where a team never appears in a value count.
dragon_wins = blueonedragoncount.get('Blue', 0) + redonedragoncount.get('Red', 0)
dragon_losses = blueonedragoncount.get('Red', 0) + redonedragoncount.get('Blue', 0)
# Remaining games; assumes no game has both blueDragons == 1 and redDragons == 1
# (the previously hard-coded figures relied on the same assumption).
no_dragon = samples - dragon_wins - dragon_losses
data = [dragon_wins/samples, dragon_losses/samples, no_dragon/samples]

labels = ['Wins getting Dragon first','Losses getting Dragon First','Neither Team got dragon within 10 minutes']
plt.title('Wins based on Getting Dragon within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
Total Games 8439
Wins based off of blue getting dragon first, red did not 
 Blue    1770
Red     1096
Name: TeamWon, dtype: int64
Wins based off of red getting dragon first, blue did not 
 Red     2033
Blue    1318
Name: TeamWon, dtype: int64
In [16]:
# Win/loss split among only the games in which one team took the Dragon.
# The fractions are derived from the per-team value counts computed in the
# previous cell rather than hard-coded from an earlier run's printed output.
dragon_wins = blueonedragoncount.get('Blue', 0) + redonedragoncount.get('Red', 0)
dragon_losses = blueonedragoncount.get('Red', 0) + redonedragoncount.get('Blue', 0)
dragon_games = dragon_wins + dragon_losses
data = [dragon_wins/dragon_games, dragon_losses/dragon_games]

labels = ['Wins getting Dragon first','Losses getting Dragon first']
plt.title('Wins based on Getting Dragon within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
In [17]:
# Focus on Heralds killed by either team, over the games in dfOneElite.
blueoneherald = dfOneElite.loc[dfOneElite['blueHeralds'] == 1]
blueoneheraldcount = blueoneherald['TeamWon'].value_counts()

redoneherald = dfOneElite.loc[dfOneElite['redHeralds'] == 1]
redoneheraldcount = redoneherald['TeamWon'].value_counts()

samples, col = dfOneElite.shape
print('Total Games', samples )
print('Wins based off of blue getting Herald first, red did not \n', blueoneheraldcount)
print('Wins based off of red getting Herald first, blue did not \n',redoneheraldcount)

# Derive the pie fractions from the counts above instead of hard-coding numbers
# copied from a previous run, so the chart stays correct if the data changes.
# .get(..., 0) covers the case where a team never appears in a value count.
herald_wins = blueoneheraldcount.get('Blue', 0) + redoneheraldcount.get('Red', 0)
herald_losses = blueoneheraldcount.get('Red', 0) + redoneheraldcount.get('Blue', 0)
# Remaining games; assumes no game has both blueHeralds == 1 and redHeralds == 1
# (the previously hard-coded figures relied on the same assumption).
no_herald = samples - herald_wins - herald_losses
data = [herald_wins/samples, herald_losses/samples, no_herald/samples]

labels = ['Wins getting Herald first','Losses getting Herald first','Neither Team got Herald within 10 minutes']
plt.title('Wins based on Getting Herald within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()
Total Games 8439
Wins based off of blue getting Herald first, red did not 
 Blue    583
Red     564
Name: TeamWon, dtype: int64
Wins based off of red getting Herald first, blue did not 
 Red     447
Blue    404
Name: TeamWon, dtype: int64
In [18]:
# Win/loss split among only the games in which one team took the Herald.
# The fractions are derived from the per-team value counts computed in the
# previous cell rather than hard-coded from an earlier run's printed output.
herald_wins = blueoneheraldcount.get('Blue', 0) + redoneheraldcount.get('Red', 0)
herald_losses = blueoneheraldcount.get('Red', 0) + redoneheraldcount.get('Blue', 0)
herald_games = herald_wins + herald_losses
data = [herald_wins/herald_games, herald_losses/herald_games]

labels = ['Wins getting Herald first','Losses getting Herald first']
plt.title('Wins based on Getting Herald within 10 minutes')
plt.pie(x=data, labels=labels,autopct ='%0.0f%%')
plt.show()