The goal of this project is twofold: 1) predict if a player will become an all-star based on their rookie offensive stats and 2) determine clusters of rookies.
At the end of the day, this is a fairly limited analysis, though we can see some interesting patterns in the data.
options(warn=-1)
setwd("C:/Users/Micah/Desktop/applied_data_mining")
set.seed(19)
library(ggplot2)
library(lattice)
library(caret)
library(pROC)
library(plyr)
library(rpart)
library(rattle)
library(cluster)
library(data.table)
library(MASS)
library(colorRamps)
library(nFactors)
library(gplots)
library(RColorBrewer)
library(semPlot)
library(waffle)
library(extrafont)
font_import()
## Importing fonts may take a few minutes, depending on the number of fonts and the speed of the system.
## Continue? [y/n]
subset_to_rookie_year <- function(df){
# Define rookie season as first yearID in which player had more than 100 ABs.
df[, 'yearID'] <- sapply(df[, 'yearID'], as.numeric)
eligible_df <- df[ which(df$AB > 100), ]
rookie_df <- aggregate(eligible_df$yearID, by=list(eligible_df$playerID), min)
colnames(rookie_df) <- c('playerID', 'yearID')
df <- merge(df, rookie_df, by=c('playerID', 'yearID'))
df <- df[!duplicated(df$playerID),]
# For simplicity, remove small number of players with 100+ ABs for two teams
# in their rookie season.
df <- df[ which(df$AB > 100), ]
return(df)
}
# Only use the last few decades of players.
# Do not use players who are too recent - they may still become all-stars.
subset_to_between_1970_and_2010 <- function(df){
df <- df[ which(df$yearID >= 1970 & df$yearID <= 2010), ]
return(df)
}
count_all_star_appearances <- function(all_stars, batting){
all_star_temp <- all_stars[,c('playerID', 'yearID')]
all_star_temp$rookie_all_star_appearance <- 'yes'
batting <- merge(batting, all_star_temp, by=c('playerID', 'yearID'), all.x=TRUE)
batting$rookie_all_star_appearance[is.na(batting$rookie_all_star_appearance)] <- 'no'
batting$rookie_id <- 'yes'
all_stars_non_rookie <- merge(all_stars, batting, by=c('playerID', 'yearID'), all.x=TRUE)
all_stars_non_rookie$rookie_id[is.na(all_stars_non_rookie$rookie_id)] <- 'no'
all_stars_non_rookie <- all_stars_non_rookie[ which(all_stars_non_rookie$rookie_id == 'no'), ]
all_stars_non_rookie <- all_stars_non_rookie[c('playerID')]
all_stars_non_rookie <- as.data.frame(table(all_stars_non_rookie))
colnames(all_stars_non_rookie) <- c('playerID', 'all_star')
merged_df <- merge(batting, all_stars_non_rookie, by='playerID', all.x=TRUE)
merged_df$all_star[merged_df$all_star > 0] <- "yes"
merged_df$all_star[merged_df$all_star != 'yes'] <- "no"
merged_df$all_star[is.na(merged_df$all_star)] <- 'no'
return(merged_df)
}
create_name_to_id_mapping <- function(df){
df$playerName <- paste(df$nameFirst, ' ', df$nameLast)
df <- df[c('playerID', 'playerName')]
return(df)
}
calculate_slg_obp_obp_and_avg <- function(df){
df[is.na(df)] <- 0
df$avg <- df$H / df$AB
df$obp <- (df$H + df$BB + df$HBP) / (df$AB + df$BB + df$HBP + df$SF)
df$slg <- (df$H + (df$X2B + df$X3B + df$HR) + (df$X2B * 2) +
(df$X3B * 3 + df$HR * 4)) / df$AB
df[is.na(df)] <- 0
return(df)
}
select_columns_for_modeling <- function(df){
df <- subset(df, select=c(G, AB, R, H, X2B, X3B, HR, RBI, SB, BB, SO, avg, obp, slg,
all_star, playerID))
return(df)
}
drop_player_id <- function(df){
drop <- c('playerID')
df <- df[ , !(names(df) %in% drop)]
return(df)
}
count_factor_occurrences_by_target <- function(df, feature, target, title){
print(ggplot(df, aes_string(feature, fill = target)) +
geom_bar() + ggtitle(title))
}
make_histogram_by_target <- function(df, feature, target, title, bins){
print(ggplot(df, aes_string(feature, fill = target)) +
geom_histogram(binwidth = bins) + ggtitle(title))
}
make_parallel_coordinates <- function(df, feature, cuts){
c <- blue2red(cuts)
r <- cut(feature, cuts)
parcoord(df, col=c[as.numeric(r)])
}
make_scree_table_for_factor_analysis <- function(df){
nScree(df)
}
get_eigenvalues <- function(df){
eigen(cor(df))
}
build_factor_analysis_model <- function(df, n_factors){
fa <- factanal(df, factors = n_factors, lower = 0.01)
print(fa)
return(fa)
}
make_factor_analysis_heatmp <- function(fa){
heatmap.2(fa$loadings, col = brewer.pal(9, "Greens"), trace = "none",
key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
main = "Factor Loadings")
}
make_factor_analysis_sem_plot <- function(fa){
semPaths(fa, what = "est", residuals = FALSE, cut = 0.4,
posCol = c("white", "darkgreen"),
negCol = c("white", "red"),
edge.label.cex = 0.60, nCharNodes = 7)
}
train_random_forest <- function(train_df, target){
control <- trainControl(method="repeatedcv", number=3, repeats=3, classProbs=TRUE)
mtry <- c(sqrt(ncol(train_df)), log2(ncol(train_df)))
grid <- expand.grid(.mtry=mtry)
formula <- as.formula(paste(target, "~ ."))
model <- train(formula,
data=train_df,
preProcess=c("center", "scale"),
method="rf",
metric="ROC",
trControl=control,
tuneGrid=grid,
allowParallel=TRUE,
num.threads=4)
return(model)
}
train_log_reg <- function(train_df, target){
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
grid <- expand.grid(parameter=c(0.001, 0.01, 0.1, 1,10, 100))
formula <- as.formula(paste(target, "~ ."))
model <- train(formula,
data=train_df,
preProcess=c("center", "scale"),
method="glm",
family="binomial",
metric="ROC",
trControl=control,
tuneGrid=grid)
return(model)
}
train_decision_tree <- function(train_df, target){
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
grid <- expand.grid(.maxdepth=c(3, 5, 7, 10))
formula <- as.formula(paste(target, "~ ."))
model <- train(formula,
data=train_df,
preProcess=c("center", "scale"),
method="rpart2",
metric="ROC",
trControl=control,
tuneGrid=grid)
return(model)
}
train_gradient_boosting <- function(train_df, target){
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
grid <- expand.grid(interaction.depth = c(1, 3, 5),
n.trees = c(50, 100, 150),
shrinkage = 0.1,
n.minobsinnode = 20)
formula <- as.formula(paste(target, "~ ."))
model <- train(formula,
data=train_df,
preProcess=c("center", "scale"),
method="gbm",
metric="ROC",
verbose=F,
trControl=control,
tuneGrid=grid)
return(model)
}
plot_decision_tree <- function(df, target, depth){
formula <- as.formula(paste(target, "~ ."))
set.seed(19)
tree <- rpart(formula, method="class", maxdepth=depth, data=df)
printcp(tree)
print(tree)
fancyRpartPlot(tree)
}
plot_model <- function(model){
plot(model)
}
print_grid_search_results <- function(model){
model$bestTune
results <- model$results
results
}
print_confusion_matrix <- function(model, df, target){
predictions <- predict(model, df)
con_matrix <- confusionMatrix(predictions, target, positive = 'yes')
con_matrix
}
get_roc_auc <- function(model, df, target){
probabilities <- predict(model, df, type="prob")
ROC <- roc(predictor=probabilities$yes,
response=target)
print(ROC$auc)
plot(ROC, main="ROC")
return(ROC)
}
get_variable_importances <- function(model){
varImp(model)
}
scale_dataframe <- function(df){
df[, -c(3)] <- scale(df[, -c(3)])
df <- data.frame(df)
return(df)
}
plot_within_cluster_sum_of_squares <- function(df, title){
wss <- (nrow(df)-1) * sum(apply(df, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares", main = paste(title,' elbow plot'))
}
train_k_means_model <- function(df, k){
set.seed(19)
model <- kmeans(df, k, nstart=25)
return(model)
}
plot_k_means_model <- function(model, df, title){
clusplot(df, model$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0, main = paste(title,' PCA Plot of K-Means'))
}
create_hclust_and_plot <- function(df){
set.seed(19)
dm = dist(df,method="euclidean")
hclust_model <- hclust(dm, method="complete")
plot(hclust_model)
return(hclust_model)
}
summarize_clusters <- function(df){
cluster1 <- df[which(df$k_means.cluster=='1'),]
cluster2 <- df[which(df$k_means.cluster=='2'),]
cluster3 <- df[which(df$k_means.cluster=='3'),]
print('cluster 1 summary')
print(summary(cluster1))
print('cluster 2 summary')
print(summary(cluster2))
print('cluster 3 summary')
print(summary(cluster3))
}
all_star_df <- read.csv('data/AllstarFull.csv')
batting_df <- read.csv('data/Batting.csv')
people_df <- read.csv('data/People.csv')
batting_df <- subset_to_rookie_year(batting_df)
batting_df <- subset_to_between_1970_and_2010(batting_df)
batting_df <- count_all_star_appearances(all_star_df, batting_df)
batting_df <- calculate_slg_obp_obp_and_avg(batting_df)
batting_df <- select_columns_for_modeling(batting_df)
batting_df_copy <- batting_df
batting_df <- drop_player_id(batting_df)
agg_cols_for_hist <- c('G', 'H', 'X2B', 'HR', 'RBI', 'SB')
for (column in agg_cols_for_hist){
make_histogram_by_target(batting_df, column, 'all_star',
paste(column,' histogram by all star'), 10)
}
rate_cols_for_hist <- c('avg', 'obp', 'slg')
for (column in rate_cols_for_hist){
make_histogram_by_target(batting_df, column, 'all_star',
paste(column,' histogram by all star'), .1)
}
# home runs paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$HR, 20)
# obp paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$obp, 20)
# hits paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$H, 20)
# slg paralell coordinates
make_parallel_coordinates(batting_df[1:14], batting_df$slg, 20)
make_scree_table_for_factor_analysis(batting_df[1:14])
## noc naf nparallel nkaiser
## 1 3 1 3 3
get_eigenvalues(batting_df[1:14])
## $values
## [1] 8.249808403 1.854338281 1.395355460 0.685687678 0.554203569
## [6] 0.437636506 0.234931186 0.224893153 0.140682786 0.085785935
## [11] 0.054002627 0.043706253 0.034306568 0.004661595
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.3004331 0.22552342 0.002991965 -0.214291356 -0.20908291
## [2,] -0.3251658 0.20256023 0.008520668 -0.087614482 -0.21942073
## [3,] -0.3322291 0.09149677 0.072664606 0.014272143 0.02906008
## [4,] -0.3320129 0.09386681 0.082728669 -0.077402592 -0.26707296
## [5,] -0.3090893 -0.01617361 -0.089239947 -0.004405929 -0.30455914
## [6,] -0.2141106 0.12724805 0.348745639 0.566515090 -0.07910700
## [7,] -0.2532477 -0.16026550 -0.460780197 0.184045072 0.23455915
## [8,] -0.3157035 -0.05599209 -0.248637445 -0.002842869 -0.09575287
## [9,] -0.1645273 0.25745151 0.496860370 0.250937159 0.47108337
## [10,] -0.2834618 0.07026278 -0.017809804 -0.433516479 0.44767756
## [11,] -0.2794128 0.15070538 -0.265222056 0.115119859 0.26886264
## [12,] -0.1741356 -0.48821838 0.374024681 -0.051084340 -0.30215863
## [13,] -0.1802661 -0.48063767 0.308843148 -0.371532047 0.27393383
## [14,] -0.1729315 -0.53136815 -0.176803007 0.424928679 0.10864709
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.02260710 -0.132918557 0.073247444 0.82866115 0.117395503
## [2,] 0.04654594 -0.049925842 0.008121485 -0.06269410 -0.115759642
## [3,] 0.08216492 -0.005373923 0.141127312 -0.14317407 -0.661216214
## [4,] 0.08859406 -0.083719295 0.025905815 -0.16922713 -0.127925992
## [5,] 0.14337766 0.721473104 -0.123363058 -0.15480659 0.130758741
## [6,] -0.66601937 -0.056150991 0.079160662 -0.05721246 0.088637853
## [7,] 0.12030128 -0.345758070 0.306311390 -0.04117933 -0.146299231
## [8,] 0.05358724 -0.209325916 0.228804563 -0.28719137 0.643289987
## [9,] 0.55814528 0.055640613 0.084156029 0.04418578 0.216009589
## [10,] -0.34447179 0.225734361 0.188871195 -0.08344856 0.005853165
## [11,] -0.05046551 -0.149869540 -0.843018472 -0.00437432 0.017853242
## [12,] 0.18113704 -0.322380880 -0.199041868 -0.07256980 -0.018510412
## [13,] -0.18403031 -0.015351012 -0.095160330 0.01798346 0.075267759
## [14,] 0.04275459 0.324163645 0.064883823 0.36649091 -0.077749979
## [,11] [,12] [,13] [,14]
## [1,] 0.156817419 0.01008885 0.134504983 -0.0494372667
## [2,] -0.344207453 -0.14592765 -0.451773086 0.6568209315
## [3,] 0.361156603 0.48923254 0.128071769 0.0658505782
## [4,] -0.240386079 -0.11481465 -0.349081662 -0.7371808459
## [5,] 0.273125021 -0.28783110 0.217895508 0.0343269306
## [6,] 0.082353918 -0.11969163 0.061609080 0.0104300563
## [7,] 0.150319319 -0.56298225 0.135453728 0.0114920150
## [8,] 0.116276488 0.46362841 -0.029821344 0.0386872680
## [9,] -0.023818168 -0.06321212 -0.003121233 0.0008024401
## [10,] -0.448366476 0.02153507 0.332770603 -0.0390386657
## [11,] 0.001338981 0.07298571 0.023370218 -0.0378598948
## [12,] -0.253233594 -0.04068682 0.489347379 0.1015157019
## [13,] 0.429695122 -0.13033954 -0.419767278 0.0324633607
## [14,] -0.317565203 0.26185694 -0.205134930 -0.0337207140
batting_factor_analysis <- build_factor_analysis_model(batting_df[1:14], 3)
##
## Call:
## factanal(x = df, factors = n_factors, lower = 0.01)
##
## Uniquenesses:
## G AB R H X2B X3B HR RBI SB BB SO avg
## 0.162 0.010 0.110 0.010 0.218 0.612 0.059 0.089 0.675 0.400 0.282 0.010
## obp slg
## 0.338 0.232
##
## Loadings:
## Factor1 Factor2 Factor3
## G 0.860 0.312
## AB 0.925 0.356
## R 0.813 0.426 0.220
## H 0.898 0.337 0.269
## X2B 0.714 0.461 0.245
## X3B 0.593 0.178
## HR 0.258 0.923 0.151
## RBI 0.605 0.701 0.231
## SB 0.556
## BB 0.642 0.428
## SO 0.588 0.609
## avg 0.245 0.963
## obp 0.177 0.161 0.777
## slg 0.647 0.588
##
## Factor1 Factor2 Factor3
## SS loadings 5.518 3.088 2.192
## Proportion Var 0.394 0.221 0.157
## Cumulative Var 0.394 0.615 0.771
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 6744.37 on 52 degrees of freedom.
## The p-value is 0
make_factor_analysis_heatmp(batting_factor_analysis)
make_factor_analysis_sem_plot(batting_factor_analysis)
# This is not perfectly to scale but close enough to be useful.
waffle(c(all_star = 52, non_all_star = 233), rows = 19,
title = "Target Distribution")
partition <- createDataPartition(batting_df$all_star, p = 0.7, list=FALSE)
train_df <- batting_df[partition, ]
test_df <- batting_df[-partition, ]
decision_tree <- train_decision_tree(train_df, 'all_star')
plot_model(decision_tree)
print_grid_search_results(decision_tree)
## maxdepth Accuracy Kappa AccuracySD KappaSD
## 1 3 0.7775736 0.1498455 0.02189015 0.07485220
## 2 5 0.7755173 0.1535988 0.01721050 0.06187979
## 3 7 0.7724271 0.1520060 0.01902593 0.07211831
## 4 10 0.7742789 0.1618712 0.01673774 0.06819085
print_confusion_matrix(decision_tree, test_df, test_df$all_star)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 526 141
## yes 10 16
##
## Accuracy : 0.7821
## 95% CI : (0.7495, 0.8123)
## No Information Rate : 0.7734
## P-Value [Acc > NIR] : 0.3111
##
## Kappa : 0.1181
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.10191
## Specificity : 0.98134
## Pos Pred Value : 0.61538
## Neg Pred Value : 0.78861
## Prevalence : 0.22655
## Detection Rate : 0.02309
## Detection Prevalence : 0.03752
## Balanced Accuracy : 0.54163
##
## 'Positive' Class : yes
##
tree_roc <- get_roc_auc(decision_tree, test_df, test_df$all_star)
## Area under the curve: 0.5845
plot_decision_tree(train_df, 'all_star', 3)
##
## Classification tree:
## rpart(formula = formula, data = df, method = "class", maxdepth = depth)
##
## Variables actually used in tree construction:
## [1] avg G H
##
## Root node error: 367/1620 = 0.22654
##
## n= 1620
##
## CP nsplit rel error xerror xstd
## 1 0.031789 0 1.00000 1.00000 0.045908
## 2 0.010000 3 0.90463 0.93188 0.044756
## n= 1620
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1620 367 no (0.7734568 0.2265432)
## 2) H< 116.5 1396 256 no (0.8166189 0.1833811) *
## 3) H>=116.5 224 111 no (0.5044643 0.4955357)
## 6) G< 156.5 204 93 no (0.5441176 0.4558824)
## 12) avg< 0.2941119 155 59 no (0.6193548 0.3806452) *
## 13) avg>=0.2941119 49 15 yes (0.3061224 0.6938776) *
## 7) G>=156.5 20 2 yes (0.1000000 0.9000000) *
random_forest <- train_random_forest(train_df, 'all_star')
plot_model(random_forest)
print_grid_search_results(random_forest)
## mtry Accuracy Kappa AccuracySD KappaSD
## 1 3.872983 0.7781886 0.1775086 0.01084145 0.04207982
## 2 3.906891 0.7792178 0.1781382 0.01009534 0.03001527
print_confusion_matrix(random_forest, test_df, test_df$all_star)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 518 133
## yes 18 24
##
## Accuracy : 0.7821
## 95% CI : (0.7495, 0.8123)
## No Information Rate : 0.7734
## P-Value [Acc > NIR] : 0.3111
##
## Kappa : 0.161
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.15287
## Specificity : 0.96642
## Pos Pred Value : 0.57143
## Neg Pred Value : 0.79570
## Prevalence : 0.22655
## Detection Rate : 0.03463
## Detection Prevalence : 0.06061
## Balanced Accuracy : 0.55964
##
## 'Positive' Class : yes
##
forest_roc <- get_roc_auc(random_forest, test_df, test_df$all_star)
## Area under the curve: 0.6496
get_variable_importances(random_forest)
## rf variable importance
##
## Overall
## G 100.00
## slg 97.65
## H 88.13
## AB 86.04
## RBI 80.95
## R 80.64
## obp 80.56
## avg 79.19
## SO 55.91
## BB 49.14
## X2B 37.37
## SB 28.71
## HR 23.32
## X3B 0.00
gradient_boosting <- train_gradient_boosting(train_df, 'all_star')
plot_model(gradient_boosting)
print_grid_search_results(gradient_boosting)
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 1 0.1 1 20 50 0.7866273 0.1608468
## 4 0.1 3 20 50 0.7847653 0.1833904
## 7 0.1 5 20 50 0.7816929 0.1882807
## 2 0.1 1 20 100 0.7862145 0.1773515
## 5 0.1 3 20 100 0.7835219 0.1997769
## 8 0.1 5 20 100 0.7738649 0.1891862
## 3 0.1 1 20 150 0.7866260 0.1842003
## 6 0.1 3 20 150 0.7773464 0.2004193
## 9 0.1 5 20 150 0.7693330 0.1961379
## AccuracySD KappaSD
## 1 0.01357915 0.06110472
## 4 0.01774190 0.06735603
## 7 0.01927243 0.07705919
## 2 0.01423996 0.06377382
## 5 0.02021889 0.07906791
## 8 0.02398342 0.09344342
## 3 0.01591576 0.06131095
## 6 0.02132703 0.07398816
## 9 0.02550751 0.08945573
print_confusion_matrix(gradient_boosting, test_df, test_df$all_star)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 522 139
## yes 14 18
##
## Accuracy : 0.7792
## 95% CI : (0.7465, 0.8096)
## No Information Rate : 0.7734
## P-Value [Acc > NIR] : 0.3782
##
## Kappa : 0.1232
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.11465
## Specificity : 0.97388
## Pos Pred Value : 0.56250
## Neg Pred Value : 0.78971
## Prevalence : 0.22655
## Detection Rate : 0.02597
## Detection Prevalence : 0.04618
## Balanced Accuracy : 0.54427
##
## 'Positive' Class : yes
##
gb_roc <- get_roc_auc(gradient_boosting, test_df, test_df$all_star)
## Area under the curve: 0.63
get_variable_importances(gradient_boosting)
## gbm variable importance
##
## Overall
## R 100.00
## slg 89.54
## RBI 84.23
## H 74.38
## AB 45.15
## G 38.65
## HR 30.22
## avg 25.09
## obp 20.03
## X2B 11.64
## SO 10.93
## SB 10.63
## X3B 0.00
## BB 0.00
log_reg <- train_log_reg(train_df, 'all_star')
print_grid_search_results(log_reg)
## parameter Accuracy Kappa AccuracySD KappaSD
## 1 1e-03 0.7753127 0.154482 0.01962346 0.06788572
## 2 1e-02 0.7753127 0.154482 0.01962346 0.06788572
## 3 1e-01 0.7753127 0.154482 0.01962346 0.06788572
## 4 1e+00 0.7753127 0.154482 0.01962346 0.06788572
## 5 1e+01 0.7753127 0.154482 0.01962346 0.06788572
## 6 1e+02 0.7753127 0.154482 0.01962346 0.06788572
print_confusion_matrix(log_reg, test_df, test_df$all_star)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 517 132
## yes 19 25
##
## Accuracy : 0.7821
## 95% CI : (0.7495, 0.8123)
## No Information Rate : 0.7734
## P-Value [Acc > NIR] : 0.3111
##
## Kappa : 0.166
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.15924
## Specificity : 0.96455
## Pos Pred Value : 0.56818
## Neg Pred Value : 0.79661
## Prevalence : 0.22655
## Detection Rate : 0.03608
## Detection Prevalence : 0.06349
## Balanced Accuracy : 0.56189
##
## 'Positive' Class : yes
##
lr_roc <- get_roc_auc(log_reg, test_df, test_df$all_star)
## Area under the curve: 0.6762
rownames(batting_df_copy) <- batting_df_copy$playerID
batting_df_copy <- subset(batting_df_copy, select=-c(playerID, all_star))
batting_df_scaled <- scale_dataframe(batting_df_copy)
plot_within_cluster_sum_of_squares(batting_df_scaled, 'rookie batting data')
k_means <- train_k_means_model(batting_df_scaled, 3)
plot_k_means_model(k_means, batting_df_scaled, 'rookie batting')
batting_df_copy <- data.frame(batting_df_copy, k_means$cluster)
summarize_clusters(batting_df_copy)
## [1] "cluster 1 summary"
## G AB R H
## Min. : 86.0 Min. :280.0 Min. : 57.00 Min. : 73.0
## 1st Qu.:124.0 1st Qu.:427.8 1st Qu.: 62.00 1st Qu.:116.0
## Median :138.0 Median :485.0 Median : 69.00 Median :134.0
## Mean :135.2 Mean :486.7 Mean : 71.96 Mean :134.6
## 3rd Qu.:150.0 3rd Qu.:545.8 3rd Qu.: 78.75 3rd Qu.:150.0
## Max. :163.0 Max. :701.0 Max. :127.00 Max. :242.0
## X2B X3B HR RBI
## Min. : 8.00 Min. : 0.000 Min. : 0.00 Min. : 12.00
## 1st Qu.:18.00 1st Qu.: 2.000 1st Qu.: 6.00 1st Qu.: 45.00
## Median :24.00 Median : 4.000 Median :13.00 Median : 57.00
## Mean :23.94 Mean : 4.341 Mean :13.35 Mean : 59.44
## 3rd Qu.:29.00 3rd Qu.: 6.000 3rd Qu.:19.00 3rd Qu.: 73.00
## Max. :47.00 Max. :19.000 Max. :49.00 Max. :130.00
## SB BB SO avg
## Min. : 0.00 Min. : 10.00 Min. : 21.00 Min. :0.2186
## 1st Qu.: 4.00 1st Qu.: 34.00 1st Qu.: 65.00 1st Qu.:0.2596
## Median : 11.00 Median : 44.50 Median : 83.00 Median :0.2772
## Mean : 14.87 Mean : 46.14 Mean : 85.72 Mean :0.2766
## 3rd Qu.: 21.00 3rd Qu.: 57.00 3rd Qu.:105.00 3rd Qu.:0.2914
## Max. :110.00 Max. :105.00 Max. :185.00 Max. :0.3509
## obp slg k_means.cluster
## Min. :0.2684 Min. :0.2933 Min. :1
## 1st Qu.:0.3235 1st Qu.:0.5288 1st Qu.:1
## Median :0.3417 Median :0.6003 Median :1
## Mean :0.3426 Mean :0.5997 Mean :1
## 3rd Qu.:0.3603 3rd Qu.:0.6716 3rd Qu.:1
## Max. :0.4101 Max. :0.9268 Max. :1
## [1] "cluster 2 summary"
## G AB R H
## Min. : 40.0 Min. :134.0 Min. :30.00 Min. : 36.00
## 1st Qu.: 89.0 1st Qu.:260.0 1st Qu.:34.00 1st Qu.: 67.00
## Median :105.0 Median :317.5 Median :41.00 Median : 84.00
## Mean :104.5 Mean :320.6 Mean :41.01 Mean : 83.87
## 3rd Qu.:122.0 3rd Qu.:377.2 3rd Qu.:48.00 3rd Qu.: 99.25
## Max. :160.0 Max. :547.0 Max. :56.00 Max. :151.00
## X2B X3B HR RBI
## Min. : 2.0 Min. : 0.000 Min. : 0.000 Min. : 8.00
## 1st Qu.:11.0 1st Qu.: 1.000 1st Qu.: 3.000 1st Qu.:26.00
## Median :15.0 Median : 2.000 Median : 6.000 Median :34.00
## Mean :14.8 Mean : 2.531 Mean : 6.701 Mean :35.26
## 3rd Qu.:19.0 3rd Qu.: 4.000 3rd Qu.:10.000 3rd Qu.:43.00
## Max. :37.0 Max. :12.000 Max. :23.000 Max. :80.00
## SB BB SO avg
## Min. : 0.000 Min. : 5.00 Min. : 12.0 Min. :0.1866
## 1st Qu.: 2.000 1st Qu.:19.00 1st Qu.: 39.0 1st Qu.:0.2436
## Median : 4.000 Median :26.00 Median : 54.0 Median :0.2606
## Mean : 6.681 Mean :27.25 Mean : 56.6 Mean :0.2626
## 3rd Qu.: 9.000 3rd Qu.:34.00 3rd Qu.: 72.0 3rd Qu.:0.2816
## Max. :50.000 Max. :94.00 Max. :158.0 Max. :0.3491
## obp slg k_means.cluster
## Min. :0.2176 Min. :0.2946 Min. :2
## 1st Qu.:0.2999 1st Qu.:0.4665 1st Qu.:2
## Median :0.3242 Median :0.5365 Median :2
## Mean :0.3239 Mean :0.5453 Mean :2
## 3rd Qu.:0.3454 3rd Qu.:0.6178 3rd Qu.:2
## Max. :0.4542 Max. :0.9486 Max. :2
## [1] "cluster 3 summary"
## G AB R H
## Min. : 26.00 Min. :101.0 Min. : 2.00 Min. : 6.00
## 1st Qu.: 47.00 1st Qu.:121.0 1st Qu.:13.00 1st Qu.:29.00
## Median : 59.00 Median :150.0 Median :17.00 Median :36.00
## Mean : 62.82 Mean :163.4 Mean :17.79 Mean :39.48
## 3rd Qu.: 77.00 3rd Qu.:195.0 3rd Qu.:23.00 3rd Qu.:48.00
## Max. :141.00 Max. :410.0 Max. :29.00 Max. :97.00
## X2B X3B HR RBI
## Min. : 0.000 Min. :0.0000 Min. : 0.000 Min. : 2.00
## 1st Qu.: 4.000 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:11.00
## Median : 7.000 Median :1.0000 Median : 2.000 Median :15.00
## Mean : 6.939 Mean :0.9702 Mean : 3.036 Mean :16.88
## 3rd Qu.: 9.000 3rd Qu.:1.0000 3rd Qu.: 4.000 3rd Qu.:22.00
## Max. :21.000 Max. :7.0000 Max. :18.000 Max. :59.00
## SB BB SO avg
## Min. : 0.000 Min. : 0.00 Min. : 5.00 Min. :0.05825
## 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.:21.00 1st Qu.:0.21739
## Median : 1.000 Median :12.00 Median :29.00 Median :0.23973
## Mean : 2.385 Mean :13.39 Mean :31.22 Mean :0.24055
## 3rd Qu.: 3.000 3rd Qu.:17.00 3rd Qu.:39.00 3rd Qu.:0.26431
## Max. :27.000 Max. :44.00 Max. :99.00 Max. :0.36216
## obp slg k_means.cluster
## Min. :0.1019 Min. :0.05825 Min. :3
## 1st Qu.:0.2750 1st Qu.:0.40187 1st Qu.:3
## Median :0.3007 Median :0.47964 Median :3
## Mean :0.3005 Mean :0.48697 Mean :3
## 3rd Qu.:0.3263 3rd Qu.:0.55927 3rd Qu.:3
## Max. :0.4348 Max. :0.94215 Max. :3
batting_df_copy$k_means.cluster <- as.factor(batting_df_copy$k_means.cluster)
agg_summary_cols <- c('G', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI', 'SB')
for (column in agg_summary_cols){
make_histogram_by_target(batting_df_copy, column, 'k_means.cluster',
paste(column,' histogram by cluster'), 10)
}
rate_summary_cols <- c('avg', 'obp', 'slg')
for (column in rate_summary_cols){
make_histogram_by_target(batting_df_copy, column, 'k_means.cluster',
paste(column,' histogram by cluster'), .1)
}
hclust_model <- create_hclust_and_plot(batting_df_scaled)
plot(cut(as.dendrogram(hclust_model), h=8)$lower[[4]])
plot(cut(as.dendrogram(hclust_model), h=6)$lower[[15]])