library(tidyverse)
ConstructorsDF<-read_rds("https://strath-my.sharepoint.com/:u:/g/personal/xanne_janssen_strath_ac_uk/Ea1fQLpdL4JImLeFNwn4ChkBBz2BJcEEBOhWYM5-m7bUHw?download=1")
DriversDF<-read_rds("https://strath-my.sharepoint.com/:u:/g/personal/xanne_janssen_strath_ac_uk/EVgCPbwNJKlAuG_hpBvxLpQBShiU1q16eWuRS19SFhj7VA?download=1")
RacesDF<-read_rds("https://strath-my.sharepoint.com/:u:/g/personal/xanne_janssen_strath_ac_uk/EaogXHpyrdVIm2bFiPiVpbQBl01YxYt9YHvunMfrCQzWiw?download=1")12 Practical 5a: Exploratory data analysis in R
During this demonstration we will recap conducting exploratory analysis in R. We will use the cleaned data files we used during the previous demonstration. If you want to follow along, the files can be found here.
First we will open the constructors, drivers, and race data.
12.1 Exploratory data analysis
Let’s first check the average age of the drivers. To do this we will need to create an age variable and make sure each drivers age is only taken into account once (remember each drivers has multiple row entries). As we want to obtain the age of the driver based on the race date we will need to conduct another merge of dataframes. So let’s start with merging the DriversDF with the RacesDF and calculating the age variable.
# join the data using raceId as the unique identifier
DriversRacesDF <- merge(DriversDF, RacesDF, by= c("raceId"), all=TRUE)
#calculate age
DriversRacesDF <- DriversRacesDF %>%
mutate(Age= time_length((date-dob),"years"))Now we have the age of the drivers we can calculate the average age per year for each driver, and use this to create a line graph showcasing the change in F1 drivers from 1958 until 2024.
#calculate average age by race year and driver (keep only one value for age per year/driver to avoid double counting)
DriversRacesDF <- DriversRacesDF %>%
mutate(DriverName=paste(FirstName, LastName)) %>%
group_by(RaceYear,driverId) %>%
mutate(AverageAge = if_else(Age==max(Age,na.rm=TRUE),mean(Age, na.rm=TRUE), NA))
#create linegraph. Note how I use stat_summary, if you do not want to use stat_summary you would need to create a dataframe with the average age per year first (i.e. group_by(year))
LineGraphAge <- DriversRacesDF %>%
distinct(driverId,AverageAge,.keep_all=TRUE) %>%
filter(RaceYear>=1958)%>%
ggplot(aes(RaceYear, AverageAge))+
stat_summary(fun = mean, geom = "line", colour = "blue", size = 1)
LineGraphAge
From the graph above we can see there has been a substantial decline in the average age of drivers in F1.
Next let’s look at how many different riders have won the championship, the average age when they won, as well as the youngest and oldest winners.
To do this we first need to identify the champions of each year. We will do this by looking at the their position at the end of the season. For this we need to find out how many rounds each year had, as we can then use that to assign the drivers an end of season position.
# Get drivers position at end of the year
DriversRacesDF <- DriversRacesDF %>%
group_by(RaceYear) %>%
mutate(DriversFinalPosition = ifelse(RaceRound == max(RaceRound, na.rm = TRUE), DriversTotalPosition, NA))%>%
ungroup()
library(flextable)
# Create summary stats for champions from 1958 onwards
SummaryStatsDF <- DriversRacesDF %>%
filter(DriversFinalPosition == 1 & RaceYear >= 1958) %>%
summarise(
UniqueChampions = n_distinct(DriverName),
Age_avg = mean(AverageAge, na.rm = TRUE),
Age_min = min(AverageAge, na.rm = TRUE),
Age_max = max(AverageAge, na.rm = TRUE),
Wins_avg = mean(DriverWins, na.rm=TRUE),
Wins_min = min(DriverWins, na.rm = TRUE),
Wins_max = max(DriverWins, na.rm = TRUE)
)
# Create a table in right shape for plotting
TableDF <- tibble(
Variable = c("Unique Champions","Age (years)", "Wins"),
Total = c(SummaryStatsDF$UniqueChampions, NA,NA),
Average = c(NA,SummaryStatsDF$Age_avg, SummaryStatsDF$Wins_avg),
Min = c(NA,SummaryStatsDF$Age_min, SummaryStatsDF$Wins_min),
Max = c(NA,SummaryStatsDF$Age_max, SummaryStatsDF$Wins_max)
)
# Create the flextable
Table1 <- flextable(TableDF) %>%
set_header_labels(
Variable = "",
Average = "Average",
Min = "Minimum",
Max = "Maximum") %>%
theme_vanilla() %>%
align(align = "center", part = "all") %>%
valign(valign = "center", part = "header") %>%
autofit()
# Print the flextable
Table1Total | Average | Minimum | Maximum | |
|---|---|---|---|---|
Unique Champions | 31 | |||
Age (years) | 30.662744 | 23.03066 | 40.33463 | |
Wins | 6.590909 | 1.00000 | 19.00000 |
If I wanted to create a table with the average drivers age per year and standard deviation of age per year I would use a similar code as above, except I would group by RaceYear and not filter for just the winner (as that would give me only one entry per year which makes calculating an average and/or standard deviation redundant).
TableDFPerYear<-DriversRacesDF %>%
filter(RaceYear >= 1958) %>% #filter relevant years
group_by(RaceYear)%>%
reframe(UniqueDrivers= n_distinct(DriverName),
Age_avg=mean(AverageAge, na.rm=TRUE),
Age_std=sd(AverageAge, na.rm=TRUE),
Age_min= min(AverageAge, na.rm=TRUE),
Age_max= max(AverageAge, na.rm=TRUE),
Wins_min= min(DriverWins, na.rm=TRUE),
Wins_max= max(DriverWins, na.rm=TRUE))
flextable(TableDFPerYear) #Create basic TableRaceYear | UniqueDrivers | Age_avg | Age_std | Age_min | Age_max | Wins_min | Wins_max |
|---|---|---|---|---|---|---|---|
1,958 | 87 | 33.74429 | 6.379648 | 21.01643 | 58.95140 | 0 | 4 |
1,959 | 88 | 32.98062 | 5.916009 | 21.42460 | 46.48049 | 0 | 2 |
1,960 | 91 | 33.77181 | 6.067533 | 22.62955 | 47.47844 | 0 | 5 |
1,961 | 62 | 31.55727 | 6.203520 | 19.60849 | 48.52841 | 0 | 2 |
1,962 | 61 | 31.28101 | 6.152770 | 20.47669 | 52.49464 | 0 | 4 |
1,963 | 62 | 31.94821 | 6.831421 | 18.23135 | 53.44175 | 0 | 7 |
1,964 | 41 | 32.12268 | 6.613528 | 21.02204 | 47.62656 | 0 | 3 |
1,965 | 54 | 31.25979 | 5.197911 | 22.08038 | 44.58700 | 0 | 6 |
1,966 | 33 | 32.04609 | 4.227573 | 23.08517 | 40.33463 | 0 | 4 |
1,967 | 44 | 31.67535 | 5.961751 | 22.69624 | 45.93634 | 0 | 4 |
1,968 | 43 | 31.88735 | 5.332453 | 23.52704 | 46.88364 | 0 | 3 |
1,969 | 30 | 32.52852 | 6.329624 | 23.91513 | 47.89621 | 0 | 6 |
1,970 | 43 | 31.90064 | 5.473827 | 23.72856 | 45.60185 | 0 | 5 |
1,971 | 50 | 31.93036 | 5.076795 | 22.54689 | 46.58602 | 0 | 6 |
1,972 | 42 | 31.66242 | 4.821589 | 22.69131 | 47.58634 | 0 | 5 |
1,973 | 42 | 31.24877 | 4.337586 | 23.44274 | 44.42482 | 0 | 5 |
1,974 | 62 | 31.67150 | 4.266744 | 23.45225 | 45.32768 | 0 | 3 |
1,975 | 52 | 30.66415 | 3.984501 | 23.27845 | 46.29940 | 0 | 5 |
1,976 | 54 | 30.49249 | 3.245395 | 23.33949 | 38.63860 | 0 | 6 |
1,977 | 61 | 30.61654 | 4.017114 | 21.52374 | 39.61284 | 0 | 4 |
1,978 | 46 | 30.74011 | 4.909500 | 20.42659 | 40.73466 | 0 | 6 |
1,979 | 36 | 30.98851 | 4.763893 | 21.21725 | 41.87086 | 0 | 4 |
1,980 | 41 | 29.86114 | 5.526310 | 19.47502 | 42.85626 | 0 | 5 |
1,981 | 39 | 30.00795 | 4.825024 | 22.08661 | 41.33917 | 0 | 3 |
1,982 | 40 | 30.14940 | 5.017097 | 23.05014 | 42.34868 | 0 | 2 |
1,983 | 35 | 30.00776 | 4.336568 | 22.20744 | 39.59827 | 0 | 4 |
1,984 | 35 | 29.35463 | 4.038124 | 23.31902 | 40.61995 | 0 | 7 |
1,985 | 36 | 30.83146 | 4.513362 | 22.40840 | 41.67163 | 0 | 5 |
1,986 | 32 | 30.92709 | 4.653749 | 22.53936 | 42.63364 | 0 | 5 |
1,987 | 32 | 29.84104 | 3.777579 | 23.37183 | 39.09651 | 0 | 6 |
1,988 | 36 | 30.44227 | 4.314561 | 24.01215 | 40.07775 | 0 | 8 |
1,989 | 47 | 29.87936 | 4.180101 | 23.69952 | 41.05014 | 0 | 6 |
1,990 | 40 | 30.10647 | 4.192300 | 22.51677 | 37.92266 | 0 | 6 |
1,991 | 41 | 29.57319 | 4.449363 | 22.73192 | 38.91205 | 0 | 7 |
1,992 | 32 | 30.09895 | 4.330939 | 21.53572 | 38.91239 | 0 | 9 |
1,993 | 31 | 30.67537 | 5.522787 | 21.16222 | 39.26215 | 0 | 7 |
1,994 | 46 | 29.71650 | 4.507110 | 22.16359 | 41.07899 | 0 | 8 |
1,995 | 34 | 29.67954 | 4.675019 | 22.32535 | 42.00338 | 0 | 9 |
1,996 | 23 | 29.47950 | 4.569036 | 23.49918 | 38.55825 | 0 | 8 |
1,997 | 26 | 28.32388 | 4.596673 | 21.62811 | 37.86689 | 0 | 7 |
1,998 | 23 | 28.32605 | 4.176903 | 20.24978 | 37.78285 | 0 | 8 |
1,999 | 23 | 29.55745 | 4.081166 | 23.42368 | 38.86242 | 0 | 5 |
2,000 | 23 | 28.74339 | 4.181171 | 20.47091 | 36.11499 | 0 | 9 |
2,001 | 26 | 27.95307 | 4.568650 | 19.90079 | 37.03153 | 0 | 9 |
2,002 | 22 | 28.94406 | 4.702333 | 21.17882 | 36.61489 | 0 | 11 |
2,003 | 24 | 28.39765 | 4.189283 | 21.89733 | 36.85969 | 0 | 6 |
2,004 | 25 | 27.94991 | 4.441303 | 21.39493 | 37.82752 | 0 | 13 |
2,005 | 27 | 28.59809 | 3.846151 | 22.39706 | 36.49152 | 0 | 7 |
2,006 | 27 | 28.83353 | 4.532852 | 21.00662 | 37.48574 | 0 | 7 |
2,007 | 25 | 27.86791 | 4.483428 | 20.13216 | 36.35044 | 0 | 6 |
2,008 | 22 | 28.36342 | 4.646796 | 21.10609 | 37.31465 | 0 | 6 |
2,009 | 25 | 27.97156 | 5.196909 | 19.48905 | 38.66667 | 0 | 6 |
2,010 | 27 | 28.68005 | 5.352601 | 20.30923 | 41.52484 | 0 | 5 |
2,011 | 28 | 29.37744 | 5.718460 | 21.34313 | 42.55874 | 0 | 11 |
2,012 | 25 | 29.17600 | 5.528946 | 22.25955 | 43.56550 | 0 | 5 |
2,013 | 23 | 27.71162 | 4.205561 | 21.97846 | 36.91617 | 0 | 13 |
2,014 | 24 | 27.37823 | 4.241018 | 20.23473 | 34.75903 | 0 | 11 |
2,015 | 22 | 27.10912 | 4.942806 | 17.81692 | 35.77175 | 0 | 10 |
2,016 | 24 | 27.15609 | 5.155000 | 18.82181 | 36.77664 | 0 | 10 |
2,017 | 25 | 27.66884 | 5.505372 | 18.74483 | 37.77837 | 0 | 9 |
2,018 | 20 | 27.56199 | 5.397710 | 19.73899 | 38.77253 | 0 | 11 |
2,019 | 20 | 27.53091 | 5.615463 | 19.70340 | 39.77732 | 0 | 11 |
2,020 | 23 | 27.68901 | 5.178236 | 20.85646 | 40.93039 | 0 | 11 |
2,021 | 21 | 28.47606 | 6.387324 | 21.25344 | 41.82017 | 0 | 10 |
2,022 | 22 | 28.50541 | 5.320747 | 22.19513 | 40.97953 | 0 | 15 |
2,023 | 22 | 28.51844 | 5.624877 | 21.66516 | 41.99527 | 0 | 19 |
2,024 | 22 | 29.12940 | 6.002553 | 19.01661 | 42.77550 | 0 | 7 |
# Play around with specifications
TableFigPerYear <- flextable(TableDFPerYear) %>%
colformat_double() %>%
separate_header() %>%
theme_vanilla() %>%
align(align = "center", part = "all") %>%
valign(valign = "center", part = "header") %>%
autofit()
TableFigPerYearRaceYear | UniqueDrivers | Age | Wins | ||||
|---|---|---|---|---|---|---|---|
avg | std | min | max | min | max | ||
1,958 | 87 | 33.7 | 6.4 | 21.0 | 59.0 | 0 | 4 |
1,959 | 88 | 33.0 | 5.9 | 21.4 | 46.5 | 0 | 2 |
1,960 | 91 | 33.8 | 6.1 | 22.6 | 47.5 | 0 | 5 |
1,961 | 62 | 31.6 | 6.2 | 19.6 | 48.5 | 0 | 2 |
1,962 | 61 | 31.3 | 6.2 | 20.5 | 52.5 | 0 | 4 |
1,963 | 62 | 31.9 | 6.8 | 18.2 | 53.4 | 0 | 7 |
1,964 | 41 | 32.1 | 6.6 | 21.0 | 47.6 | 0 | 3 |
1,965 | 54 | 31.3 | 5.2 | 22.1 | 44.6 | 0 | 6 |
1,966 | 33 | 32.0 | 4.2 | 23.1 | 40.3 | 0 | 4 |
1,967 | 44 | 31.7 | 6.0 | 22.7 | 45.9 | 0 | 4 |
1,968 | 43 | 31.9 | 5.3 | 23.5 | 46.9 | 0 | 3 |
1,969 | 30 | 32.5 | 6.3 | 23.9 | 47.9 | 0 | 6 |
1,970 | 43 | 31.9 | 5.5 | 23.7 | 45.6 | 0 | 5 |
1,971 | 50 | 31.9 | 5.1 | 22.5 | 46.6 | 0 | 6 |
1,972 | 42 | 31.7 | 4.8 | 22.7 | 47.6 | 0 | 5 |
1,973 | 42 | 31.2 | 4.3 | 23.4 | 44.4 | 0 | 5 |
1,974 | 62 | 31.7 | 4.3 | 23.5 | 45.3 | 0 | 3 |
1,975 | 52 | 30.7 | 4.0 | 23.3 | 46.3 | 0 | 5 |
1,976 | 54 | 30.5 | 3.2 | 23.3 | 38.6 | 0 | 6 |
1,977 | 61 | 30.6 | 4.0 | 21.5 | 39.6 | 0 | 4 |
1,978 | 46 | 30.7 | 4.9 | 20.4 | 40.7 | 0 | 6 |
1,979 | 36 | 31.0 | 4.8 | 21.2 | 41.9 | 0 | 4 |
1,980 | 41 | 29.9 | 5.5 | 19.5 | 42.9 | 0 | 5 |
1,981 | 39 | 30.0 | 4.8 | 22.1 | 41.3 | 0 | 3 |
1,982 | 40 | 30.1 | 5.0 | 23.1 | 42.3 | 0 | 2 |
1,983 | 35 | 30.0 | 4.3 | 22.2 | 39.6 | 0 | 4 |
1,984 | 35 | 29.4 | 4.0 | 23.3 | 40.6 | 0 | 7 |
1,985 | 36 | 30.8 | 4.5 | 22.4 | 41.7 | 0 | 5 |
1,986 | 32 | 30.9 | 4.7 | 22.5 | 42.6 | 0 | 5 |
1,987 | 32 | 29.8 | 3.8 | 23.4 | 39.1 | 0 | 6 |
1,988 | 36 | 30.4 | 4.3 | 24.0 | 40.1 | 0 | 8 |
1,989 | 47 | 29.9 | 4.2 | 23.7 | 41.1 | 0 | 6 |
1,990 | 40 | 30.1 | 4.2 | 22.5 | 37.9 | 0 | 6 |
1,991 | 41 | 29.6 | 4.4 | 22.7 | 38.9 | 0 | 7 |
1,992 | 32 | 30.1 | 4.3 | 21.5 | 38.9 | 0 | 9 |
1,993 | 31 | 30.7 | 5.5 | 21.2 | 39.3 | 0 | 7 |
1,994 | 46 | 29.7 | 4.5 | 22.2 | 41.1 | 0 | 8 |
1,995 | 34 | 29.7 | 4.7 | 22.3 | 42.0 | 0 | 9 |
1,996 | 23 | 29.5 | 4.6 | 23.5 | 38.6 | 0 | 8 |
1,997 | 26 | 28.3 | 4.6 | 21.6 | 37.9 | 0 | 7 |
1,998 | 23 | 28.3 | 4.2 | 20.2 | 37.8 | 0 | 8 |
1,999 | 23 | 29.6 | 4.1 | 23.4 | 38.9 | 0 | 5 |
2,000 | 23 | 28.7 | 4.2 | 20.5 | 36.1 | 0 | 9 |
2,001 | 26 | 28.0 | 4.6 | 19.9 | 37.0 | 0 | 9 |
2,002 | 22 | 28.9 | 4.7 | 21.2 | 36.6 | 0 | 11 |
2,003 | 24 | 28.4 | 4.2 | 21.9 | 36.9 | 0 | 6 |
2,004 | 25 | 27.9 | 4.4 | 21.4 | 37.8 | 0 | 13 |
2,005 | 27 | 28.6 | 3.8 | 22.4 | 36.5 | 0 | 7 |
2,006 | 27 | 28.8 | 4.5 | 21.0 | 37.5 | 0 | 7 |
2,007 | 25 | 27.9 | 4.5 | 20.1 | 36.4 | 0 | 6 |
2,008 | 22 | 28.4 | 4.6 | 21.1 | 37.3 | 0 | 6 |
2,009 | 25 | 28.0 | 5.2 | 19.5 | 38.7 | 0 | 6 |
2,010 | 27 | 28.7 | 5.4 | 20.3 | 41.5 | 0 | 5 |
2,011 | 28 | 29.4 | 5.7 | 21.3 | 42.6 | 0 | 11 |
2,012 | 25 | 29.2 | 5.5 | 22.3 | 43.6 | 0 | 5 |
2,013 | 23 | 27.7 | 4.2 | 22.0 | 36.9 | 0 | 13 |
2,014 | 24 | 27.4 | 4.2 | 20.2 | 34.8 | 0 | 11 |
2,015 | 22 | 27.1 | 4.9 | 17.8 | 35.8 | 0 | 10 |
2,016 | 24 | 27.2 | 5.2 | 18.8 | 36.8 | 0 | 10 |
2,017 | 25 | 27.7 | 5.5 | 18.7 | 37.8 | 0 | 9 |
2,018 | 20 | 27.6 | 5.4 | 19.7 | 38.8 | 0 | 11 |
2,019 | 20 | 27.5 | 5.6 | 19.7 | 39.8 | 0 | 11 |
2,020 | 23 | 27.7 | 5.2 | 20.9 | 40.9 | 0 | 11 |
2,021 | 21 | 28.5 | 6.4 | 21.3 | 41.8 | 0 | 10 |
2,022 | 22 | 28.5 | 5.3 | 22.2 | 41.0 | 0 | 15 |
2,023 | 22 | 28.5 | 5.6 | 21.7 | 42.0 | 0 | 19 |
2,024 | 22 | 29.1 | 6.0 | 19.0 | 42.8 | 0 | 7 |
Next up let’s create a line chart of the winners age per year.
# create linegraph of the average age of the winners (note I could have used geom_line here instead of stat_summary as we only take one age per year)
LineGraphWinners <- DriversRacesDF %>%
filter(DriversFinalPosition==1) %>%
ggplot(aes(RaceYear, AverageAge)) +
stat_summary(fun = mean, geom = "line", colour = "blue", size = 1)
LineGraphWinners
# Calculate max and min age of winners and annotate
MaxAge<- DriversRacesDF%>%
filter(DriversFinalPosition==1) %>%
slice_max(AverageAge)
MinAge<- DriversRacesDF%>%
filter(DriversFinalPosition==1) %>%
slice_min(AverageAge)
AnnotatedLineGraphWinners <- LineGraphWinners +
geom_text(data=MaxAge,aes(label=DriverName),hjust=1.1, vjust=0.5)+
geom_text(data=MinAge,aes(label=DriverName),hjust=1.1, vjust=0.5)+
ylim(0,50)+
labs(y="Champions Age", x="Year")
AnnotatedLineGraphWinners
Lastly, we will look at the correlation between the fastest lap time per race and the average race time by creating a scatter plot.
# summarise data for correlational plot (i.e. one data point per year per race)
CorData<-DriversRacesDF %>%
filter(FastestLapTimeSec>=78) %>%
group_by(RaceYear, raceId) %>%
summarise(MinLapTime=min(FastestLapTimeSec, na.rm=TRUE),
AvgRaceTime=mean(racetimeMin, na.rm=TRUE),
GrandPrixName=unique(name)) %>%
ungroup()
# create scatterplot
ScatterPlot<-ggplot(CorData, aes(AvgRaceTime,MinLapTime)) +
geom_point()
ScatterPlot
# identify max race time so we can check which grandprix this was
MaxRaceTime <- CorData%>%
slice_max(AvgRaceTime)
# add annotations to highlight the longest race time grandprix
ScatterPlot<-ScatterPlot+
geom_text(data=MaxRaceTime, aes(label=GrandPrixName), hjust=0.9, vjust=-1)+
geom_text(data=MaxRaceTime, aes(label=RaceYear), hjust=2, vjust=1)
ScatterPlot
The scatterplot above shows on first view a weak correlation between the average race time and the fastest lap times. Point of interest is the high race time during the 2011 Canadian Grand Prix, worth looking into.
Last we will save the DriversDF, DriversRacesDF, ConstructorsDF, and RacesDF as an .RData file named Practical5_F1DriversRaces.RData.
save(DriversDF, DriversRacesDF, ConstructorsDF, ConstructorsDF,RacesDF, file="P5_F1Data.RData")
save.image(file="P5_F1.RData")