library(tidyverse) #only package needed in this practical
resultsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EbRQPyzvpfdFnuKsN_sDRh4BI9lOOBcWH86Sj5n5X-RZEQ?download=1"))
constructorsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EWtDRBgjjlpEixr8qZCdDDkBa8h0uuUkbeVH-uv8rPS8DQ?download=1"))
constructors_resultsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EYjwN0jnEEhLj9hNZtg56bEB5lL6_S9be3wel5-CEOOfrg?download=1"))
constructor_standingsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXa0Ep8STzVMttkaUOoiVPkBOfZUROGpQe_l_RXtYBd2UA?download=1"))
driver_standingsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXsq_S_QFj1LmDEmcEKS-4IBfEzwDg4LF8eawRXR-ihzQg?download=1"))
driversDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/ERxwKFrKAxtOiaGRtZeHEQsBbu-Dpt-upC8pFOHatLZfnQ?download=1"))
racesDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EeNMdw-MTbRAspgSxcca_XEB_iGTpi5f7idKVJkZxjr_HQ?download=1"))8 Practical 3a: Data preparation in R
During this practical you will recap data cleaning in R.
You will recap:
- Reading your data
- Data cleaning
- Merging your data (and how this is different to joining your data in Tableau)
8.1 Exercises
For the exercises you will require seven data files which can be found here or alternatively you can read them in using the following links:
results.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EbRQPyzvpfdFnuKsN_sDRh4BI9lOOBcWH86Sj5n5X-RZEQ?download=1
constructors.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EWtDRBgjjlpEixr8qZCdDDkBa8h0uuUkbeVH-uv8rPS8DQ?download=1
constructors_results.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EYjwN0jnEEhLj9hNZtg56bEB5lL6_S9be3wel5-CEOOfrg?download=1
constructor_standings.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXa0Ep8STzVMttkaUOoiVPkBOfZUROGpQe_l_RXtYBd2UA?download=1
driver_standings.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXsq_S_QFj1LmDEmcEKS-4IBfEzwDg4LF8eawRXR-ihzQg?download=1
drivers.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/ERxwKFrKAxtOiaGRtZeHEQsBbu-Dpt-upC8pFOHatLZfnQ?download=1
races.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EeNMdw-MTbRAspgSxcca_XEB_iGTpi5f7idKVJkZxjr_HQ?download=1
Exercise 1: Open the ‘results.csv’, ‘constructors.csv’, and ‘constructors_results.csv’, ‘constructor_standings.csv’, ‘driver_standings.csv’, ‘drivers.csv’, and ‘races.csv’ files in R name the dataframes resultsDF, constructorsDF, constructors_resultsDF,constructor_standingsDF,driver_standingsDF,driversDF, and racesDF.
Exercise 2: Show the first 10 rows of driversDF, racesDF, and resultsDF
head(driversDF, 10)# A tibble: 10 × 9
driverId driverRef number code forename surname dob nationality url
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 hamilton "44" HAM Lewis Hamilton 1985… British http…
2 2 heidfeld "\\N" HEI Nick Heidfeld 1977… German http…
3 3 rosberg "6" ROS Nico Rosberg 1985… German http…
4 4 alonso "14" ALO Fernando Alonso 1981… Spanish http…
5 5 kovalainen "\\N" KOV Heikki Kovalainen 1981… Finnish http…
6 6 nakajima "\\N" NAK Kazuki Nakajima 1985… Japanese http…
7 7 bourdais "\\N" BOU Sébastien Bourdais 1979… French http…
8 8 raikkonen "7" RAI Kimi Räikkönen 1979… Finnish http…
9 9 kubica "88" KUB Robert Kubica 1984… Polish http…
10 10 glock "\\N" GLO Timo Glock 1982… German http…
head(racesDF,10)# A tibble: 10 × 12
raceId year round circuitId name Circuit.Name date time url Latitude
<int> <int> <int> <int> <chr> <chr> <chr> <chr> <chr> <dbl>
1 435 1985 16 29 Austral… Adelaide St… 03/1… "\\N" http… -34.9
2 419 1986 16 29 Austral… Adelaide St… 26/1… "\\N" http… -34.9
3 403 1987 16 29 Austral… Adelaide St… 15/1… "\\N" http… -34.9
4 387 1988 16 29 Austral… Adelaide St… 13/1… "\\N" http… -34.9
5 371 1989 16 29 Austral… Adelaide St… 05/1… "\\N" http… -34.9
6 336 1990 16 29 Austral… Adelaide St… 04/1… "\\N" http… -34.9
7 320 1991 16 29 Austral… Adelaide St… 03/1… "\\N" http… -34.9
8 304 1992 16 29 Austral… Adelaide St… 08/1… "\\N" http… -34.9
9 288 1993 16 29 Austral… Adelaide St… 07/1… "\\N" http… -34.9
10 272 1994 16 29 Austral… Adelaide St… 13/1… "\\N" http… -34.9
# ℹ 2 more variables: Longitude <dbl>, X <chr>
head(resultsDF, 10)# A tibble: 10 × 18
resultId raceId driverId constructorId number grid position positionText
<int> <int> <int> <int> <chr> <int> <chr> <chr>
1 1 18 1 1 22 1 "1" 1
2 2 18 2 2 3 5 "2" 2
3 3 18 3 3 7 7 "3" 3
4 4 18 4 4 5 11 "4" 4
5 5 18 5 1 23 3 "5" 5
6 6 18 6 3 8 13 "6" 6
7 7 18 7 5 14 17 "7" 7
8 8 18 8 6 1 15 "8" 8
9 9 18 9 2 4 2 "\\N" R
10 10 18 10 7 12 18 "\\N" R
# ℹ 10 more variables: positionOrder <int>, points <dbl>, laps <int>,
# time <chr>, milliseconds <chr>, fastestLap <chr>, rank <chr>,
# fastestLapTime <chr>, fastestLapSpeed <chr>, statusId <int>
From this we can see drivers have multiple row entries, this is important to remember (e.g. if we want to calculate average height we do not want to double count the same athlete). We can also scroll through the variables to see the variables names and the type of data entered.
Next up we start cleaning our data. Let’s start with the driversDF and racesDF dataframes.
Exercise 3: In the driversDF dataframe, replace the “\N” in number with NA, and change dob to date format and number to numeric. In the racesDF dataframe change date to date format.
driversDF$number<-gsub("\\N",NA,driversDF$number)
driversDF<-driversDF %>%
mutate(dob=ymd(dob),
number=as.numeric(number))
racesDF$date<-dmy(racesDF$date)Next let’s move on to the resultsDF
Exercise 4: Replace the “\N” in all variables with NA. Change number, position, milliseconds, fastestLap, rank, and fastest lapspeed to numeric.
resultsDF<- replace(resultsDF, resultsDF == "\\N", NA)
resultsDF<- resultsDF %>%
mutate(number=as.numeric(number),
position=as.numeric(position),
milliseconds=as.numeric(milliseconds),
fastestLap=as.numeric(fastestLap),
rank=as.numeric(rank),
fastestLapSpeed=as.numeric(fastestLapSpeed))
## Why not resultsDF <- resultsDF %>%
## mutate(across(where(is.character), as.numeric))Exercise 5: Create two new variables. First create a variable called racetimeMin from milliseconds (this is the race time in milliseconds). Create a FastestLapTimeSec from FastestLapTime
resultsDF<- resultsDF %>%
mutate(racetimeMin= (milliseconds/1000)/60) %>%
separate(fastestLapTime, into = c("minutes", "seconds"), sep = ":") %>%
mutate(
minutes = as.numeric(minutes),
seconds = as.numeric(seconds),
FastestLapTimeSec = (minutes * 60) + seconds
)Next up let’s check if any of the datafiles have empty columns or rows.
Exercise 6: Check all dataframes for empty columns and rows. If there are any empty columns or rows remove these.
# check for missing row data
constructor_standingsDF<-constructor_standingsDF[rowSums(is.na(constructor_standingsDF)) != ncol(constructor_standingsDF), ]
constructors_resultsDF<-constructors_resultsDF[rowSums(is.na(constructors_resultsDF)) != ncol(constructors_resultsDF), ]
constructorsDF<-constructorsDF[rowSums(is.na(constructorsDF)) != ncol(constructorsDF), ]
driver_standingsDF<-driver_standingsDF[rowSums(is.na(driver_standingsDF)) != ncol(driver_standingsDF), ]
driversDF<-driversDF[rowSums(is.na(driversDF)) != ncol(driversDF), ]
racesDF<-racesDF[rowSums(is.na(racesDF)) != ncol(racesDF), ]
resultsDF<-resultsDF[rowSums(is.na(resultsDF)) != ncol(resultsDF), ]
#check for missing column data
constructor_standingsDF<-constructor_standingsDF[,colSums(is.na(constructor_standingsDF)) != nrow(constructor_standingsDF)]
constructors_resultsDF<-constructors_resultsDF[,colSums(is.na(constructors_resultsDF)) != nrow(constructors_resultsDF)]
constructorsDF<-constructorsDF[,colSums(is.na(constructorsDF)) != nrow(constructorsDF)]
driver_standingsDF<-driver_standingsDF[,colSums(is.na(driver_standingsDF)) != nrow(driver_standingsDF)]
driversDF<-driversDF[,colSums(is.na(driversDF)) != nrow(driversDF)]
racesDF<-racesDF[,colSums(is.na(racesDF)) != nrow(racesDF)]
resultsDF<-resultsDF[,colSums(is.na(resultsDF)) != nrow(resultsDF)] Next we need to merge some of the dataframes together. We want to merge the following:
- driversDF with driver_standingDF using an inner join naming it DriversMergedDF
- DriversMergedDF with the resultsDF using a left join (all drivers data included) naming it DriversMergedDF
- constructors_resultsDF with constructorsDF using an inner join, naming it ConstructersMergedDF
- ConstructorsMergedDF with constructors_standingsDF using an inner join, naming it ConstructersMergedDF
DriversMergedDF <- merge(driversDF,driver_standingsDF,by="driverId", all=FALSE)
DriversMergedDF <- merge(DriversMergedDF,resultsDF,by=c("driverId","raceId"), all.x=TRUE)
ConstructersMergedDF <- merge(constructors_resultsDF,constructorsDF,by="constructorId", all=FALSE)
ConstructersMergedDF <- merge(ConstructersMergedDF,constructor_standingsDF,by=c("constructorId", "raceId"), all=FALSE)all=FALSE results in a inner join. To apply a left join use all.x=TRUE, to use a right join use all.y=TRUE, to use a full join use all=TRUE.
Right now we have two DFs we want to do one last check, get rid of any irrelevant columns and rename some variables.
Exercise 7: In the DriversMergedDF, remove the following columns: driverRef, number.x, url, number.y, grid, positionText.y, time, milliseconds, fastestLap, minutes, seconds, statusId.
DriversMergedDF <- DriversMergedDF%>%
select(-driverRef, -number.x, -url, -number.y, -grid, -positionText.y, -time, -milliseconds, -fastestLap, -minutes, -seconds, -statusId, -positionText.x, -rank)Exercise 8: Rename the following variables:
- code = DriverCode
- forename= FirstName
- surname = LastName
- position.x = DriversTotalPosition
- points.x = DriversTotalPoints
- wins = DriverWins
- position.y = DriversRacePosition
- points.y = DriversRacePoints
- laps = LapsCompleted
DriversMergedDF <- DriversMergedDF%>%
rename(DriverCode=code,
FirstName=forename,
LastName=surname,
DriversTotalPosition=position.x,
DriversTotalPoints=points.x,
DriverWins=wins,
DriversRacePosition=position.y,
DriversRacePoints=points.y,
LapsCompleted=laps)Exercise 9: In the DriversMergedDF, remove the following columns: driverRef, number.x, url, number.y, grid, positionText.y, time, milliseconds, fastestLap, minutes, seconds, statusId.
ConstructersMergedDF <- ConstructersMergedDF%>%
select(-status, -constructorRef, -positionText, -url)Exercise 10: Rename the following variables:
- points.x = ConstructorRacePoints
- name = TeamName
- points.y = ConstructorsTotalPoints
- position = ConstructorsPosition
- wins = ConstructorWins
ConstructersMergedDF <- ConstructersMergedDF%>%
rename(ConstructorRacePoints=points.x,
TeamName=name,
ConstructorsTotalPoints=points.y,
ConstructorsPosition=position,
ConstructorWins=wins)Last we want to check if any of the racesDF dataframe variables require renaming.
Exercise 11: Rename the following variables:
- X = RaceCountry
- Circuit.Name = CircuitName
- year = RaceYear
- round = RaceRound
racesDF <- racesDF%>%
rename(RaceCountry=X,
CircuitName=Circuit.Name,
RaceYear=year,
RaceRound=round)We now have three clean dataframes. Lets save these as .csv files.
Exercise 12: Save your datafiles as a .csv file named “P3_F1Drivers”, “P3_F1Constructors”, “P3_F1Races”.
write_rds(DriversMergedDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Drivers.rds")
write_rds(ConstructersMergedDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Constructors.rds")
write_rds(racesDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Races.rds")