8  Practical 3a: Data preparation in R

During this practical you will recap data cleaning in R.

You will recap:

8.1 Exercises

For the exercises you will require seven data files which can be found here or alternatively you can read them in using the following links:

  • results.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EbRQPyzvpfdFnuKsN_sDRh4BI9lOOBcWH86Sj5n5X-RZEQ?download=1

  • constructors.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EWtDRBgjjlpEixr8qZCdDDkBa8h0uuUkbeVH-uv8rPS8DQ?download=1

  • constructors_results.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EYjwN0jnEEhLj9hNZtg56bEB5lL6_S9be3wel5-CEOOfrg?download=1

  • constructor_standings.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXa0Ep8STzVMttkaUOoiVPkBOfZUROGpQe_l_RXtYBd2UA?download=1

  • driver_standings.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXsq_S_QFj1LmDEmcEKS-4IBfEzwDg4LF8eawRXR-ihzQg?download=1

  • drivers.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/ERxwKFrKAxtOiaGRtZeHEQsBbu-Dpt-upC8pFOHatLZfnQ?download=1

  • races.csv: https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EeNMdw-MTbRAspgSxcca_XEB_iGTpi5f7idKVJkZxjr_HQ?download=1

Exercise 1: Open the ‘results.csv’, ‘constructors.csv’, and ‘constructors_results.csv’, ‘constructor_standings.csv’, ‘driver_standings.csv’, ‘drivers.csv’, and ‘races.csv’ files in R name the dataframes resultsDF, constructorsDF, constructors_resultsDF,constructor_standingsDF,driver_standingsDF,driversDF, and racesDF.

library(tidyverse) #only package needed in this practical

resultsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EbRQPyzvpfdFnuKsN_sDRh4BI9lOOBcWH86Sj5n5X-RZEQ?download=1"))

constructorsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EWtDRBgjjlpEixr8qZCdDDkBa8h0uuUkbeVH-uv8rPS8DQ?download=1"))

constructors_resultsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EYjwN0jnEEhLj9hNZtg56bEB5lL6_S9be3wel5-CEOOfrg?download=1"))

constructor_standingsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXa0Ep8STzVMttkaUOoiVPkBOfZUROGpQe_l_RXtYBd2UA?download=1"))

driver_standingsDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EXsq_S_QFj1LmDEmcEKS-4IBfEzwDg4LF8eawRXR-ihzQg?download=1"))

driversDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/ERxwKFrKAxtOiaGRtZeHEQsBbu-Dpt-upC8pFOHatLZfnQ?download=1"))

racesDF <- as_tibble(read.csv("https://strath-my.sharepoint.com/:x:/g/personal/xanne_janssen_strath_ac_uk/EeNMdw-MTbRAspgSxcca_XEB_iGTpi5f7idKVJkZxjr_HQ?download=1"))

Exercise 2: Show the first 10 rows of driversDF, racesDF, and resultsDF

head(driversDF, 10)
# A tibble: 10 × 9
   driverId driverRef  number code  forename  surname    dob   nationality url  
      <int> <chr>      <chr>  <chr> <chr>     <chr>      <chr> <chr>       <chr>
 1        1 hamilton   "44"   HAM   Lewis     Hamilton   1985… British     http…
 2        2 heidfeld   "\\N"  HEI   Nick      Heidfeld   1977… German      http…
 3        3 rosberg    "6"    ROS   Nico      Rosberg    1985… German      http…
 4        4 alonso     "14"   ALO   Fernando  Alonso     1981… Spanish     http…
 5        5 kovalainen "\\N"  KOV   Heikki    Kovalainen 1981… Finnish     http…
 6        6 nakajima   "\\N"  NAK   Kazuki    Nakajima   1985… Japanese    http…
 7        7 bourdais   "\\N"  BOU   Sébastien Bourdais   1979… French      http…
 8        8 raikkonen  "7"    RAI   Kimi      Räikkönen  1979… Finnish     http…
 9        9 kubica     "88"   KUB   Robert    Kubica     1984… Polish      http…
10       10 glock      "\\N"  GLO   Timo      Glock      1982… German      http…
head(racesDF,10)
# A tibble: 10 × 12
   raceId  year round circuitId name     Circuit.Name date  time  url   Latitude
    <int> <int> <int>     <int> <chr>    <chr>        <chr> <chr> <chr>    <dbl>
 1    435  1985    16        29 Austral… Adelaide St… 03/1… "\\N" http…    -34.9
 2    419  1986    16        29 Austral… Adelaide St… 26/1… "\\N" http…    -34.9
 3    403  1987    16        29 Austral… Adelaide St… 15/1… "\\N" http…    -34.9
 4    387  1988    16        29 Austral… Adelaide St… 13/1… "\\N" http…    -34.9
 5    371  1989    16        29 Austral… Adelaide St… 05/1… "\\N" http…    -34.9
 6    336  1990    16        29 Austral… Adelaide St… 04/1… "\\N" http…    -34.9
 7    320  1991    16        29 Austral… Adelaide St… 03/1… "\\N" http…    -34.9
 8    304  1992    16        29 Austral… Adelaide St… 08/1… "\\N" http…    -34.9
 9    288  1993    16        29 Austral… Adelaide St… 07/1… "\\N" http…    -34.9
10    272  1994    16        29 Austral… Adelaide St… 13/1… "\\N" http…    -34.9
# ℹ 2 more variables: Longitude <dbl>, X <chr>
head(resultsDF, 10)
# A tibble: 10 × 18
   resultId raceId driverId constructorId number  grid position positionText
      <int>  <int>    <int>         <int> <chr>  <int> <chr>    <chr>       
 1        1     18        1             1 22         1 "1"      1           
 2        2     18        2             2 3          5 "2"      2           
 3        3     18        3             3 7          7 "3"      3           
 4        4     18        4             4 5         11 "4"      4           
 5        5     18        5             1 23         3 "5"      5           
 6        6     18        6             3 8         13 "6"      6           
 7        7     18        7             5 14        17 "7"      7           
 8        8     18        8             6 1         15 "8"      8           
 9        9     18        9             2 4          2 "\\N"    R           
10       10     18       10             7 12        18 "\\N"    R           
# ℹ 10 more variables: positionOrder <int>, points <dbl>, laps <int>,
#   time <chr>, milliseconds <chr>, fastestLap <chr>, rank <chr>,
#   fastestLapTime <chr>, fastestLapSpeed <chr>, statusId <int>

From this we can see drivers have multiple row entries, this is important to remember (e.g. if we want to calculate average height we do not want to double count the same athlete). We can also scroll through the variables to see the variables names and the type of data entered.

Next up we start cleaning our data. Let’s start with the driversDF and racesDF dataframes.

Exercise 3: In the driversDF dataframe, replace the “\N” in number with NA, and change dob to date format and number to numeric. In the racesDF dataframe change date to date format.

driversDF$number<-gsub("\\N",NA,driversDF$number)

driversDF<-driversDF %>%
  mutate(dob=ymd(dob),
         number=as.numeric(number)) 

racesDF$date<-dmy(racesDF$date)

Next let’s move on to the resultsDF

Exercise 4: Replace the “\N” in all variables with NA. Change number, position, milliseconds, fastestLap, rank, and fastest lapspeed to numeric.

resultsDF<- replace(resultsDF, resultsDF == "\\N", NA)

resultsDF<- resultsDF %>%
  mutate(number=as.numeric(number), 
         position=as.numeric(position),
         milliseconds=as.numeric(milliseconds),
         fastestLap=as.numeric(fastestLap),
         rank=as.numeric(rank),
         fastestLapSpeed=as.numeric(fastestLapSpeed))

## Why not resultsDF <- resultsDF %>% 
##           mutate(across(where(is.character), as.numeric))

Exercise 5: Create two new variables. First create a variable called racetimeMin from milliseconds (this is the race time in milliseconds). Create a FastestLapTimeSec from FastestLapTime

resultsDF<- resultsDF %>%
  mutate(racetimeMin= (milliseconds/1000)/60) %>%
  separate(fastestLapTime, into = c("minutes", "seconds"), sep = ":") %>%
  mutate(
    minutes = as.numeric(minutes),
    seconds = as.numeric(seconds),
    FastestLapTimeSec = (minutes * 60) + seconds
  )

Next up let’s check if any of the datafiles have empty columns or rows.

Exercise 6: Check all dataframes for empty columns and rows. If there are any empty columns or rows remove these.

# check for missing row data
constructor_standingsDF<-constructor_standingsDF[rowSums(is.na(constructor_standingsDF)) != ncol(constructor_standingsDF), ] 
constructors_resultsDF<-constructors_resultsDF[rowSums(is.na(constructors_resultsDF)) != ncol(constructors_resultsDF), ] 

constructorsDF<-constructorsDF[rowSums(is.na(constructorsDF)) != ncol(constructorsDF), ] 

driver_standingsDF<-driver_standingsDF[rowSums(is.na(driver_standingsDF)) != ncol(driver_standingsDF), ] 

driversDF<-driversDF[rowSums(is.na(driversDF)) != ncol(driversDF), ] 

racesDF<-racesDF[rowSums(is.na(racesDF)) != ncol(racesDF), ] 
resultsDF<-resultsDF[rowSums(is.na(resultsDF)) != ncol(resultsDF), ] 

#check for missing column data
constructor_standingsDF<-constructor_standingsDF[,colSums(is.na(constructor_standingsDF)) != nrow(constructor_standingsDF)] 

constructors_resultsDF<-constructors_resultsDF[,colSums(is.na(constructors_resultsDF)) != nrow(constructors_resultsDF)] 

constructorsDF<-constructorsDF[,colSums(is.na(constructorsDF)) != nrow(constructorsDF)] 

driver_standingsDF<-driver_standingsDF[,colSums(is.na(driver_standingsDF)) != nrow(driver_standingsDF)] 

driversDF<-driversDF[,colSums(is.na(driversDF)) != nrow(driversDF)] 

racesDF<-racesDF[,colSums(is.na(racesDF)) != nrow(racesDF)] 

resultsDF<-resultsDF[,colSums(is.na(resultsDF)) != nrow(resultsDF)] 

Next we need to merge some of the dataframes together. We want to merge the following:
- driversDF with driver_standingDF using an inner join naming it DriversMergedDF
- DriversMergedDF with the resultsDF using a left join (all drivers data included) naming it DriversMergedDF
- constructors_resultsDF with constructorsDF using an inner join, naming it ConstructersMergedDF
- ConstructorsMergedDF with constructors_standingsDF using an inner join, naming it ConstructersMergedDF

DriversMergedDF <- merge(driversDF,driver_standingsDF,by="driverId", all=FALSE)
DriversMergedDF <- merge(DriversMergedDF,resultsDF,by=c("driverId","raceId"), all.x=TRUE)
ConstructersMergedDF <- merge(constructors_resultsDF,constructorsDF,by="constructorId", all=FALSE)
ConstructersMergedDF <- merge(ConstructersMergedDF,constructor_standingsDF,by=c("constructorId", "raceId"), all=FALSE)
Note

all=FALSE results in a inner join. To apply a left join use all.x=TRUE, to use a right join use all.y=TRUE, to use a full join use all=TRUE.

Right now we have two DFs we want to do one last check, get rid of any irrelevant columns and rename some variables.

Exercise 7: In the DriversMergedDF, remove the following columns: driverRef, number.x, url, number.y, grid, positionText.y, time, milliseconds, fastestLap, minutes, seconds, statusId.

DriversMergedDF <- DriversMergedDF%>%
  select(-driverRef, -number.x, -url, -number.y, -grid, -positionText.y, -time, -milliseconds, -fastestLap, -minutes, -seconds, -statusId, -positionText.x, -rank)

Exercise 8: Rename the following variables:
- code = DriverCode
- forename= FirstName
- surname = LastName
- position.x = DriversTotalPosition
- points.x = DriversTotalPoints
- wins = DriverWins
- position.y = DriversRacePosition
- points.y = DriversRacePoints
- laps = LapsCompleted

DriversMergedDF <- DriversMergedDF%>%
  rename(DriverCode=code,
         FirstName=forename,
         LastName=surname,
         DriversTotalPosition=position.x,
         DriversTotalPoints=points.x,
         DriverWins=wins,
         DriversRacePosition=position.y,
         DriversRacePoints=points.y,
         LapsCompleted=laps)

Exercise 9: In the DriversMergedDF, remove the following columns: driverRef, number.x, url, number.y, grid, positionText.y, time, milliseconds, fastestLap, minutes, seconds, statusId.

ConstructersMergedDF <- ConstructersMergedDF%>%
  select(-status, -constructorRef, -positionText, -url)

Exercise 10: Rename the following variables:
- points.x = ConstructorRacePoints
- name = TeamName
- points.y = ConstructorsTotalPoints
- position = ConstructorsPosition
- wins = ConstructorWins

ConstructersMergedDF <- ConstructersMergedDF%>%
  rename(ConstructorRacePoints=points.x,
         TeamName=name,
         ConstructorsTotalPoints=points.y,
         ConstructorsPosition=position,
         ConstructorWins=wins)

Last we want to check if any of the racesDF dataframe variables require renaming.

Exercise 11: Rename the following variables:
- X = RaceCountry
- Circuit.Name = CircuitName
- year = RaceYear
- round = RaceRound

racesDF <- racesDF%>%
  rename(RaceCountry=X,
         CircuitName=Circuit.Name,
         RaceYear=year,
         RaceRound=round)

We now have three clean dataframes. Lets save these as .csv files.

Exercise 12: Save your datafiles as a .csv file named “P3_F1Drivers”, “P3_F1Constructors”, “P3_F1Races”.

write_rds(DriversMergedDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Drivers.rds")

write_rds(ConstructersMergedDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Constructors.rds")

write_rds(racesDF,"C:/Users/wkb14101/OneDrive - University of Strathclyde/MSc SDA/R Projects/B1703/data/P3_F1Races.rds")