Data Cleaning Process

2021-09-28

1. Document description

This document outlines all of the steps taken to clean and transform Cyclistic’s raw datasets to prepare the data for the next stage of analysis. For the purpose of this case study, only data collected between August 2020 - July 2021 will be assessed. The dataset description can be located here.

Please note that Cyclistic is a fictional company. Raw data has been collected by Motivate International Inc, the company which operates the City of Chicago’s Divvy bicycle sharing service. The license to use this public dataset can be found here.

library(tidyverse)
library(data.table)

2. Combine datasets

2.1 Load raw data

Aug_20 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202008-divvy-tripdata.csv")

Sep_20 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202009-divvy-tripdata.csv")

Oct_20 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202010-divvy-tripdata.csv")

Nov_20 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202011-divvy-tripdata.csv")

Dec_20 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202012-divvy-tripdata.csv")

Jan_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202101-divvy-tripdata.csv")

Feb_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202102-divvy-tripdata.csv")

Mar_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202103-divvy-tripdata.csv")

Apr_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202104-divvy-tripdata.csv")

May_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202105-divvy-tripdata.csv")

Jun_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202106-divvy-tripdata.csv")

Jul_21 <- read.csv("C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\02. Raw Data\\202107-divvy-tripdata.csv")

2.2 Check data structure

The structure summary outputs will help to identify if any of the individual raw datasets have different string types, column names etc.

str(Aug_20)
str(Sep_20)
str(Oct_20)
str(Nov_20)
str(Dec_20)
str(Jan_21)
str(Feb_21)
str(Mar_21)
str(Apr_21)
str(May_21)
str(Jun_21)
str(Jul_21)

2.3 Change string types

The structure outputs showed that the Aug_20 to Nov_20 datasets listed the start_station_id and end_station_id columns as ‘int’ string types instead of ‘chr’ string types. Prior to merging the raw datasets into one, all of their string types should be the same.

# Aug_20
Aug_20 <- mutate(
  Aug_20, 
  start_station_id = as.character(start_station_id),
  end_station_id = as.character(end_station_id)
  )

# Sep_20
Sep_20 <- mutate(
  Sep_20, 
  start_station_id = as.character(start_station_id),
  end_station_id = as.character(end_station_id)
  )

# Oct_20
Oct_20 <- mutate(
  Oct_20, 
  start_station_id = as.character(start_station_id),
  end_station_id = as.character(end_station_id)
  )

# Nov_20
Nov_20 <- mutate(
  Nov_20, 
  start_station_id = as.character(start_station_id),
  end_station_id = as.character(end_station_id)
  )

2.4 Merge datasets

# Merge all 12 individual datasets into one 
all_trips <- bind_rows(
  Aug_20, Sep_20, Oct_20, Nov_20, Dec_20, 
  Jan_21, Feb_21, Mar_21, Apr_21, May_21, Jun_21, Jul_21
  )

3. Prepare dataset

3.1 Change date string types

The started_at and ended_at columns should be changed from a ‘chr’ string type to a date string type to help with analysis.

# Change started_at string type 
all_trips$started_at <- as.POSIXct(
  all_trips$started_at, 
  format = "%Y-%m-%d %H:%M:%S"
  )

# Change ended_at string type 
all_trips$ended_at <- as.POSIXct(
  all_trips$ended_at, 
  format = "%Y-%m-%d %H:%M:%S"
  )

# Order by date 
all_trips <- all_trips %>%
  arrange(started_at)

3.2 Calculate ride length

Ride length as a number string type will not only be useful for future analysis but will also help to identify if there are any invalid data points, i.e. ride lengths which are less than 0.

# Calculate time difference in seconds 
all_trips$ride_length <- difftime(
  all_trips$ended_at, 
  all_trips$started_at,
  units = "secs"
  ) 

# Change string type to numeric 
all_trips$ride_length <- as.numeric(
  as.character(all_trips$ride_length)
  )

3.3 Summarise by date variables

Separate columns for year, month, day of week etc. will be useful for future analysis.

# Year 
all_trips$year <- format(
    all_trips$started_at, 
    "%Y"
    )

# Month 
all_trips$month <- format(
    all_trips$started_at, 
    "%m"
    )

# Week 
all_trips$week <- format(
  all_trips$started_at,
  "%W"
  )

# Day
all_trips$day <- format(
  all_trips$started_at, 
  "%d"
  )

# Day of week 
all_trips$day_of_week <- format(
  all_trips$started_at, 
  "%A"
  )

# Date, YYYY-MM-DD
all_trips$YMD <- format(
  all_trips$started_at, 
  "%Y-%m-%d"
  )

# Time of Day, HH:MM:SS
all_trips$ToD <- format(
  all_trips$started_at, 
  "%H:%M:%S"
  )

4. Clean dataset

4.1 Remove rows with ride length < 0

As identified in section 3.2 there were a few invalid data points with ride lengths less than 0. These data points should be deleted from the cleaned dataset.

# Remove ride lengths < 0
all_trips_cleaned <- all_trips %>%
  filter(!(ride_length < 0))

4.2 Remove incomplete rows

There were a few instances where there were no recorded station names. These incomplete rows of data should be removed.

# Remove start_station_name and end_station_name blank results 
all_trips_cleaned <- all_trips_cleaned %>%
    filter(
      !(is.na(start_station_name) |
          start_station_name == "")
      ) %>% 
  
  filter(
    !(is.na(end_station_name) |
        end_station_name == "")
    )

4.3 Remove tests

On further inspection of the all_trips_cleaned dataset within the R console, there were a few station names that are fully capitalized text strings instead of adhering to the starting capital letter followed by all lowercase letters string type. In addition, it appeared that the capitalized station names include the word ‘TEST’ within their string. This observation that test rides have been coded as all capital letters as their station_name was explored using the following code:

# Create a data frame to check if capitalized station names are test rides 
capitalized_station_name_check <- all_trips_cleaned %>%
  
  filter(
    str_detect(start_station_name, "[:upper:]")
    & !str_detect(start_station_name,"[:lower:]")
    ) %>%
  
  group_by(
    start_station_name
    ) %>%
  
  count(
    start_station_name
    )

Through further exploration of the identified capitalized station name row outputs using the R console, it appeared that the capitalized station name results were for test and maintenance purposes. These results should be removed from the all_trips_cleaned dataset.

# Remove capitalized station name results from the cleaned dataset 
all_trips_cleaned <- all_trips_cleaned %>%
    filter(
      !(str_detect(start_station_name, "[:upper:]")
        & !str_detect(start_station_name, "[:lower:]"))
      )

4.4 Remove duplicates

The ride_id column is unique to each ride. This column should be reviewed to see if there are any duplicates to delete.

# Create a data frame to check that there are no duplicates 
ride_id_check <- all_trips_cleaned %>%
  count(ride_id) %>%
  filter(n > 1)

The above code returned no results, no duplicates are recorded in the cleaned dataset.

5. Understand dataset

5.1 Check rideable type

unique(all_trips_cleaned$rideable_type)

The above line of code returned three bike types used in the cleaned dataset. The following code was ran to review if a bike type was added to the dataset at a later date.

# Create a data frame to see when a unique bike type was added to the dataset
rideable_type_check <-all_trips_cleaned %>%
  
  mutate(
    year = year(started_at), 
    month = month(started_at)
    ) %>%
  
  group_by(
    month, 
    year
    ) %>%
  
  select(
    rideable_type, 
    month, 
    year
    ) %>%
  
  count(
    rideable_type
    )

The rideable type check output within the R console showed that “classic_bikes” were added to the dataset from December 2020 onwards. This should be noted for future analysis purposes.

5.2 Check station name

There may have been a few instances where stations were removed or added from Cyclistic’s portfolio. This can be reviewed using the following lines of code:

Firstly, a data frame which lists the unique station names should be created.

# Create a data frame which lists the unique station names 
station_name_check <- all_trips_cleaned %>%
  group_by(start_station_name) %>%
  count(start_station_name)

Following this, data frames which list the unique station names used each month should be created.

# Aug 2020 data frame which lists the unique station names used that month
Aug_2020_filter <- all_trips_cleaned %>%
  filter(
    month == "08"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Sep 2020 data frame which lists the unique station names used that month
Sep_2020_filter <- all_trips_cleaned %>%
  filter(
    month == "09"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Oct 2020 data frame which lists the unique station names used that month
Oct_2020_filter <- all_trips_cleaned %>%
  filter(
    month == "10"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Oct 2020 data frame which lists the unique station names used that month
Nov_2020_filter <- all_trips_cleaned %>%
  filter(
    month == "11"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Dec 2020 data frame which lists the unique station names used that month
Dec_2020_filter <- all_trips_cleaned %>%
  filter(
    month == "12"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Jan 2021 data frame which lists the unique station names used that month
Jan_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "01"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Feb 2021 data frame which lists the unique station names used that month
Feb_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "02"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Mar 2021 data frame which lists the unique station names used that month
Mar_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "03"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Apr 2021 data frame which lists the unique station names used that month
Apr_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "04"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# May 2021 data frame which lists the unique station names used that month
May_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "05"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Jun 2021 data frame which lists the unique station names used that month
Jun_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "06"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

# Jul 2021 data frame which lists the unique station names used that month
Jul_2021_filter <- all_trips_cleaned %>%
  filter(
    month == "07"
    ) %>%
  group_by(
    start_station_name
    ) %>%
  count(
    start_station_name
    )

Each unique station name can be tested against the monthly filter data frames to assess which unique station was used in a particular month.

# Create columns for each month in the station name check data frame to check if the station name appears in the individual month filter data frames created earlier
station_name_check$Aug_2020 <- as.integer(station_name_check$start_station_name
                                          %in% Aug_2020_filter$start_station_name)

station_name_check$Sep_2020 <- as.integer(station_name_check$start_station_name 
                                          %in% Sep_2020_filter$start_station_name)

station_name_check$Oct_2020 <- as.integer(station_name_check$start_station_name 
                                          %in% Oct_2020_filter$start_station_name)

station_name_check$Nov_2020 <- as.integer(station_name_check$start_station_name 
                                          %in% Nov_2020_filter$start_station_name)

station_name_check$Dec_2020 <- as.integer(station_name_check$start_station_name 
                                          %in% Dec_2020_filter$start_station_name)

station_name_check$Jan_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Jan_2021_filter$start_station_name)

station_name_check$Feb_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Feb_2021_filter$start_station_name)

station_name_check$Mar_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Mar_2021_filter$start_station_name)

station_name_check$Apr_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Apr_2021_filter$start_station_name)

station_name_check$May_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% May_2021_filter$start_station_name)

station_name_check$Jun_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Jun_2021_filter$start_station_name)

station_name_check$Jul_2021 <- as.integer(station_name_check$start_station_name 
                                          %in% Jul_2021_filter$start_station_name)

# Add sum column 
station_name_check$count <- rowSums(station_name_check[,3:14])

If we filter the station_name_check data frame by count < 12 it highlights that there are a few stations which have been added and/or removed from Cyclistic’s portfolio between August 2020 and July 2021 as well as help identify which stations were not used in a particular month.

The following two data frames were then created to review which stations were most likely added (check_A) or removed (check_B) from Cyclistic’s portfolio during the analysis period. Two months were used in each filter in order to avoid any anomalies whereby a station was simply not used for the month instead of the station being completely removed or added to Cyclistic’s portfolio.

# Check A 
station_name_check_A <- station_name_check %>%
  filter(
    Aug_2020<1 & Sep_2020<1
    )

# Check B
station_name_check_B <- station_name_check %>%
  filter(
    Jul_2021<1 & Jun_2021<1
    )

The returned data frames showed that there are a few stations which have been added or removed from Cyclistic’s portfolio. For example, station_name_test_A showed that Avenue L & 114th St was only used 5 times and used from the month of July 2021 onwards.

For future analysis purposes, it should be noted that there are a few stations which have most likely been added or removed from Cyclistic’s portfolio during the analysis period.

6. Save the dataset

6.1 Save the cleaned dataset

The cleaned dataset should be saved as a csv.

# Cleaned dataset
fwrite(
  all_trips_cleaned, 
  "C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\03. Analysis\\01-03-03 all_trips_cleaned.csv", 
  col.names = TRUE,
  row.names = FALSE
  )

6.2 Save additional useful datasets

The following datasets should also be saved because they may be useful for future analysis purposes.

# Raw dataset
fwrite(
  all_trips, 
  "C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\03. Analysis\\01-03-04 all_trips_raw.csv",
  col.names = TRUE, 
  row.names = FALSE
  )

# Rideable type check dataset (section 5.1)
fwrite(
  rideable_type_check, 
  "C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\03. Analysis\\01-03-05 rideable_type_check.csv",
  col.names = TRUE, 
  row.names = FALSE
  )

# Station name check dataset (section 5.2)
fwrite(
  station_name_check, 
  "C:\\Users\\izzyl\\Documents\\Portfolio\\01. Cyclistic\\03. Analysis\\01-03-06 station_name_check.csv",
  col.names = TRUE, 
  row.names = FALSE
  )