NYC flights - data wrangling

Author

SOLUTIONS

Published

October 30, 2025

library(tidyverse)
library(nycflights13)
flights <- nycflights13::flights

To demonstrate data wrangling we will use a dataset of characteristics of all flights departing from New York City (JFK, LGA, EWR) in 2013. We will use the flights data frame that comes from the nycflights13 package.

We first explore the data a bit. Examine the documentation as well.

glimpse(flights)

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

Exercises

Practice using dplyr verbs to perform each of the following tasks. Unless explicitly stated, you should not save the output into an object.

`select()`

Select only the departure delay and arrival delay columns. Check: dimensions should be 336776 x 2

flights |> 
  select(dep_delay, arr_delay)

# A tibble: 336,776 × 2
   dep_delay arr_delay
       <dbl>     <dbl>
 1         2        11
 2         4        20
 3         2        33
 4        -1       -18
 5        -6       -25
 6        -4        12
 7        -5        19
 8        -3       -14
 9        -3        -8
10        -2         8
# ℹ 336,766 more rows

#accomplishes the same thing
select(flights, dep_delay, arr_delay)

# A tibble: 336,776 × 2
   dep_delay arr_delay
       <dbl>     <dbl>
 1         2        11
 2         4        20
 3         2        33
 4        -1       -18
 5        -6       -25
 6        -4        12
 7        -5        19
 8        -3       -14
 9        -3        -8
10        -2         8
# ℹ 336,766 more rows

Select all variables in flights, excluding departure delays. Check: dimensions should be 336776 x 18

flights |> 
  select(-dep_delay)

# A tibble: 336,776 × 18
    year month   day dep_time sched_dep_time arr_time sched_arr_time arr_delay
   <int> <int> <int>    <int>          <int>    <int>          <int>     <dbl>
 1  2013     1     1      517            515      830            819        11
 2  2013     1     1      533            529      850            830        20
 3  2013     1     1      542            540      923            850        33
 4  2013     1     1      544            545     1004           1022       -18
 5  2013     1     1      554            600      812            837       -25
 6  2013     1     1      554            558      740            728        12
 7  2013     1     1      555            600      913            854        19
 8  2013     1     1      557            600      709            723       -14
 9  2013     1     1      557            600      838            846        -8
10  2013     1     1      558            600      753            745         8
# ℹ 336,766 more rows
# ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>

Select just the first three variables in flights . Check: dimensions should be 336776 x 3

#all 3 accomplish the same thing
flights |> select(1:3)

# A tibble: 336,776 × 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# ℹ 336,766 more rows

flights |> select(year:day)

# A tibble: 336,776 × 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# ℹ 336,766 more rows

flights |> select(year, month, day)

# A tibble: 336,776 × 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# ℹ 336,766 more rows

`arrange()`

Arrange the flights data by descending departure delays, with large departure delays on top. Check: top row have dep_delay of 1301

flights |> 
  arrange(desc(dep_delay))

# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     9      641            900      1301     1242           1530
 2  2013     6    15     1432           1935      1137     1607           2120
 3  2013     1    10     1121           1635      1126     1239           1810
 4  2013     9    20     1139           1845      1014     1457           2210
 5  2013     7    22      845           1600      1005     1044           1815
 6  2013     4    10     1100           1900       960     1342           2211
 7  2013     3    17     2321            810       911      135           1020
 8  2013     6    27      959           1900       899     1236           2226
 9  2013     7    22     2257            759       898      121           1026
10  2013    12     5      756           1700       896     1058           2020
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

`slice()`

Select (slice) the first 5 rows of flights . Check: dimensions should be 5 x 19.

flights |> 
  slice(1:5)

# A tibble: 5 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013     1     1      517            515         2      830            819
2  2013     1     1      533            529         4      850            830
3  2013     1     1      542            540         2      923            850
4  2013     1     1      544            545        -1     1004           1022
5  2013     1     1      554            600        -6      812            837
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Combine arrange and slice to select the 5 flights with the longest departure delays. Check: dimensions should be 5 x 19, 5th row should have dep_delay of 1005.

flights |> 
  arrange(desc(dep_delay)) |> 
  slice(1:5)

# A tibble: 5 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013     1     9      641            900      1301     1242           1530
2  2013     6    15     1432           1935      1137     1607           2120
3  2013     1    10     1121           1635      1126     1239           1810
4  2013     9    20     1139           1845      1014     1457           2210
5  2013     7    22      845           1600      1005     1044           1815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

`filter()`

Select all rows where the destination airport is “LAX”. Check: dimensions should be 16174 x 19

flights |> 
  filter(dest == "LAX")

# A tibble: 16,174 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      558            600        -2      924            917
 2  2013     1     1      628            630        -2     1016            947
 3  2013     1     1      658            700        -2     1027           1025
 4  2013     1     1      702            700         2     1058           1014
 5  2013     1     1      743            730        13     1107           1100
 6  2013     1     1      828            823         5     1150           1143
 7  2013     1     1      829            830        -1     1152           1200
 8  2013     1     1      856            900        -4     1226           1220
 9  2013     1     1      859            900        -1     1223           1225
10  2013     1     1      921            900        21     1237           1227
# ℹ 16,164 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Select all rows where the destination airport is “LAX” and the arrival delay is less than 0. Check: dimensions should be 9790 x 19

flights |> 
  filter(dest == "LAX", arr_delay < 0)

# A tibble: 9,790 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      829            830        -1     1152           1200
 2  2013     1     1      859            900        -1     1223           1225
 3  2013     1     1     1133           1130         3     1448           1450
 4  2013     1     1     1153           1200        -7     1450           1529
 5  2013     1     1     1320           1320         0     1625           1636
 6  2013     1     1     1327           1330        -3     1638           1655
 7  2013     1     1     1454           1500        -6     1815           1837
 8  2013     1     1     1539           1545        -6     1853           1910
 9  2013     1     1     1655           1655         0     2025           2030
10  2013     1     1     1750           1750         0     2109           2115
# ℹ 9,780 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Select all rows where the destination airport is “LAX”, “BUR”, or “LGB” AND the departure or the arrival delay is negative. Check: dimensions should be 12492 x 19

flights |> 
  filter(dest %in% c("LAX", "BUR", "LGB"),
         (arr_delay < 0 | dep_delay < 0))

# A tibble: 12,492 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      558            600        -2      924            917
 2  2013     1     1      628            630        -2     1016            947
 3  2013     1     1      658            700        -2     1027           1025
 4  2013     1     1      829            830        -1     1152           1200
 5  2013     1     1      856            900        -4     1226           1220
 6  2013     1     1      859            900        -1     1223           1225
 7  2013     1     1      941            945        -4     1300           1258
 8  2013     1     1     1026           1030        -4     1351           1340
 9  2013     1     1     1127           1130        -3     1504           1448
10  2013     1     1     1133           1130         3     1448           1450
# ℹ 12,482 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

`mutate()`

air_time is provided in minutes. Create a new variable called hours that coverts air_time to hours. Make sure to save this new variable by overwriting the flights dataframe. Check: dimensions of flights should now be 336776 x 20. First value of hours should be 3.7833333.

flights <- flights |> 
  mutate(air_time_hours = air_time/60)

Create a new variable called mph that computes the miles traveled (distance) per hour. Make sure to save this variable. Check: dimensions of flights should now be 336776 x 21. First value of mph should be 370.0441

flights <- flights |> 
  mutate(mph = distance/air_time_hours)

Create a new variable called on_time that is TRUE if the flight is on time (or early) and FALSE if it is not. Make sure to save this variable. Check: dimensions of flights should now be 336776 x 22. First value of on_time should be FALSE.

#both pipelines do the same thing
flights <- flights |> 
  mutate(on_time = arr_delay <= 0) 

flights <- flights |> 
  mutate(on_time = ifelse(arr_delay <= 0, TRUE, FALSE))

`summarise()` and `group_by()`

Calculate the average departure delay. Check: answer should be 12.6

flights |> 
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE))

# A tibble: 1 × 1
  avg_dep_delay
          <dbl>
1          12.6

#also calculates mean but returns single value, not data frame
#can't extend as easily to calculating by group
mean(flights$dep_delay, na.rm = TRUE)

[1] 12.63907

Calculate the average departure delay for each month. Check: dimensions should be 12 x 2

flights |> 
  group_by(month) |> 
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE))

# A tibble: 12 × 2
   month avg_dep_delay
   <int>         <dbl>
 1     1         10.0 
 2     2         10.8 
 3     3         13.2 
 4     4         13.9 
 5     5         13.0 
 6     6         20.8 
 7     7         21.7 
 8     8         12.6 
 9     9          6.72
10    10          6.24
11    11          5.44
12    12         16.6

Count the number of on-time flights for each month. Check: count should be 15248 in January.

flights |> 
  group_by(month) |> 
  count(on_time) |> 
  filter(on_time)

# A tibble: 12 × 3
# Groups:   month [12]
   month on_time     n
   <int> <lgl>   <int>
 1     1 TRUE    15248
 2     2 TRUE    13511
 3     3 TRUE    16983
 4     4 TRUE    15042
 5     5 TRUE    17939
 6     6 TRUE    14585
 7     7 TRUE    14989
 8     8 TRUE    17127
 9     9 TRUE    20165
10    10 TRUE    18795
11    11 TRUE    17332
12    12 TRUE    12626

Compute the proportion of flights that are on-time for each month. Check: proportion should be 0.565 for January.

flights |> 
  group_by(month) |> 
  count(on_time) |> 
  mutate(prop = n/sum(n)) |> 
  filter(on_time)

# A tibble: 12 × 4
# Groups:   month [12]
   month on_time     n  prop
   <int> <lgl>   <int> <dbl>
 1     1 TRUE    15248 0.565
 2     2 TRUE    13511 0.542
 3     3 TRUE    16983 0.589
 4     4 TRUE    15042 0.531
 5     5 TRUE    17939 0.623
 6     6 TRUE    14585 0.516
 7     7 TRUE    14989 0.509
 8     8 TRUE    17127 0.584
 9     9 TRUE    20165 0.731
10    10 TRUE    18795 0.651
11    11 TRUE    17332 0.636
12    12 TRUE    12626 0.449

In addition to the proportion on time for each month, also find the maximum delay, median delay, and the count of flights in each month. Check: dimensions should be 12 x 5.

flights |> 
  group_by(month, on_time) |> 
  summarize(n = n(),
            max_delay = max(dep_delay),
            med_delay = median(dep_delay)) |> 
  mutate(total_flights = sum(n),
        prop_on_time = n/total_flights) |> 
  filter(on_time) |> 
  select(-on_time, -n)

`summarise()` has grouped output by 'month'. You can override using the
`.groups` argument.

# A tibble: 12 × 5
# Groups:   month [12]
   month max_delay med_delay total_flights prop_on_time
   <int>     <dbl>     <dbl>         <int>        <dbl>
 1     1        42        -4         27004        0.565
 2     2        60        -4         24951        0.542
 3     3        51        -3         28834        0.589
 4     4        61        -4         28330        0.531
 5     5        61        -3         28796        0.623
 6     6        53        -3         28243        0.516
 7     7        63        -3         29425        0.509
 8     8        53        -3         29327        0.584
 9     9        45        -4         27574        0.731
10    10        52        -4         28889        0.651
11    11        44        -4         27268        0.636
12    12        52        -3         28135        0.449

What is the proportion on time for flights traveling to “LAX”, “BUR”, and “LGB”? Check: Proportion on time for Burbank is 0.55.

flights |> 
  filter(dest %in% c("LAX", "BUR", "LGB")) |> 
  group_by(dest) |> 
  count(on_time) |> 
  mutate(prop = n/sum(n)) |> 
  filter(on_time)

# A tibble: 3 × 4
# Groups:   dest [3]
  dest  on_time     n  prop
  <chr> <lgl>   <int> <dbl>
1 BUR   TRUE      204 0.550
2 LAX   TRUE    10059 0.622
3 LGB   TRUE      422 0.632

More (optional) practice

Create a new dataset that only contains flights that do not have a missing departure time. Include the columns year,month, day, dep_time, dep_delay, and dep_delay_hours (the departure delay in hours). Note you may need to use mutate() to make one or more of these variables. Check: dimensions should be 328521 x 6.

flights |> 
  filter(!is.na(dep_time)) |> 
  select(year, month, day, dep_time, dep_delay) |> 
  mutate(dep_delay_hours = dep_delay/60)

# A tibble: 328,521 × 6
    year month   day dep_time dep_delay dep_delay_hours
   <int> <int> <int>    <int>     <dbl>           <dbl>
 1  2013     1     1      517         2          0.0333
 2  2013     1     1      533         4          0.0667
 3  2013     1     1      542         2          0.0333
 4  2013     1     1      544        -1         -0.0167
 5  2013     1     1      554        -6         -0.1   
 6  2013     1     1      554        -4         -0.0667
 7  2013     1     1      555        -5         -0.0833
 8  2013     1     1      557        -3         -0.05  
 9  2013     1     1      557        -3         -0.05  
10  2013     1     1      558        -2         -0.0333
# ℹ 328,511 more rows

For each airplane (uniquely identified by tailnum), use a group_by() paired with summarize() to find the sample size, mean, and standard deviation of flight distances. Then, pick off the top 5 and bottom 5 airplanes in terms of mean distance traveled per flight. Check: dimensions should be 10 x 4. Largest mean is 173, smallest is 4983.

flights |> 
  group_by(tailnum) |> 
  summarize(n = n(),
            avg_dist = mean(distance),
            sd_dist = sd(distance)
            ) |> 
  arrange(avg_dist) |> 
  slice(1:5, (n() - 4):n())

# A tibble: 10 × 4
   tailnum     n avg_dist sd_dist
   <chr>   <int>    <dbl>   <dbl>
 1 N955UW    225     173.    32.9
 2 N948UW    232     174.    32.7
 3 N959UW    213     174.    34.3
 4 N956UW    222     174.    31.4
 5 N945UW    285     176.    31.2
 6 N390HA     20    4983      0  
 7 N391HA     21    4983      0  
 8 N392HA     13    4983      0  
 9 N393HA     10    4983      0  
10 N395HA      7    4983      0

Exercises

select()

arrange()

slice()

filter()

mutate()

summarise() and group_by()