Example 08: Time Series

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tsibble)   # time series and forecasting tools

Attaching package: 'tsibble'

The following object is masked from 'package:lubridate':

    interval

The following objects are masked from 'package:base':

    intersect, setdiff, union
Code
library(feasts)
Loading required package: fabletools
Code
library(fable)
library(seasonal)

Attaching package: 'seasonal'

The following object is masked from 'package:tibble':

    view

Data

Here’s an example that goes beyond what’s in class. We have hourly data on electricity generation and demand for Duke Energy for the whole of 2019:

Code
# Hourly electricity demand and generation for Duke Energy, calendar 2019
# (8,760 rows = 365 days x 24 hours). here::here() resolves the path
# relative to the project root so the script is portable.
power <- read_csv(here::here("files", "data", "duke_power.csv")) 
Rows: 8760 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl  (4): hour_number, demand_forecast_mw, demand_mw, net_generation_mw
dttm (2): local_time_at_end_of_hour, utc_time_at_end_of_hour
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Print the tibble to inspect column names and types
power 
# A tibble: 8,760 × 7
   date       hour_number local_time_at_end_of_hour utc_time_at_end_of_hour
   <date>           <dbl> <dttm>                    <dttm>                 
 1 2019-01-01           1 2019-01-01 01:00:00       2019-01-01 06:00:00    
 2 2019-01-01           2 2019-01-01 02:00:00       2019-01-01 07:00:00    
 3 2019-01-01           3 2019-01-01 03:00:00       2019-01-01 08:00:00    
 4 2019-01-01           4 2019-01-01 04:00:00       2019-01-01 09:00:00    
 5 2019-01-01           5 2019-01-01 05:00:00       2019-01-01 10:00:00    
 6 2019-01-01           6 2019-01-01 06:00:00       2019-01-01 11:00:00    
 7 2019-01-01           7 2019-01-01 07:00:00       2019-01-01 12:00:00    
 8 2019-01-01           8 2019-01-01 08:00:00       2019-01-01 13:00:00    
 9 2019-01-01           9 2019-01-01 09:00:00       2019-01-01 14:00:00    
10 2019-01-01          10 2019-01-01 10:00:00       2019-01-01 15:00:00    
# ℹ 8,750 more rows
# ℹ 3 more variables: demand_forecast_mw <dbl>, demand_mw <dbl>,
#   net_generation_mw <dbl>

Raw data

If we graph this, the hourly character of the data makes it very hard to see what’s happening if we use a line graph.

Code
# Raw hourly demand as a line graph: far too dense to read at this resolution
ggplot(data = power, mapping = aes(x = date, y = demand_mw)) +
  geom_line()

Daily median demand over the year

One option might be to aggregate into a daily median figure:

Code
# Collapse each day to its median demand, then draw one point per day.
# (The gap in the line marks the missing day, March 10th.)
power |>
  group_by(date) |>
  summarize(med_d = median(demand_mw)) |>
  ggplot(aes(date, med_d)) +
  geom_line()

You can see one day of data is missing, on March 10th.

Daily median/min/max over the year

We could calculate more daily summaries. E.g., let’s use geom_ribbon() to add min and max bounds.

Code
# Daily median plus a min/max ribbon showing the within-day demand range.
# The shared x aesthetic is set once in ggplot(); each layer adds its own y.
power |>
  group_by(date) |>
  summarize(
    med_d = median(demand_mw),
    min_d = min(demand_mw),
    max_d = max(demand_mw)
  ) |>
  ggplot(aes(x = date)) +
  geom_ribbon(aes(ymin = min_d, ymax = max_d),
              color = "gray70", fill = "gray70") +
  geom_line(aes(y = med_d))

Smoothed too much

An ordinary smoother will tend to aggregate away a lot of information, though perhaps it’s still informative?

Code
# A loess smoother over the daily medians: flattens most of the structure,
# leaving only the broad seasonal shape of demand across the year.
power |>
  group_by(date) |>
  summarize(
    med_d = median(demand_mw),
    min_d = min(demand_mw),
    max_d = max(demand_mw)
  ) |>
  ggplot(aes(x = date, y = med_d)) +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

Alternative ways to keep the hourly resolution?

We could use a tile or raster layout:

Code
# Heatmap keeping full hourly resolution: days on x, hour-of-day on y,
# demand mapped to fill color.
ggplot(power, aes(x = date, y = hour_number, fill = demand_mw)) +
  geom_tile() +
  coord_fixed() +
  scale_fill_viridis_c(option = "C")

We could try a line graph with polar coordinates:

Code
# Mean demand by hour of day, one polar "clock" line per month.
power |>
  # Drop hour 25, which exists only on the fall-back DST day
  filter(hour_number %nin% c(25)) |> 
  mutate(month = lubridate::month(date, label = TRUE, abbr = TRUE)) |> 
  group_by(month, hour_number) |> 
  # na.rm = TRUE: without it, any month/hour cell containing a missing
  # observation yields an NA mean and its line segments are silently
  # dropped from the plot ("Removed 23 rows" warning).
  summarize(mean_d = mean(demand_mw, na.rm = TRUE)) |> 
  ggplot(mapping = aes(x = hour_number, y = mean_d, group = month)) + 
  geom_line(mapping = aes(color = month)) + 
  coord_polar() + 
  scale_color_viridis_d(option = "C") +
  scale_x_continuous(breaks = c(1:24), labels = c(1:24))
`summarise()` has grouped output by 'month'. You can override using the
`.groups` argument.
Warning: Removed 23 rows containing missing values (`geom_line()`).

Or facet that instead?

Code
# Same hourly profile as above, but one polar panel per month instead of
# overplotted colored lines.
power |>
  # Drop hour 25, the fall-back DST artifact
  filter(hour_number %nin% c(25)) |>
  mutate(month = lubridate::month(date, label = TRUE, abbr = TRUE)) |>
  group_by(month, hour_number) |>
  summarize(avg_demand = mean(demand_mw, na.rm = TRUE)) |>
  ggplot(aes(x = hour_number, y = avg_demand)) +
  geom_line(linewidth = rel(1.15)) +
  coord_polar() +
  scale_x_continuous(breaks = 1:24, labels = 1:24) +
  facet_wrap(~ month)
`summarise()` has grouped output by 'month'. You can override using the
`.groups` argument.

A Decomposition

Some new data:

Code
# Monthly US macroeconomic indicators, 1967 onward
economics
# A tibble: 574 × 6
   date         pce    pop psavert uempmed unemploy
   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
 1 1967-07-01  507. 198712    12.6     4.5     2944
 2 1967-08-01  510. 198911    12.6     4.7     2945
 3 1967-09-01  516. 199113    11.9     4.6     2958
 4 1967-10-01  512. 199311    12.9     4.9     3143
 5 1967-11-01  517. 199498    12.8     4.7     3066
 6 1967-12-01  525. 199657    11.8     4.8     3018
 7 1968-01-01  531. 199808    11.7     5.1     2878
 8 1968-02-01  534. 199920    12.3     4.5     3001
 9 1968-03-01  544. 200056    11.7     4.1     2877
10 1968-04-01  544  200208    12.3     4.6     2709
# ℹ 564 more rows
Code
## The economics data ships with ggplot2, loaded as part of the tidyverse
economics
# A tibble: 574 × 6
   date         pce    pop psavert uempmed unemploy
   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
 1 1967-07-01  507. 198712    12.6     4.5     2944
 2 1967-08-01  510. 198911    12.6     4.7     2945
 3 1967-09-01  516. 199113    11.9     4.6     2958
 4 1967-10-01  512. 199311    12.9     4.9     3143
 5 1967-11-01  517. 199498    12.8     4.7     3066
 6 1967-12-01  525. 199657    11.8     4.8     3018
 7 1968-01-01  531. 199808    11.7     5.1     2878
 8 1968-02-01  534. 199920    12.3     4.5     3001
 9 1968-03-01  544. 200056    11.7     4.1     2877
10 1968-04-01  544  200208    12.3     4.6     2709
# ℹ 564 more rows
Code
# Unemployed persons as a percentage of total population (a crude rate:
# the denominator is everyone, not the labor force).
economics <- economics |>
  mutate(pct_unemp = 100 * (unemploy / pop))

economics |>
  ggplot(aes(x = date, y = pct_unemp)) +
  geom_line() +
  labs(title = "Crude Unemployment Rate")

Let’s use an STL decomposition. The data are monthly.

We convert it to a “tsibble”, which is what the decomposition function likes.

Code
economics |> 
  # Convert the Date column to a yearmonth BEFORE constructing the tsibble,
  # so the index is a regular monthly sequence. Building the tsibble first
  # (as a daily Date index) and mutating the index afterwards leaves the
  # interval mis-specified, which STL relies on to find the seasonal period.
  mutate(date = yearmonth(date)) |> 
  as_tsibble(index = date) |> 
  model(
    STL(pct_unemp ~ trend(window = 7) +
                    season(window = "periodic"),
        robust = TRUE)) |>
  components() |>
  autoplot()
Using `date` as index variable.

Read the help for feasts::STL to learn more about the window argument.