library(here) # manage file pathslibrary(socviz) # data and some useful things, especially %nin%library(tidyverse) # your friend and minelibrary(scales) # Convenient scale labels## New packages# install.packages("tsibble") # Time series objects# install.packages("feasts") # Time series feature analysis # install.packages("slider") # Moving averages and related methods# remotes::install_github("kjhealy/demog") # Some US demographic datalibrary(tsibble)library(feasts)library(slider)library(demog)
Here the births column means “Average daily births per million population”
Looking at the Series
boom |>ggplot(mapping =aes(x = date, y = births)) +geom_line()
Looking at the Series
boom |>ggplot(mapping =aes(x = date, y = births)) +geom_line() +geom_smooth()
Too much smoothing here
Time Series Decomposition
The analysis of Time Series is a big area; people often want to see into the future
We will focus on a couple of elementary methods that are more purely descriptive, particularly the idea of decomposing a time series into its trend, seasonal, and remainder components.
Decomposition methods are descriptive rather than predictive. They also make assumptions about the character of the data (e.g. its seasonality) which might be something we want to investigate.
More complex forecasting methods are either more detailed, or attempt to be proper models, or both.
This just means e.g. we use half of December of the previous year and half of December of the current year to calculate the centred moving average in June of the current year.
A Centered Moving Average of order 12
We can calculate the CMA for even orders in two steps.
boom_ts <- boom_t |>left_join(boom_s, by ="month") boom_ts
# A tibble: 996 × 5
date births month t s
<date> <dbl> <dbl> <dbl> <dbl>
1 1933-01-01 46.4 1 NA -1.62
2 1933-02-01 47.2 2 NA -0.575
3 1933-03-01 47.2 3 NA -0.909
4 1933-04-01 45.5 4 NA -2.21
5 1933-05-01 44.9 5 NA -1.98
6 1933-06-01 44.9 6 NA -0.330
7 1933-07-01 46.5 7 45.4 1.90
8 1933-08-01 46.7 8 45.4 2.74
9 1933-09-01 44.5 9 45.4 3.40
10 1933-10-01 42.9 10 45.3 0.768
# ℹ 986 more rows
Calculate the Seasonal part
In a “Classical” decomposition the Seasonal part just repeats.
boom_ts |>ggplot(aes(x = date, y = s)) +geom_line()
Calculate the Remainder part
The remainder is just what’s left over from y (i.e., births) after we have calculated t and s.
boom_tsr <- boom_ts |>mutate(r = births - t - s)boom_tsr
# A tibble: 996 × 6
date births month t s r
<date> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1933-01-01 46.4 1 NA -1.62 NA
2 1933-02-01 47.2 2 NA -0.575 NA
3 1933-03-01 47.2 3 NA -0.909 NA
4 1933-04-01 45.5 4 NA -2.21 NA
5 1933-05-01 44.9 5 NA -1.98 NA
6 1933-06-01 44.9 6 NA -0.330 NA
7 1933-07-01 46.5 7 45.4 1.90 -0.863
8 1933-08-01 46.7 8 45.4 2.74 -1.46
9 1933-09-01 44.5 9 45.4 3.40 -4.26
10 1933-10-01 42.9 10 45.3 0.768 -3.11
# ℹ 986 more rows
Decomposition: y = t + s + r
This is an additive decomposition. You can also do multiplicative decompositions.
boom_tsr |>mutate(tsr = t + s + r)
# A tibble: 996 × 7
date births month t s r tsr
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1933-01-01 46.4 1 NA -1.62 NA NA
2 1933-02-01 47.2 2 NA -0.575 NA NA
3 1933-03-01 47.2 3 NA -0.909 NA NA
4 1933-04-01 45.5 4 NA -2.21 NA NA
5 1933-05-01 44.9 5 NA -1.98 NA NA
6 1933-06-01 44.9 6 NA -0.330 NA NA
7 1933-07-01 46.5 7 45.4 1.90 -0.863 46.5
8 1933-08-01 46.7 8 45.4 2.74 -1.46 46.7
9 1933-09-01 44.5 9 45.4 3.40 -4.26 44.5
10 1933-10-01 42.9 10 45.3 0.768 -3.11 42.9
# ℹ 986 more rows
# A tsibble: 996 x 6 [1M]
date births trend seasonal random season_adjust
<mth> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1933 Jan 46.4 NA -1.62 NA 48.0
2 1933 Feb 47.2 NA -0.575 NA 47.8
3 1933 Mar 47.2 NA -0.909 NA 48.1
4 1933 Apr 45.5 NA -2.21 NA 47.7
5 1933 May 44.9 NA -1.98 NA 46.9
6 1933 Jun 44.9 NA -0.330 NA 45.3
7 1933 Jul 46.5 45.4 1.90 -0.863 44.6
8 1933 Aug 46.7 45.4 2.74 -1.46 44.0
9 1933 Sep 44.5 45.4 3.40 -4.26 41.1
10 1933 Oct 42.9 45.3 0.768 -3.11 42.1
# ℹ 986 more rows
bc <- boom |>mutate(date =yearmonth(date)) |>as_tsibble(index ="date") |>model(# Experiment with a six-monthly trend windowSTL(births ~trend(window =7) +season(window =7), robust =TRUE) ) |>components() |>select(-.model)bc
# A tsibble: 996 x 6 [1M]
date births trend season_year remainder season_adjust
<mth> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1933 Jan 46.4 46.4 -0.148 0.0757 46.5
2 1933 Feb 47.2 46.4 0.969 -0.197 46.2
3 1933 Mar 47.2 46.4 0.781 0.0247 46.4
4 1933 Apr 45.5 46.3 -1.17 0.304 46.7
5 1933 May 44.9 46.1 -1.49 0.329 46.4
6 1933 Jun 44.9 45.5 -0.646 0.0632 45.6
7 1933 Jul 46.5 44.8 2.25 -0.632 44.2
8 1933 Aug 46.7 44.6 2.40 -0.278 44.3
9 1933 Sep 44.5 45.0 2.47 -2.98 42.0
10 1933 Oct 42.9 45.6 -0.584 -2.12 43.5
# ℹ 986 more rows
Manually plotting the components
bc |>pivot_longer(cols =c(births, trend, season_year, remainder)) |>mutate(date =as.Date(date),name =factor(name, levels =c("births", "trend", "season_year", "remainder"), labels =c("Births", "Trend", "Seasonal", "Remainder"),ordered =TRUE)) |>ggplot() +geom_line(aes(date, value)) +scale_x_date(breaks =seq(as.Date("1935-01-01"), as.Date("2015-01-01"), by="10 years"),date_labels ="%Y") + ggforce::facet_col(~ name, scales ='free', space ='free') +labs(title ="Decomposition of US Monthly Births, 1933-2017", subtitle ="Average number of daily births per million population each month", x ="Time", y ="Birth rate")
my_labs <- bc_int$seq_idnames(my_labs) <- bc_int$monthind <-names(my_labs) %in%c("Jan", "May", "Sep")my_labs <- my_labs[ind]bc_int |>ggplot(aes(x = seq_id, y = season_year, color = period)) +geom_line(linewidth =rel(1.2)) +scale_x_continuous(breaks = my_labs, labels =names(my_labs)) +facet_wrap(~ period, ncol =1) +guides(color ="none") +labs(x ="Month", y ="Seasonal Component of the Birth Rate", title ="Changing Seasonality in Births: Three Six-Year periods in Three Decades", subtitle ="Seasonal Component from an STL decomposition of 1933-2015 monthly births")