The data set may-counts.tsv contains the relative frequencies of tweets containing selected topic nodes in May 2018 (march-counts.tsv similarly covers March 2018). We read the data using read_tsv() from the readr library.
library(readr)
counts <- read_tsv("may-counts.tsv")
str(counts, max.level = 1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 33 obs. of 46 variables:
## $ abortion : num 0.000346 0.000373 0.000404 0.000286 0.000315 ...
## $ again : num 0.00909 0.00899 0.0088 0.00956 0.00889 ...
## $ amazing : num 0.00631 0.00583 0.00602 0.00632 0.00626 ...
## $ america : num 0.00218 0.00212 0.00193 0.00278 0.00319 ...
## $ awesome : num 0.00335 0.00358 0.00331 0.00302 0.0033 ...
## $ beautiful : num 0.0058 0.00568 0.00687 0.00696 0.00621 ...
## $ day : num 0.0272 0.03 0.0268 0.0253 0.026 ...
## $ economy : num 0.000547 0.000575 0.000649 0.000485 0.000608 ...
## $ energy : num 0.00153 0.00137 0.00127 0.0011 0.00137 ...
## $ friday : num 0.001939 0.005582 0.001653 0.000954 0.00153 ...
## $ fuck : num 0.0113 0.0123 0.0169 0.0171 0.0125 ...
## $ germany : num 0.000328 0.000294 0.000399 0.000402 0.000423 ...
## $ good : num 0.0479 0.0474 0.0472 0.0481 0.0484 ...
## $ great : num 0.016 0.0165 0.0151 0.0151 0.0152 ...
## $ happy : num 0.0108 0.0137 0.0138 0.0121 0.0116 ...
## $ hate : num 0.00665 0.00658 0.00641 0.00691 0.00696 ...
## $ interesting: num 0.00209 0.00169 0.00142 0.00154 0.00163 ...
## $ iphone : num 0.000641 0.000551 0.0006 0.000513 0.000635 ...
## $ iran : num 0.000609 0.000433 0.00076 0.000925 0.001292 ...
## $ japan : num 0.000598 0.000608 0.000755 0.000649 0.000598 ...
## $ jcpoa : num 7.29e-06 9.58e-06 1.03e-05 7.73e-06 1.97e-05 ...
## $ know : num 0.0322 0.0311 0.0297 0.0294 0.0315 ...
## $ love : num 0.0365 0.037 0.0376 0.0392 0.0383 ...
## $ make : num 0.0331 0.0324 0.031 0.0312 0.0319 ...
## $ monday : num 0.000915 0.001156 0.001162 0.00107 0.004342 ...
## $ moon : num 0.000834 0.000871 0.000868 0.000719 0.00075 ...
## $ more : num 0.0376 0.0339 0.0319 0.0324 0.0349 ...
## $ need : num 0.0261 0.0238 0.0228 0.0234 0.0244 ...
## $ new : num 0.0258 0.0258 0.0225 0.0213 0.0247 ...
## $ people : num 0.0268 0.0261 0.0266 0.0257 0.0271 ...
## $ porn : num 0.00372 0.00452 0.01728 0.01577 0.00431 ...
## $ sad : num 0.00351 0.00343 0.00351 0.00354 0.0036 ...
## $ saturday : num 0.00195 0.00157 0.00331 0.0014 0.00137 ...
## $ skripal : num 7.29e-06 2.15e-05 2.58e-06 1.80e-05 4.92e-06 ...
## $ sunday : num 0.00127 0.00149 0.00141 0.00513 0.00161 ...
## $ think : num 0.0273 0.0254 0.0248 0.0254 0.0269 ...
## $ thursday : num 0.002715 0.000773 0.000425 0.00043 0.000812 ...
## $ time : num 0.0292 0.0289 0.0294 0.0284 0.0289 ...
## $ trump : num 0.00938 0.00784 0.00658 0.00636 0.00708 ...
## $ tuesday : num 0.000649 0.000694 0.0005 0.000686 0.00092 ...
## $ usa : num 0.000907 0.000898 0.000961 0.000961 0.000903 ...
## $ wanna : num 0.00568 0.00583 0.00565 0.00594 0.00609 ...
## $ want : num 0.0246 0.0234 0.0246 0.0238 0.0248 ...
## $ wednesday : num 0.000623 0.000496 0.000322 0.00042 0.00093 ...
## $ win : num 0.00939 0.00975 0.01035 0.01097 0.0093 ...
## $ ymd : Date, format: "2018-05-03" "2018-05-04" ...
## - attr(*, "spec")=
## .. cols(
## .. abortion = col_double(),
## .. again = col_double(),
## .. amazing = col_double(),
## .. america = col_double(),
## .. awesome = col_double(),
## .. beautiful = col_double(),
## .. day = col_double(),
## .. economy = col_double(),
## .. energy = col_double(),
## .. friday = col_double(),
## .. fuck = col_double(),
## .. germany = col_double(),
## .. good = col_double(),
## .. great = col_double(),
## .. happy = col_double(),
## .. hate = col_double(),
## .. interesting = col_double(),
## .. iphone = col_double(),
## .. iran = col_double(),
## .. japan = col_double(),
## .. jcpoa = col_double(),
## .. know = col_double(),
## .. love = col_double(),
## .. make = col_double(),
## .. monday = col_double(),
## .. moon = col_double(),
## .. more = col_double(),
## .. need = col_double(),
## .. new = col_double(),
## .. people = col_double(),
## .. porn = col_double(),
## .. sad = col_double(),
## .. saturday = col_double(),
## .. skripal = col_double(),
## .. sunday = col_double(),
## .. think = col_double(),
## .. thursday = col_double(),
## .. time = col_double(),
## .. trump = col_double(),
## .. tuesday = col_double(),
## .. usa = col_double(),
## .. wanna = col_double(),
## .. want = col_double(),
## .. wednesday = col_double(),
## .. win = col_double(),
## .. ymd = col_date(format = "")
## .. )
The ymd column contains the date of each observation.
counts$ymd
## [1] "2018-05-03" "2018-05-04" "2018-05-05" "2018-05-06" "2018-05-07"
## [6] "2018-05-08" "2018-05-09" "2018-05-10" "2018-05-11" "2018-05-12"
## [11] "2018-05-13" "2018-05-14" "2018-05-15" "2018-05-16" "2018-05-17"
## [16] "2018-05-18" "2018-05-19" "2018-05-20" "2018-05-21" "2018-05-22"
## [21] "2018-05-23" "2018-05-24" "2018-05-25" "2018-05-26" "2018-05-27"
## [26] "2018-05-28" "2018-05-29" "2018-05-30" "2018-05-31" "2018-06-01"
## [31] "2018-06-02" "2018-06-03" "2018-06-04"
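Since ymd is stored as a Date, base R date functions work on it directly. As a small optional check (a sketch, not part of the original analysis), we can look at the weekdays the observations fall on; this will be useful later when we interpret weekly patterns:
table(weekdays(counts$ymd)) # Counts per weekday (names are locale-dependent)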
If we want to visualize the data, we have to convert it into a ‘long’ format. Here we use the melt() function from the reshape2 library:
head(counts) # Current format
## # A tibble: 6 x 46
## abortion again amazing america awesome beautiful day economy energy
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000346 0.00909 0.00631 0.00218 0.00335 0.00580 0.0272 5.47e-4 0.00153
## 2 0.000373 0.00899 0.00583 0.00212 0.00358 0.00568 0.0300 5.75e-4 0.00137
## 3 0.000404 0.00880 0.00602 0.00193 0.00331 0.00687 0.0268 6.49e-4 0.00127
## 4 0.000286 0.00956 0.00632 0.00278 0.00302 0.00696 0.0253 4.85e-4 0.00110
## 5 0.000315 0.00889 0.00626 0.00319 0.00330 0.00621 0.0260 6.08e-4 0.00137
## 6 0.000377 0.00901 0.00608 0.00305 0.00329 0.00625 0.0256 6.26e-4 0.00137
## # … with 37 more variables: friday <dbl>, fuck <dbl>, germany <dbl>,
## # good <dbl>, great <dbl>, happy <dbl>, hate <dbl>, interesting <dbl>,
## # iphone <dbl>, iran <dbl>, japan <dbl>, jcpoa <dbl>, know <dbl>,
## # love <dbl>, make <dbl>, monday <dbl>, moon <dbl>, more <dbl>,
## # need <dbl>, new <dbl>, people <dbl>, porn <dbl>, sad <dbl>,
## # saturday <dbl>, skripal <dbl>, sunday <dbl>, think <dbl>,
## # thursday <dbl>, time <dbl>, trump <dbl>, tuesday <dbl>, usa <dbl>,
## # wanna <dbl>, want <dbl>, wednesday <dbl>, win <dbl>, ymd <date>
library(reshape2)
df.long <- melt(counts, 'ymd')
head(df.long) # New 'long' format
## ymd variable value
## 1 2018-05-03 abortion 0.0003461746
## 2 2018-05-04 abortion 0.0003734353
## 3 2018-05-05 abortion 0.0004043463
## 4 2018-05-06 abortion 0.0002860766
## 5 2018-05-07 abortion 0.0003148971
## 6 2018-05-08 abortion 0.0003768004
names(df.long) # Current variable names
## [1] "ymd" "variable" "value"
names(df.long) <- c("ymd", "lemma", "freq") # New variable names
head(df.long) # Check the manipulation
## ymd lemma freq
## 1 2018-05-03 abortion 0.0003461746
## 2 2018-05-04 abortion 0.0003734353
## 3 2018-05-05 abortion 0.0004043463
## 4 2018-05-06 abortion 0.0002860766
## 5 2018-05-07 abortion 0.0003148971
## 6 2018-05-08 abortion 0.0003768004
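As an aside (a sketch, not used in the rest of this section), the same reshaping can be done with pivot_longer() from the tidyr package; note that lemma then comes back as a character vector rather than a factor:
library(tidyr)
df.long.alt <- pivot_longer(counts, -ymd, names_to = "lemma", values_to = "freq")
head(df.long.alt) # Same information as df.long, different column types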
Let’s plot the time series for the lemma “thursday”:
library(ggplot2)
table(df.long$lemma) # All the lemmas for which we have data
##
## abortion again amazing america awesome beautiful
## 33 33 33 33 33 33
## day economy energy friday fuck germany
## 33 33 33 33 33 33
## good great happy hate interesting iphone
## 33 33 33 33 33 33
## iran japan jcpoa know love make
## 33 33 33 33 33 33
## monday moon more need new people
## 33 33 33 33 33 33
## porn sad saturday skripal sunday think
## 33 33 33 33 33 33
## thursday time trump tuesday usa wanna
## 33 33 33 33 33 33
## want wednesday win
## 33 33 33
df.plot <- subset(df.long, lemma=="thursday") # Selecting only Thursday
table(df.plot$lemma) # Check
##
## abortion again amazing america awesome beautiful
## 0 0 0 0 0 0
## day economy energy friday fuck germany
## 0 0 0 0 0 0
## good great happy hate interesting iphone
## 0 0 0 0 0 0
## iran japan jcpoa know love make
## 0 0 0 0 0 0
## monday moon more need new people
## 0 0 0 0 0 0
## porn sad saturday skripal sunday think
## 0 0 0 0 0 0
## thursday time trump tuesday usa wanna
## 33 0 0 0 0 0
## want wednesday win
## 0 0 0
table(factor(df.plot$lemma)) # Drop unused factor levels for clarity
##
## thursday
## 33
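An equivalent way to drop the unused levels is droplevels() from base R:
table(droplevels(df.plot$lemma)) # Same result as table(factor(...)) above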
ggplot(df.plot, aes(x=ymd, y=freq)) + geom_line() +
scale_x_date() + xlab("") + ylab("") + facet_wrap(~lemma, scales="free") +
geom_vline(aes(xintercept=as.Date("2018-05-03"))) +
geom_vline(aes(xintercept=as.Date("2018-05-10"))) +
geom_vline(aes(xintercept=as.Date("2018-05-17"))) +
geom_vline(aes(xintercept=as.Date("2018-05-24"))) +
geom_vline(aes(xintercept=as.Date("2018-05-31")))
The explanation for this time series seems straightforward: there are peaks on Thursdays, indicated by the vertical lines.
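As a quick sanity check (a small sketch, not part of the original analysis), we can average the frequency of the lemma “thursday” by weekday of the observation; the mean should be clearly highest on Thursdays:
thu <- subset(df.long, lemma == "thursday")
aggregate(freq ~ weekdays(ymd), data = thu, FUN = mean) # Mean frequency per weekday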
We can also plot several time series at once, using the facet_wrap() function.
df.plot <- subset(df.long,
lemma %in% c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "day"))
ggplot(df.plot, aes(x=ymd, y=freq)) + geom_line() +
scale_x_date() + xlab("") + ylab("") + facet_wrap(~lemma, scales="free") # Please be patient, this may take a little while!
The tseries package provides functionality for estimating ARMA models. Let’s have a look at the mentions of “tuesday”:
library(tseries)
x <- subset(df.long, lemma=="tuesday")
ggplot(x, aes(x=ymd, y=freq)) + geom_line() + scale_x_date() + xlab("") + ylab("")
First we have to check whether there is a trend in the time series, since most methods rely on a detrended series.
(reg <- lm(x$freq ~ x$ymd))
##
## Call:
## lm(formula = x$freq ~ x$ymd)
##
## Coefficients:
## (Intercept) x$ymd
## 5.383e-02 -2.998e-06
y <- residuals(reg)
plot(y)
It looks like there is no long-term trend.
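As an additional, optional check (a sketch), the augmented Dickey-Fuller test from the already loaded tseries package can be applied to the detrended series; a small p-value speaks against a unit root:
adf.test(y) # Augmented Dickey-Fuller test on the detrended series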
What about seasonality? For daily data, a weekly seasonal pattern is usually the most appropriate choice. It is easiest to spot in the autocorrelation function:
acf(x$freq)
We can remove the weekly cycle by differencing at lag 7, i.e., subtracting from each value the one observed seven days earlier:
z <- diff(x$freq, 7)
acf(z, lag.max=20)
After applying diff(), the autocorrelation function (acf()) still shows some correlation, but far less than before.
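To make explicit what diff(x$freq, 7) computes, here is a small sketch verifying that it equals the series minus its value seven observations earlier:
n <- length(x$freq)
all.equal(diff(x$freq, 7), x$freq[8:n] - x$freq[1:(n - 7)]) # Should be TRUE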
Another useful tool is the partial autocorrelation function (pacf()). Part of the correlation between \(X_{t_0}\) and \(X_{t_1}\), \(t_1\neq t_0\), is mediated by the other values \(X_t\) with \(t_0\neq t \neq t_1\). This is also why the ACF never drops all the way to zero, even for AR(1) processes. The partial autocorrelation function, in contrast, measures the direct dependence between \(X_{t_0}\) and \(X_{t_1}\), controlling for the values in between:
pacf(z, lag.max=20)
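To illustrate the contrast between ACF and PACF described above, here is a small simulated sketch (not based on our data): for an AR(1) process the ACF decays gradually, while the PACF essentially cuts off after lag 1.
set.seed(1)
sim <- arima.sim(model = list(ar = 0.7), n = 500) # Simulated AR(1) with coefficient 0.7
acf(sim)  # Decays gradually
pacf(sim) # Essentially a single spike at lag 1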
Since the PACF does not point to an autoregressive component, while the ACF still showed a little correlation at short lags, we can try to fit an MA(1) model:
(model <- arima(z, order=c(0,0,1)))
##
## Call:
## arima(x = z, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.3528 0e+00
## s.e. 0.1682 2e-04
##
## sigma^2 estimated as 1.56e-08: log likelihood = 196.73, aic = -387.47
res <- residuals(model)
acf(res, lag=100)
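A common complementary diagnostic (a sketch) is the Ljung-Box test for remaining autocorrelation in the residuals; a large p-value is consistent with white-noise residuals:
Box.test(res, lag = 10, type = "Ljung-Box")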
If we want to plot it, we need to transform our data into an object that R recognizes as a time series:
x.ts <- ts(x$freq, start=as.Date("2018-05-03"))
We can directly fit the seasonal model to our data:
model <- arima(x.ts, order=c(1,0,0),
seasonal = list(order = c(0, 1, 0), period=7))
pred <- predict(model, n.ahead = 20)
ts.plot(x.ts, pred$pred, lty = c(1,3))
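predict() also returns standard errors, so we can sketch approximate 95% prediction bands around the forecast (assuming roughly normal errors):
ts.plot(x.ts, pred$pred,
pred$pred + 2 * pred$se, pred$pred - 2 * pred$se,
lty = c(1, 3, 2, 2)) # Observed series, forecast, upper and lower band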
In this section, we combine our knowledge to detect anomalies in time series data. We use the anomalize package (and some tidyverse magic).
library(anomalize)
library(tidyverse)
# prepare data
head(df.long); tail(df.long) # Current data format
## ymd lemma freq
## 1 2018-05-03 abortion 0.0003461746
## 2 2018-05-04 abortion 0.0003734353
## 3 2018-05-05 abortion 0.0004043463
## 4 2018-05-06 abortion 0.0002860766
## 5 2018-05-07 abortion 0.0003148971
## 6 2018-05-08 abortion 0.0003768004
## ymd lemma freq
## 1480 2018-05-30 win 0.007796550
## 1481 2018-05-31 win 0.008566546
## 1482 2018-06-01 win 0.009469877
## 1483 2018-06-02 win 0.008678828
## 1484 2018-06-03 win 0.008993954
## 1485 2018-06-04 win 0.008359044
df.sorted <- df.long %>% group_by(lemma) %>% arrange(ymd) # tidyverse magic
head(df.sorted); tail(df.sorted) # New data format
## # A tibble: 6 x 3
## # Groups: lemma [6]
## ymd lemma freq
## <date> <fct> <dbl>
## 1 2018-05-03 abortion 0.000346
## 2 2018-05-03 again 0.00909
## 3 2018-05-03 amazing 0.00631
## 4 2018-05-03 america 0.00218
## 5 2018-05-03 awesome 0.00335
## 6 2018-05-03 beautiful 0.00580
## # A tibble: 6 x 3
## # Groups: lemma [6]
## ymd lemma freq
## <date> <fct> <dbl>
## 1 2018-06-04 tuesday 0.000751
## 2 2018-06-04 usa 0.00106
## 3 2018-06-04 wanna 0.00625
## 4 2018-06-04 want 0.0250
## 5 2018-06-04 wednesday 0.000982
## 6 2018-06-04 win 0.00836
We now decompose our time series into seasonal, trend, and remainder components:
df.decomposed <- df.sorted %>% time_decompose(freq, merge=TRUE)
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
glimpse(df.decomposed)
## Observations: 1,485
## Variables: 7
## Groups: lemma [45]
## $ lemma <fct> abortion, abortion, abortion, abortion, abortion, abor…
## $ ymd <date> 2018-05-03, 2018-05-04, 2018-05-05, 2018-05-06, 2018-…
## $ freq <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.0002860766…
## $ observed <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.0002860766…
## $ season <dbl> -1.087017e-05, -2.128977e-05, 3.742109e-06, -3.797578e…
## $ trend <dbl> 0.0003297701, 0.0003338525, 0.0003379348, 0.0003420172…
## $ remainder <dbl> 2.727461e-05, 6.087262e-05, 6.266939e-05, -1.796483e-0…
Our new data set contains the original columns plus the decomposition components observed, season, trend, and remainder; using merge=TRUE in time_decompose() ensured that the old data is still there.
Some notes on the default settings used here: since we supplied daily data as input, the default seasonal frequency is one week, and the smoothing used for trend detection spans three months by default.
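If we want to be explicit about these settings (or change them), they can be passed directly to time_decompose(); the following sketch reproduces the defaults stated above:
df.decomposed <- df.sorted %>%
time_decompose(freq, frequency = "1 week", trend = "3 months", merge = TRUE) # Explicit spans, same as the defaults for daily data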
The remainder is just the difference between observed and explained (season and trend). This is what we will use to search for outliers:
df.anomalized <- df.decomposed %>% anomalize(remainder)
The anomalize() function applies anomaly detection methods to the remainder component. It adds the following to the data:
glimpse(df.anomalized)
## Observations: 1,485
## Variables: 10
## Groups: lemma [45]
## $ lemma <fct> abortion, abortion, abortion, abortion, abortion, a…
## $ ymd <date> 2018-05-03, 2018-05-04, 2018-05-05, 2018-05-06, 20…
## $ freq <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.0002860…
## $ observed <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.0002860…
## $ season <dbl> -1.087017e-05, -2.128977e-05, 3.742109e-06, -3.7975…
## $ trend <dbl> 0.0003297701, 0.0003338525, 0.0003379348, 0.0003420…
## $ remainder <dbl> 2.727461e-05, 6.087262e-05, 6.266939e-05, -1.796483…
## $ remainder_l1 <dbl> -0.0005445342, -0.0005445342, -0.0005445342, -0.000…
## $ remainder_l2 <dbl> 0.000596838, 0.000596838, 0.000596838, 0.000596838,…
## $ anomaly <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No…
round(prop.table(table(df.anomalized$anomaly)), 2)
##
## No Yes
## 0.96 0.04
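The sensitivity of the detection can be adjusted. As a sketch, anomalize() accepts a method ("iqr" or "gesd") and an alpha parameter controlling the width of the normal range; a smaller alpha widens the band and typically flags fewer anomalies:
df.strict <- df.decomposed %>% anomalize(remainder, method = "iqr", alpha = 0.025)
round(prop.table(table(df.strict$anomaly)), 2) # Typically fewer anomalies than with the default alpha = 0.05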
We can now recompose the lower and upper bounds of the expected values on the scale of the observed data via time_recompose(), which makes it easy to separate anomalies from non-anomalies:
glimpse(df.anomalized %>% time_recompose())
## Observations: 1,485
## Variables: 12
## Groups: lemma [45]
## $ lemma <fct> abortion, abortion, abortion, abortion, abortion, …
## $ ymd <date> 2018-05-03, 2018-05-04, 2018-05-05, 2018-05-06, 2…
## $ freq <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.000286…
## $ observed <dbl> 0.0003461746, 0.0003734353, 0.0004043463, 0.000286…
## $ season <dbl> -1.087017e-05, -2.128977e-05, 3.742109e-06, -3.797…
## $ trend <dbl> 0.0003297701, 0.0003338525, 0.0003379348, 0.000342…
## $ remainder <dbl> 2.727461e-05, 6.087262e-05, 6.266939e-05, -1.79648…
## $ remainder_l1 <dbl> -0.0005445342, -0.0005445342, -0.0005445342, -0.00…
## $ remainder_l2 <dbl> 0.000596838, 0.000596838, 0.000596838, 0.000596838…
## $ anomaly <chr> "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ recomposed_l1 <dbl> -2.256342e-04, -2.319714e-04, -2.028572e-04, -2.40…
## $ recomposed_l2 <dbl> 0.0009157380, 0.0009094007, 0.0009385150, 0.000900…
This adds two further variables to our data set: recomposed_l1 and recomposed_l2, the lower and upper bounds of the expected range on the scale of the observed values.
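With these bounds in place, the rows flagged as anomalous can be extracted directly, for example with dplyr’s filter() (loaded via tidyverse):
df.anomalized %>% time_recompose() %>%
filter(anomaly == "Yes") %>% head() # First few observations flagged as anomalies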
We can also visualize the decomposition, though this only makes sense for one group at a time:
df.anomalized %>% subset(lemma=="abortion") %>% ungroup() %>%
plot_anomaly_decomposition() +
ggtitle("anomaly decomposition")
The whole time series, together with a visualization of the anomaly detection, can be plotted via plot_anomalies():
df.sorted %>% subset(lemma %in% c(
"iran", "trump", "thursday", "abortion", "amazing", "interesting"
)) %>% time_decompose(freq) %>%
anomalize(remainder) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)