<- readr::read_csv("data/lc_kepler.csv") lc_df
Activity46
Tabby’s Star: An Astronomical Mystery
Tabby’s Star (KIC 8462852) is an F-type main-sequence star located approximately 1,480 light-years from Earth. It gained scientific attention due to its unusual light curve patterns:
- Irregular, aperiodic dips in brightness of up to 22% (compared to typical <1% for planetary transits)
- Dimming events lasting between 5-80 days
- No significant infrared excess that would indicate a circumstellar dust disk
The leading hypothesis suggests the passage of a family of exo-comet or planetesimal fragments, all associated with a single previous break-up event. It is named after named after Tabetha Boyajian, the researcher at Louisiana State University (USA). Please download the dataset to your course folder and load it to R for further analysis.
%>% mutate(
lc_df datetime = as_datetime((time + 14245.5) * 86400, tz = "UTC")
%>%
) ggplot(aes(x = datetime, y = flux)) +
geom_line(alpha = 0.5) +
geom_point(size = 0.1) +
labs(
title = "Tabby's Star (KIC 8462852) Flux",
x = "Date",
y = "Flux"
+
) theme_minimal()
<- lc_df %>% filter(!is.na(flux)) %>%
lc_df mutate(
median_flux = median(flux),
mad_flux = median(abs(flux - median_flux))
%>%
) filter(abs(flux - median_flux) <= 5 * mad_flux) %>%
mutate(norm_flux = flux / mean(flux)) %>%
::select(-median_flux, -mad_flux)
dplyr
<- lc_df %>%
lc_ts mutate(
datetime = as_datetime((time + 14245.5) * 86400, tz = "UTC")
%>%
) select(datetime, norm_flux) %>%
as_tsibble(index = datetime, regular = FALSE) %>%
fill_gaps(norm_flux = NA) %>%
fill(norm_flux, .direction = "down")
%>%
lc_ts ggplot(aes(x = datetime, y = norm_flux)) +
geom_line(alpha = 0.5) +
geom_point(size = 0.1) +
labs(
title = "Tabby's Star (KIC 8462852) Normalized Flux",
x = "Date",
y = "Normalized Flux"
+
) theme_minimal()
Aggregation to Regular Grids
Hourly
Kepler’s “long cadence” is 29.4 min per integration. We round timestamps to the nearest hour and average:
<- lc_ts %>% as_tibble() %>%
lc_1hr mutate(
one_hour = floor_date(datetime, "1 hour")
%>%
) group_by(one_hour) %>%
summarise(flux_1hr = mean(norm_flux, na.rm = TRUE)) %>%
as_tsibble(index = one_hour, regular = TRUE) %>%
fill_gaps(flux_1hr = NA) %>%
fill(flux_1hr, .direction = "down")
# Zoom in on a major dipping event (around day 1500)
<- as_datetime("2013-01-8")
major_dip_start <- as_datetime("2013-05-10")
major_dip_end
%>%
lc_1hr filter(one_hour >= major_dip_start, one_hour <= major_dip_end) %>%
ggplot(aes(x = one_hour, y = flux_1hr)) +
geom_line() +
labs(
title = "Tabby's Star: Major Dipping Event (2013)",
x = "Date",
y = "Normalized Flux"
+
) theme_minimal()
Time Series Decomposition and Analysis
Let’s examine the time series components and patterns in the hourly data.
# Time series decomposition
%>%
lc_1hr model(STL(flux_1hr)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition of Hourly Flux Data")
# ACF and PACF analysis
%>%
lc_1hr gg_tsdisplay(flux_1hr, plot_type = "partial") +
labs(title = "ACF and PACF of Hourly Flux Data")
The ACF (Autocorrelation Function) and PACF (Partial Autocorrelation Function) plots help us identify potential ARIMA model parameters. The ACF shows the correlation between the time series and its lags, while the PACF shows the correlation between the time series and its lags that is not explained by previous lags.
Modeling Hourly Data
Now we’ll fit ARIMA models to the hourly data.
# Fit ARIMA model to hourly data
<- lc_1hr %>%
arima_1hr model(ARIMA = ARIMA(flux_1hr))
# Model summary
report(arima_1hr)
Series: flux_1hr
Model: ARIMA(2,1,3)(1,0,0)[24]
Coefficients:
ar1 ar2 ma1 ma2 ma3 sar1
1.8482 -0.9245 -2.0932 1.3223 -0.2123 -0.0167
s.e. 0.0030 0.0030 0.0069 0.0132 0.0065 0.0056
sigma^2 estimated as 6.947e-09: log likelihood=279666.7
AIC=-559319.3 AICc=-559319.3 BIC=-559260.1
# Residual diagnostics for ARIMA model
%>%
arima_1hr gg_tsresiduals() +
labs(title = "Residual Diagnostics for ARIMA Model (Hourly)")
# Ljung-Box test for residual autocorrelation
augment(arima_1hr) %>%
features(.innov, ljung_box, lag = 24, dof = 3)
# A tibble: 1 × 3
.model lb_stat lb_pvalue
<chr> <dbl> <dbl>
1 ARIMA 1374. 0
The residual diagnostics help us assess whether our models have captured all the patterns in the data. Ideally, the residuals should resemble white noise with no significant autocorrelation. The Ljung-Box test formally tests for the presence of autocorrelation in the residuals.
Forecasting with Hourly Models
Let’s generate forecasts using our models and visualize them.
# Generate forecasts (next 1000 periods)
<- arima_1hr %>%
fc_1hr forecast(h = 1000)
# Visualize forecasts
%>%
fc_1hr autoplot(lc_1hr) +
labs(
title = "ARIMA Forecast for Hourly Flux Data",
x = "Date",
y = "Normalized Flux"
)
Activities: Daily Data Analysis
Extend your analysis to daily data aggregation and answer the following questions.
# Create daily time series
<- lc_ts %>% as_tibble() %>%
lc_daily mutate(date = as_date(datetime)) %>%
group_by(date) %>%
summarise(flux_d = ____________) %>%
as_tsibble(index = ____________) %>%
fill_gaps(flux_d = NA) %>%
fill(flux_d, .direction = "down")
# Visualize the daily data
%>%
lc_daily ggplot(aes(x = ____________, y = ____________)) +
geom_line() +
labs(
title = "Tabby's Star: Daily Flux Data",
x = "Date",
y = "Normalized Flux"
+
) theme_minimal()
# Fit ARIMA model to daily data
<- lc_daily %>%
arima_daily model(ARIMA = ARIMA(____________))
# Model summary
report(arima_daily)
# Residual diagnostics
%>%
arima_daily gg_tsresiduals() +
labs(title = "Residual Diagnostics for ARIMA Model (Daily)")
# Generate forecasts
<- arima_daily %>%
fc_daily forecast(h = 14)
# Visualize forecasts
%>%
fc_daily autoplot(____________)
Model Comparison
Compare the performance of your models across different time scales.
# Prepare model comparison table
<- bind_rows(
model_comparison %>% glance() %>% mutate(Aggregation = "Hourly", Model = "ARIMA"),
____________ # Add your daily ARIMA model
%>% glance() %>% mutate(Aggregation = "Daily", Model = "ARIMA")
____________ %>%
) select(Aggregation, Model, AIC, AICc, BIC)
# Display comparison table
::kable(model_comparison, caption = "Model Comparison Across Time Scales") knitr