Source code for postprocessor.core.processes.autoreg

from collections import namedtuple

import numpy as np
import pandas as pd
import scipy.linalg as linalg
from agora.abc import ParametersABC

from postprocessor.core.abc import PostProcessABC

# TODO: Provide the option of whether to optimise AR order -- see issue #1
# TODO: Provide the functionality of 'smoothing' a time series with AR -- see
# issue #1


[docs]class autoregParameters(ParametersABC): """Parameters for the 'autoreg' process Parameters for the 'autoreg' process. Attributes ---------- sampling_period : float Sampling period of measurement values, in unit time. freq_npoints : int Number of points for the frequency axis of the closed-form solution of the estimated periodogram. Defines the resolution for use in, for example, plots. # Sampling period should be listed in the HDF5 metadata (i.e. the # 'time_settings/timeinterval' attribute, unit seconds). When using # this processor in a typical routine, use the sampling period from # there rather than relying on this default value of 5 minutes. """ _defaults = { "sampling_period": 5, "freq_npoints": 100, }
[docs]class autoreg(PostProcessABC): """ Process to estimate power spectral density (autoregressive model). Methods ------- fit_autoreg(timeseries: array_like, ar_order: int) Computes linear algebra parameters used to fit an autoregressive model to a time series optimise_ar_order(timeseries: array_like, ar_order_upper_limit: int) Optimise autoregressive model order to fit a time series autoreg_periodogram(timeseries: array_like, sampling_period: float, freq_npoints: int, ar_order: int) Estimates the closed-form solution of the sample power spectrum of a timeseries, based on fitting an autoregressive model. run(signal: pd.DataFrame) Estimates the power spectral densities of a dataframe of time series, using autoregressive model """
[docs] def __init__( self, parameters: autoregParameters, ): super().__init__(parameters)
# Should this be a static method?
[docs] def fit_autoreg(self, timeseries, ar_order) -> dict: """Computes linear algebra parameters to fit an AR model to a timeseries Computes linear algebra parameters used to fit an autoregressive model of a specified order to a time series. Ultimately computes the Akaike information criterion of the model with the specified order. This model-fitting implementation is based on the supplementary information of: * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] Parameters ---------- timeseries : array_like Time series of measurement values. ar_order : int Order of the autoregressive model. The order defines the complexity of the model; if the order is P, then each time point is modelled as a linear combination of P time points preceding it. Returns ------- This function returns a dictionary, whose values are defined by these keys: sample_acfs : 1d array of floats Sample autocorrelation function; element indices go from 0 to ar_order. ar_coeffs : 1d array of floats Coefficients for the autoregressive model; defines the linear combination that defines the model. noise_param : float Noise parameter. aic : float Akaike information criterion of the autoregressive model. """ # Estimates sample autocorrelation function (R). # sample_acfs: 1D array of R values sample_acfs = np.zeros(ar_order + 1) for ii in range(ar_order + 1): sample_acfs[ii] = (1 / len(timeseries)) * np.sum( [ (timeseries[k] - np.mean(timeseries)) * (timeseries[k + ii] - np.mean(timeseries)) for k in range(len(timeseries) - ii) ] ) # Estimates AR coefficients (phi) by solving Yule-Walker equation. # ar_coeffs: 1D array of coefficients (i.e. phi values) sample_acfs_toeplitz = linalg.toeplitz(sample_acfs[0:ar_order]) # phi vector goes from 1 to P in Jia & Grima (2020) ar_coeffs = linalg.inv(sample_acfs_toeplitz).dot( sample_acfs[1 : ar_order + 1] ) # defines a dummy phi_0 as 1. This is so that the indices I use in # get_noise_param are consistent with Jia & Grima (2020) ar_coeffs = np.insert(ar_coeffs, 0, 1.0, axis=0) # Estimates noise parameter (noise_param) noise_param = sample_acfs[0] - np.sum( [ar_coeffs[k] * sample_acfs[k] for k in range(1, ar_order + 1)] ) # Calculates AIC (aic) aic = np.log(noise_param) + (ar_order) / len(timeseries) return { "sample_acfs": sample_acfs, "ar_coeffs": ar_coeffs, "noise_param": noise_param, "aic": aic, }
# Should this be a static method?
[docs] def optimise_ar_order( self, timeseries, ar_order_upper_limit, ) -> int: """Optimise autoregressive model order to fit a time series Optimise the autoregressive model order to fit a time series. Model selection relies on minimising the Akaike information criterion, sweeping over a range of possible orders. This implementation is based on the supplementary information of: * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] Parameters ---------- timeseries : array_like Time series of measurement values. ar_order_upper_limit : int Upper bound for autoregressive model order; recommended to be the square root of the length of the time series. Returns ------- int Optimal order for an autoregressive model that fits the time series. """ # Bug: artificial dip at order 1 if time series is a smooth sinusoid. # Will probably need to fix it so that it checks if the minimum also # corresponds to a zero derivative. ar_orders = np.arange(1, ar_order_upper_limit) aics = np.zeros(len(ar_orders)) for ii, ar_order in enumerate(ar_orders): model = self.fit_autoreg(timeseries, ar_order) aics[ii] = model["aic"] return ar_orders[np.argmin(aics)]
[docs] def autoreg_periodogram( self, timeseries, sampling_period, freq_npoints, ar_order, ): """Estimates the power spectrum of a timeseries, based on an AR model Estimates the closed-form solution of the sample power spectrum of a time series. This estimation is based on fitting an autoregressive model of an optimised order to the time series, and using computed linear algebra parameters to compute the closed-form solution. This implementation is based on the supplementary information of: * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] Parameters --------- timeseries : array_like Time series of measurement values. sampling_period : float Sampling period of measurement values, in unit time. freq_npoints : int Number of points for the frequency axis of the closed-form solution of the estimated periodogram. Defines the resolution for use in, for example, plots. ar_order : int Order of the autoregressive model. The order defines the complexity of the model; if the order is P, then each time point is modelled as a linear combination of P time points preceding it. Returns ------- freqs: ndarray Array of sample frequencies, unit time-1. power: ndarray Power spectral density or power spectrum of the time series, arbitrary units. Examples -------- FIXME: Add docs. """ ar_model = self.fit_autoreg(timeseries, ar_order) freqs = np.linspace(0, 1 / (2 * sampling_period), freq_npoints) power = np.zeros(len(freqs)) # Sweeps the frequencies; corresponds to the 'xi' variable in # Jia & Grima (2020) for ii, freq in enumerate(freqs): # multiplied 2pi into the exponential to get the frequency rather # than angular frequency summation = [ ar_model["ar_coeffs"][k] * np.exp(-1j * k * (2 * np.pi) * freq) for k in range(ar_order + 1) ] summation[0] = 1 # minus sign error??? divisor = np.sum(summation) power[ii] = (ar_model["noise_param"] / (2 * np.pi)) / np.power( np.abs(divisor), 2 ) # Normalise by first element of power axis. This is consistent with # the MATLAB code from Ramon Grima. power = power / power[0] return freqs, power
[docs] def run(self, signal: pd.DataFrame): # TODO: Return an additional DataFrame that contains the orders, # optimised or not """Estimates the power spectra of a dataframe of time series, using AR Estimates the power spectral densities of a dataframe of time series. This estimation is based on fitting an autoregressive model of an optimised order to the time series, and using computed linear algebra parameters to compute the closed-form solution. This implementation is based on the supplementary information of: * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] Parameters ---------- signal: pandas.DataFrame Time series, with rows indicating individiual time series (e.g. from each cell), and columns indicating time points. Returns ------- freqs_df: pandas.DataFrame Sample frequencies from each time series, with labels preserved from 'signal'. power_df: pandas.DataFrame Power spectrum from each time series, with labels preserved from 'signal'. order_df: pandas.DataFrame Optimised order of the autoregressive model fitted to each time series, with labels preserved from 'signal'. Examples -------- FIXME: Add docs. """ AutoregAxes = namedtuple("AutoregAxes", ["freqs", "power", "order"]) # Each element in this list is a named tuple: (freqs, power, order) axes = [] # Use enumerate instead? for row_index in range(len(signal)): timeseries = signal.iloc[row_index, :].dropna().values optimal_ar_order = self.optimise_ar_order( timeseries, int(3 * np.sqrt(len(timeseries))) ) axes.insert( row_index, AutoregAxes( *self.autoreg_periodogram( timeseries=timeseries, sampling_period=self.sampling_period, freq_npoints=self.freq_npoints, ar_order=optimal_ar_order, ), optimal_ar_order, ), ) freqs_df = pd.DataFrame( [element.freqs for element in axes], index=signal.index ) power_df = pd.DataFrame( [element.power for element in axes], index=signal.index ) order_df = pd.DataFrame( [element.order for element in axes], index=signal.index ) return freqs_df, power_df, order_df