Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Matrix Profile IX: Admissible Time Series Motif Discovery with Missing Data - R #83

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions R/CalcLB_dist.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#' This is the code for the paper "Admissible Motif Discovery with Missing
#' Data" by Yan Zhu, Abdullah Mueen and Eamonn Keogh.
#'
#' Translated from Matlab to R by Christoper English
#' All mistakes are mine as their code worked and results seem reliable
#' as executed in Octave
#' this function in intended to be called from within StompSelfJoinNA

#' @param i 'int' iterator of for loop
#' @param num 'int' length of time_series
#' @param m 'int' window_size
#' @param Vmax 'vector' at 'i' max of 'm' forward values - calculated once in StompSelfJoinNA
#' @param Vmin 'vector' at 'i' min of 'm' forward values - calculated once in StompSelfJoinNA
#' @param QZ 'vector' last_product as calc'd by tsmp::dist_profile
#' @param QB 'vector' last_product as calc'd by tsmp::dist_profile
#' @param BZ 'vector' last_product as calc'd by tsmp::dist_profile
#' @param ZB 'vector' last_product as calc'd by tsmp::dist_profile
#' @param BX 'vector' last_product as calc'd by tsmp::dist_profile
#' @param XB 'vector' last_product as calc'd by tsmp::dist_profile
#' @param meanz 'vector' as calc'd by mass_pre_NA for Z - See StompSelfJoinNA
#' @param sigmaz 'vector' as calc'd by mass_pre_NA for Z - See StompSelfJoinNA
#' @param meanb 'vector' as calc'd by mass_pre_NA for B - See StompSelfJoinNA
#' @param sigmab 'vector' as calc'd by mass_pre_NA for B - See StompSelfJoinNA
#' @param meanx 'vector' as calc'd by mass_pre_NA for X - See StompSelfJoinNA
#' @param sigmax 'vector' as calc'd by mass_pre_NA for X - See StompSelfJoinNA
#' @param sigmai_or2
#' @param sigmaj_or2
#' @param q
#' @param max_sigma2
#' @param D
#' @return Returns 'D' the distanceProfile
CalcLB_dist <- function(i,num,m,Vmax,Vmin,QZ,QB,BZ,ZB,BX,XB,meanz,sigmaz,meanb,sigmab,meanx,sigmax,sigmai_or2,sigmaj_or2,q,max_sigma2, D) {

for (j in 1:(num-m+1)){

if (identical(QB[j], m)) { # to identical() from == and back to == and back to identical

D[j] = 2*m*(1-q[j]) #QB[j] = m
#print('case_1')
} else { # QB[j] != m
if (meanb[i] > meanb[j]) {

sigma_or2 = sigmaj_or2[j]
sigma2 = sigmaz[i]^2
l_r = meanb[i]*m
mean_r = meanz[i]/meanb[i]
sigma_r2 = (sigmaz[i]^2 + meanz[i]^2)/meanb[i] - mean_r^2
} else { #meanb[i] !> meanb[j]
sigma_or2 = sigmaj_or2[j]
sigma2 = sigmaz[j]^2
l_r = meanb[j]*m
mean_r = meanz[j]/meanb[j]
sigma_r2 = (sigmaz[j]^2 + meanz[j]^2)/meanb[j] - mean_r^2
}
if (max(meanb[i], meanb[j]) == 1) { #to ==

if (meanb[i] > meanb[j]) { #Ti, m doesn't have missing value at i
sigma_or2 = sigmai_or2[j]
sigma2 = sigmaz[j]^2
}

if (q[j] <= 0) { # to identical and back to <=

D[j] = QB[j] * sigma_or2/sigma2
#print('case_2')
} else {
D[j] = QB[j] * sigma_or2/sigma2*(1-q[j]^2)
#print('case_3')
}
} else {

meanr_i = meanz[i]/meanb[i]
meanr_j = meanz[j]/meanb[j]
maxsigma2_i=meanb[i]*((sigmaz[i]^2+meanz[i]^2)/meanb[i]-meanr_i^2+Vmax[i]*Vmin[i]+meanr_i*(meanr_i-Vmax[i]-Vmin[i]))+(Vmax[i]-Vmin[i])^2/4
maxsigma2_j=meanb[j]*((sigmaz[j]^2+meanz[j]^2)/meanb[j]-meanr_j^2+Vmax[j]*Vmin[j]+meanr_j*(meanr_j-Vmax[j]-Vmin[j]))+(Vmax[j]-Vmin[j])^2/4
v=max(sigmai_or2[j]/maxsigma2_i,sigmaj_or2[j]/maxsigma2_j)
if (q[j] <= 0) {
D[j] = QB[j]*v
#print('case_4')
} else {
D[j] = QB[j]*v*(1-q[j]^2)
#print('case_5')
}
#if (i %% 599 == 0 & j %% 599 == 0) browser()
}
}
}
return(D)
}

177 changes: 177 additions & 0 deletions R/StompSelfJoinNA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
StompSelfJoinNA <- function(time_series, window_size, exclusion_zone) {
#' This is the code for the paper "Admissible Motif Discovery with Missing
#' Data" by Yan Zhu, Abdullah Mueen and Eamonn Keogh.
#'
#' Input:
#' time_series: Time Series with missing values
#' window_size: Subsequence length, window size for motif discovery
#' Output:
#' Matrix Profile: The lowerbound Matrix Profile
#' MPindex: The lowerbound Matrix Profile Index
#'
#' Translated from Matlab to R by Christoper English
#' All mistakes are mine as their code worked and results seemed reliable
#' as executed in Octave
#' @param time_series 'vector' or 'matrix' time series with missing values
#' @param window_size 'int' sliding window size
#' @param exclusion_zone 'numeric' to prevent trivial matches tsmp default 1/2 Seismology example uses 1/4 (0.25)
#' @return Returns 'MatrixProfile', 'MPindex', 'Z' (time_series with NA replaced by 0)
# mode:1-FFT 2-no FFT only mode 1 implemented
mode=1;
time_series = time_series # for debugging
window_size = window_size # for debugging
# set trivial match exclusion zone
exclusion_zone = exclusion_zone
exclusion_zone = round(window_size * exclusion_zone)

# check input
if (window_size > length(time_series)/2) {
error('Error: Time series is too short relative to desired window size')
}
if (window_size < 4) {
error('Error: window_size must be at least 4L')
}
num = length(time_series)
MatrixProfileLen = num - window_size +1
Vmax = rep.int(0, times = MatrixProfileLen)
Vmin = rep.int(0, times = MatrixProfileLen)

#' create Z, B, and X from time_series
#' switching out NA for 0 insulates for use with tsmp
#' might need to DO THE SAME for is.infinite for purposes of generality
Z = time_series
Z[is.na(Z)] = 0
for (s in 1:MatrixProfileLen) {
Vmax[s] = max(Z[s]:Z[s+window_size -1], na.rm = FALSE)
Vmin[s] = min(Z[s]:Z[s+window_size -1], na.rm = FALSE)
}
B <- rep.int(1, times = length(time_series))
B[is.na(time_series)] = 0
X = Z^2
#
MatrixProfile = matrix(0, MatrixProfileLen,1) #from rep.int vec to matrix
MPindex = rep.int(0, times = MatrixProfileLen) # why 0 here and 1 at line 137
# Attempt to include progress bar
pb <- progress::progress_bar$new(
format = 'StompNA [:bar] :percent at :tick_rate it/s, elapsed: :elapsed, eta: :eta', clear = FALSE, total=MatrixProfileLen, width = 80)

Z_mass_pre = mass_pre_NA(Z, window_size=window_size)
B_mass_pre = mass_pre_NA(B, window_size=window_size)
X_mass_pre = mass_pre_NA(X, window_size=window_size)

tictac <- Sys.time()
# maximum variance value of all subsequences

max_sigma2 = max(Z_mass_pre$sigmax2[B_mass_pre$data_mean == 1])

distanceProfile = matrix(0, MatrixProfileLen, 1)
updatePos = matrix(FALSE, MatrixProfileLen, 1) # from rep_len vec to matr

#preallocate
QZ = rep.int(0, times = MatrixProfileLen)
QB = rep.int(0, times = MatrixProfileLen)
BZ = rep.int(0, times = MatrixProfileLen)
ZB = rep.int(0, times = MatrixProfileLen)
BX = rep.int(0, times = MatrixProfileLen)
XB = rep.int(0, times = MatrixProfileLen)

# what follows here switched to tsmp::dist_profile
# and still need to calc all stats as is done in fastfindNNPre and fastfindNN
QZ_tsmp_dp <- tsmp::dist_profile(data=Z, query=Z, window_size=window_size, method='v2')
QB_tsmp_dp <- tsmp::dist_profile(data=B, query=B, window_size=window_size, method='v2')
BZ_tsmp_dp <- tsmp::dist_profile(data=Z, query=B, window_size=window_size, method='v2')
ZB_tsmp_dp <- tsmp::dist_profile(data=B, query=Z, window_size=window_size, method='v2')
BX_tsmp_dp <- tsmp::dist_profile(data=X, query=B, window_size=window_size, method='v2')
XB_tsmp_dp <- tsmp::dist_profile(data=B, query=X, window_size=window_size, method='v2')

QZ = QZ_tsmp_dp$last_product
QB = QB_tsmp_dp$last_product
BZ = BZ_tsmp_dp$last_product
ZB = ZB_tsmp_dp$last_product
BX = BX_tsmp_dp$last_product
XB = XB_tsmp_dp$last_product

#first
firstQZ = QZ_tsmp_dp$last_product
firstQB = QB_tsmp_dp$last_product
firstBZ = BZ_tsmp_dp$last_product
firstZB = ZB_tsmp_dp$last_product
firstBX = BX_tsmp_dp$last_product
firstXB = XB_tsmp_dp$last_product

# generate inputs in the (function) environment for use by CalcLBDist

meani_or = ZB/QB
meanj_or = BZ/QB
sigmai_or2 = XB/QB-meani_or^2
sigmaj_or2 = BX/QB-meanj_or^2
q= (QZ/QB - (meani_or * meanj_or)) /sqrt(abs(sigmai_or2 * sigmaj_or2))

# account for m in CalcLB_dist
m = window_size

distanceProfile = CalcLB_dist(1, num, m, Vmax, Vmin, QZ, QB, BZ, ZB, BX, XB, meanz = Z_mass_pre$data_mean, sigmaz = Z_mass_pre$data_sd, meanb = B_mass_pre$data_mean, sigmab = B_mass_pre$data_sd,meanx = X_mass_pre$data_mean,sigmax = X_mass_pre$data_sd, sigmai_or2, sigmaj_or2, q, max_sigma2, D = distanceProfile)

#apply exlcusion zone
if (exclusion_zone > 0) {
exc_st = max(1, 1-exclusion_zone)
exc_ed = min(MatrixProfileLen, 1+exclusion_zone)
distanceProfile[exc_st:exc_ed,1] <- Inf
}
# evaluate initial matrix profile
MatrixProfile[1:MatrixProfileLen,1] = distanceProfile[1:MatrixProfileLen,1]
MPindex[1:length(MatrixProfileLen)] = 1
updatePos = (distanceProfile < MatrixProfile) # updatePos should all be FALSE now
from = 2

# and now, the long weird dance
for (i in from:MatrixProfileLen) {

# update dot products
QZ[2:(num-m +1)] = QZ[1:(num -m)] -Z[(i-1)] * Z[1:(num -m)] + Z[(i +m -1)] * Z[(m +1) : num]
QB[2:(num-m +1)] = QB[1:(num -m)] -B[(i-1)] * B[1:(num -m)] + B[(i +m -1)] * B[(m +1) : num]
BZ[2:(num-m +1)] = BZ[1:(num -m)] -B[(i-1)] * Z[1:(num -m)] + B[(i +m -1)] * Z[(m +1) : num]
ZB[2:(num-m +1)] = ZB[1:(num -m)] -Z[(i-1)] * B[1:(num -m)] + Z[(i +m -1)] * B[(m +1) : num]
BX[2:(num-m +1)] = BX[1:(num -m)] -B[(i-1)] * X[1:(num -m)] + B[(i +m -1)] * X[(m +1) : num]
XB[2:(num-m +1)] = XB[1:(num -m)] -X[(i-1)] * B[1:(num -m)] + X[(i +m -1)] * B[(m +1) : num]

QZ[1] = firstQZ[i]
QB[1] = firstQB[i]
BZ[1] = firstBZ[i]
ZB[1] = firstZB[i]
BX[1] = firstBX[i]
XB[1] = firstXB[i]
# update stats for CalcLB_dist
meani_or = ZB/QB
meanj_or = BZ/QB
sigmai_or2 = XB/QB-meani_or^2
sigmaj_or2 = BX/QB-meanj_or^2
q= (QZ/QB - (meani_or * meanj_or)) /sqrt(abs(sigmai_or2 * sigmaj_or2))


distanceProfile = try(CalcLB_dist(i, num, m, Vmax, Vmin, QZ, QB, BZ, ZB, BX, XB, meanz = Z_mass_pre$data_mean, sigmaz = Z_mass_pre$data_sd, meanb = B_mass_pre$data_mean, sigmab = B_mass_pre$data_sd, meanx = X_mass_pre$data_mean, sigmax = X_mass_pre$data_sd, sigmai_or2, sigmaj_or2, q, max_sigma2, D = distanceProfile))
# update exclusion_zone
exc_st[i] = max(i, i - exclusion_zone)
exc_ed[i] = min(MatrixProfileLen, i + exclusion_zone)
distanceProfile[exc_st[i]:exc_ed[i], 1] = Inf

# figure out and store nearest neighbor
updatePos = (distanceProfile < MatrixProfile)

MatrixProfile[updatePos] = distanceProfile[updatePos]
#if (any(is.na(MatrixProfile) == TRUE)) browser()
MPindex[which(updatePos)] = i # index, as it says, but not intuitive
# for some debugging, prior to progress::
#if (window_size == 160 & i %% 100 == 0) print(i)
#if (window_size == 2000 & i %% 1000 == 0) print(i)
#if (i %% 300 == 0) browser()
pb$tick()
}
tictac <- Sys.time() - tictac
message(sprintf('Finished in %.2f %s', tictac, units(tictac)))
MatrixProfile = sqrt(abs(MatrixProfile))

return(list(MatrixProfile = MatrixProfile, MPindex = MPindex, data = Z))

}

76 changes: 76 additions & 0 deletions R/mass_pre_NA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' Precomputes several values used on MASS extended for missing values NA
#' computes statistics as further required for Matrix Profile IX
#'
#' @param data a `vector` or a `matrix` of `numeric`. Reference Time Series.
#' @param query a `vector` or a `matrix` of `numeric`. Query Time Series (default is `NULL`).
#' @param window_size an `int`. Sliding window size.
#'
#' @return Returns `window_size`, `data_fft`, `data_size`, `data_mean`, `data_sd`, `query_mean` and `query_sd`.
#' @export
#' @keywords internal
#'
#' @seealso [mass_v2()], [mass_v3()] for using precomputed values.
#'
#' @references * Abdullah Mueen, Yan Zhu, Michael Yeh, Kaveh Kamgar, Krishnamurthy Viswanathan,
#' Chetan Kumar Gupta and Eamonn Keogh (2015), The Fastest Similarity Search Algorithm for Time
#' Series Subsequences under Euclidean Distance.
#' @references Website: <https://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html>
#'
#' @examples
#' w <- mp_toy_data$sub_len
#' ref_data <- mp_toy_data$data[, 1]
#' query_data <- mp_toy_data$data[, 1]
#'
#' pre <- tsmp:::mass_pre(ref_data, query_data, w)
#'
#' dp <- list()
#' for (i in 1:(pre$data_size - w + 1)) {
#' dp[[i]] <- tsmp:::mass_v2(
#' query_data[i:(i - 1 + w)], pre$window_size, pre$data_fft, pre$data_size,
#' pre$data_mean, pre$data_sd, pre$query_mean[i], pre$query_sd[i]
#' )
#' }
mass_pre_NA <- function(data, query = NULL, window_size) {
if (is.matrix(data)) {
data <- as.vector(data)
}

data_size <- length(data)

if (window_size > data_size) {
stop("'window_size' must be smaller or equal to 'data' length.")
}

data_avgsd <- old_fast_avg_sd(data, window_size) # precompute moving average and SD
data_mean <- data_avgsd$avg
data_sd <- data_avgsd$sd
pad_size <- 2^ceiling(log2(data_size))
data[(data_size + 1):pad_size] <- 0
data_fft <- stats::fft(data) # precompute fft of data
sigmax2 <- data_avgsd$sigmax2 # lower bound requires max variance

if (!is.null(query)) {
if (is.matrix(query)) {
query <- as.vector(query)
}

query_size <- length(query)

if (window_size > query_size) {
stop("'window_size' must be smaller or equal to 'query' length.")
}

query_avgsd <- old_fast_avg_sd(query, window_size) # precompute moving average and SD
query_mean <- query_avgsd$avg
query_sd <- query_avgsd$sd
} else {
query_mean <- data_mean
query_sd <- data_sd
}

return(list(
window_size = window_size, data_fft = data_fft, data_size = data_size,
data_mean = data_mean, data_sd = data_sd, sigmax2 = sigmax2, query_mean = query_mean,
query_sd = query_sd
))
}
Loading