feat: Update initials_adjpin() algorithm to align with Ersan and Ghac…

…hem (2024) Modified the implementation of the algorithm for generating the initial parameter sets in the initials_adjpin() function to align with the new algorithm described in Ersan and Ghachem (2024). Refer to the paper for more details. Reference: Ersan, Oguz and Ghachem, Montasser, A Methodological Approach to the Computational Problems in the Estimation of Adjusted Pin Model. Available at SSRN: https://ssrn.com/abstract=4359991 or http://dx.doi.org/10.2139/ssrn.4359991
monty-se · Jul 28, 2024 · 632361a · 632361a
1 parent 854747e
commit 632361a
Showing 1 changed file with 101 additions and 210 deletions.
diff --git a/R/model_adjpin.R b/R/model_adjpin.R
@@ -495,7 +495,7 @@ adjpin <- function(data, method = "ECM", initialsets = "GE", num_init = 20,
 #'
 #' @export
 initials_adjpin <- function(data, xtraclusters = 4, restricted = list(),
-                            verbose = TRUE) {
+                                 verbose = TRUE) {
 
   # Check that all variables exist and do not refer to non-existent variables
   # --------------------------------------------------------------------------
@@ -582,33 +582,6 @@ initials_adjpin <- function(data, xtraclusters = 4, restricted = list(),
   }
 
 
-  # A function divides a cluster into sub-clusters - based on order imbalance
-  # --------------------------------------------------------------------------
-  into2clusters <- function(thiscluster) {
-
-    # Initialize the return value 'xoverview' to the cluster to split.
-    # The value 'xoverview' if the number of days in the cluster is larger
-    # than 1, and therefore can be split into 2 clusters.
-    xoverview <- thiscluster
-
-    if (thiscluster$days > 1) {
-
-      medlayers <- thiscluster[1, ]$layer
-
-      if (is.list(thiscluster[1, ]$layer)) medlayers <- unlist(medlayers)
-      xdata <- data[data$layer %in% medlayers, ]
-      xdata$oi <- xdata$b - xdata$s
-
-      clusters <- hclust(dist(xdata$oi), method = "complete")
-      xdata$cluster <- cutree(clusters, 2)
-      xdata$oi <- NULL
-      xoverview <- create_datasummary(xdata, bylayer = FALSE)
-
-    }
-
-    return(xoverview)
-  }
-
   # ----------------------------------------------------------------------------
   # Run the process of producing initial sets for all configurations
   # ----------------------------------------------------------------------------
@@ -675,225 +648,143 @@ initials_adjpin <- function(data, xtraclusters = 4, restricted = list(),
     # c6    |[eb+db]        |[es+mus+ds]    |
     #----------------------------------------
 
+    # Initialize parambox at its theoretical values
+    # ------------------------------------------------------------------------
+    parambox[, 1] <- c(buyrates, buyrates[1:2])
+    parambox[, 2] <- c(sellrates[1:2], sellrates)
+    parambox[, 3] <- rep(0, 6)
 
-    # Gather all elements relative to buyfirst = T (= F) in a list bflist
-    # (sflist). It will be easier to call all these elements, once the value
-    # of buyfirst is determined
-    bflist <- list(xmeans = bbxmeans, data = bxdata,
-               indxmax = which.max(bbxmeans$b),
-               indxliq = 1 + which.min(bbxmeans[2:3, ]$s))
-
-    sflist <- list(xmeans = ssxmeans, data = sxdata,
-                   indxmax = which.max(ssxmeans$s),
-                   indxliq = 1 + which.min(ssxmeans[2:3, ]$b))
-
-    xlists <- list(bflist, sflist)
-
-
-    for (buyfirst in c(TRUE, FALSE)) {
-
-      # Pick the active list based on the value of buyfirst
-      xlist <- xlists[[2 - buyfirst]]
-
-      # xmeans = bbxmeans when buyfirst = T, otherwise xmeans = ssxmeans
-      # ------------------------------------------------------------------------
-      xmeans <- xlist$xmeans
-      data <- xlist$data
-
-      # Initialize parambox at its theoretical values
-      # ------------------------------------------------------------------------
-      parambox[, 1] <- c(buyrates, buyrates[1:2])
-      parambox[, 2] <- c(sellrates[1:2], sellrates)
-      parambox[, 3] <- rep(0, 6)
-
-
-      # +++                                                                +++ #
-      # ++++++                                                          ++++++ #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # II. PARTITION THE DATA IN SIX CLUSTERS IN A DARATFRAME 'SIXCLUSTERS' + #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # ++++++                                                          ++++++ #
-      # +++                                                                +++ #
-
-
-      # indxmax: index of the cluster of maximum trades (eb+mub+db, es+mus+ds)
-      # indxliq: index of the cluster with liquidity shocks (eb+db, es+ds)
-      # ------------------------------------------------------------------------
-      indxmax <- xlist$indxmax
-      indxliq <- xlist$indxliq
-
-      # Identify the two clusters to be clustered further, different from
-      # indxmax and indxliq, and gather them into a dataframe 'clusterfurther'
-      # Gather all clusters in one dataframe called 'sixclusters'
-      # ------------------------------------------------------------------------
-      sixclusters <- xmeans[c(indxmax, indxliq), ]
-      clusterfurther <- xmeans[-c(indxmax, indxliq), ]
-
-      if (nrow(clusterfurther) > 0) {
-        for (rw in seq_len(nrow(clusterfurther))) {
-          sixclusters <- rbind(sixclusters, into2clusters(clusterfurther[rw, ]))
-        }
-      }
 
-      sixclusters$layer <- sixclusters$cluster <- NULL
+    # +++                                                                +++ #
+    # ++++++                                                          ++++++ #
+    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
+    # II.ATTACH EACH OBSERVATION TO HYPOTHETICAL CLUSTERS IN 'PARAMBOX'    + #
+    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
+    # ++++++                                                          ++++++ #
+    # +++                                                                +++ #
 
 
-      # +++                                                                +++ #
-      # ++++++                                                          ++++++ #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # III.ATTACH EACH CLUSTER TO HYPOTHETICAL CLUSTERS IN 'PARAMBOX'       + #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # ++++++                                                          ++++++ #
-      # +++                                                                +++ #
+    # Connect clusters to hypothetical clusters, using the vector 'xpositions'
+    # where the nth entry contains the index of hypothetical cluster.
+    # Compute a similarity score 'similarity$score', and pick the hypothetical
+    # cluster as the cluster with the highest similarity score.
+    # ------------------------------------------------------------------------
+    xpositions <- NULL
 
-      mergexrows <- function(rows) {
+    for (row in seq_len(nrow(data))) {
 
-        newrow <- rows[1, ]
-        newrow$days <- sum(rows$days)
-        newrow$b <- sum(rows$days * rows$b) / newrow$days
-        newrow$s <- sum(rows$days * rows$s) / newrow$days
-        return(newrow)
+      brow <- data[row, ]$b
+      srow <- data[row, ]$s
 
-      }
+      # Calculate the vector of scores
+      score <- - sqrt((parambox$b - brow)^2 + (parambox$s - srow)^2)
 
-      # Connect clusters to hypothetical clusters, using the vector 'xpositions'
-      # where the nth entry contains the index of hypothetical cluster.
-      # Compute a similarity score 'similarity$score', and pick the hypothetical
-      # cluster as the cluster with the highest similarity score.
-      # ------------------------------------------------------------------------
-      xpositions <- NULL
+      # Pick the highest score
+      xposition <- head(order(score, decreasing = TRUE), 1)
 
-      for (row in seq_len(nrow(sixclusters))) {
+      # Collect the cluster identifier 'xposition' in the vector xpositions
+      xpositions <- c(xpositions, xposition)
 
-        brow <- sixclusters[row, ]$b
-        srow <- sixclusters[row, ]$s
+    }
 
-        similarity <- parambox[, c("b", "s")]
+    data$posx <- xpositions
 
-        similarity$dpb <- apply(parambox, 1, function(x)
-          abs(ppois(brow, x[1], log.p = TRUE, lower.tail = (x[1] < brow))))
+    # Aggregate the data by position and calculate the average buys and sells
+    overview <- aggregate(. ~ posx, data, mean, drop = FALSE)
+    overview$days <- aggregate(. ~ posx, data, FUN = length)[, c("b")]
+    overview <- overview[, c("posx", "b", "s", "days")]
 
-        similarity$dps <- apply(parambox, 1, function(x)
-          abs(ppois(srow, x[2], log.p = TRUE, lower.tail = (x[2] < srow))))
+    parambox <- data.frame(matrix(0, nrow = 6, ncol = 1))
+    colnames(parambox) <- c("posx")
+    parambox$posx <- 1:6
 
-        similarity$score <- similarity$dpb * similarity$dps
+    parambox <- merge(parambox, overview, by="posx", all.x = TRUE)
+    parambox[is.na(parambox)] <- 0
+    parambox <- parambox[, 2:4]
 
-        if (all(similarity$score == 0)) {
-          similarity$dpb <- (parambox$b - brow)^2
-          similarity$dps <- (parambox$s - srow)^2
-          similarity$score <- - sqrt(similarity$dpb + similarity$dps)
-        }
 
-        xposition <- tail(order(similarity$score), 1)
-        xposition <- head(order(similarity$score, decreasing = TRUE), 1)
-        xpositions <- c(xpositions, xposition)
+    # +++                                                                +++ #
+    # ++++++                                                          ++++++ #
+    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
+    # III. COMPUTE INITIAL PARAMETER SETS FOLLOWING ERSAN & GHACHEM (2024) + #
+    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
+    # ++++++                                                          ++++++ #
+    # +++                                                                +++ #
 
-      }
 
+    # Distribute the parambox into three variables avb, avs and days.
+    # avb: average buys, avs: average sells, and days: number of days.
+    # -----------------------------------------------------------------------
+    avb <- unlist(parambox[, 1])
+    avs <- unlist(parambox[, 2])
+    days <- unlist(parambox[, 3])
 
-      # Attach the current cluster 'xcluster' into the hypothetical cluster that
-      # has the index 'hypo' in the hypothetical distribution 'parambox'.
-      # If the cluster 'hypo' in 'parambox' already contain a cluster, merge
-      # both clusters, using the function 'mergexrows()'.
-      # ------------------------------------------------------------------------
-      for (i in seq_len(length(xpositions))) {
 
-        hypo <- xpositions[i]
-        xcluster <- sixclusters[i, c("b", "s", "days")]
+    # Compute 'empirical' values for alpha (a), delta (d) theta (t)
+    # and theta' (tp) and min_avb (min_avs) the minimum average buys (sells)
+    # ------------------------------------------------------------------------
+    a <- sum(days[3:6]) / sum(days)
+    d <- sum(days[c(5, 6)]) / sum(days[3:6])
+    t <- days[2] / sum(days[1:2])
+    tp <- sum(days[c(4, 6)]) / sum(days[3:6])
 
-        if (parambox[hypo, 3] == 0) {
+    params <- c(a, d, t, tp)
+    params[is.na(params)] <- 0
 
-          parambox[hypo, ] <- xcluster
+    # Generation of parameters - See Ersan and Ghachem (2024)
+    # ------------------------------------------------------------------------
 
-        } else {
-          xrows <- rbind(xcluster, parambox[hypo, ])
-          parambox[hypo, ] <- mergexrows(xrows)
-        }
-      }
-      parambox[parambox$days == 0, ] <- 0
+    # If eb or es is equal to zero, the estimates (eb)^e and (es)^e are used.
+    # They are stored in buyrates[1] and sellrates[1] respectively
+    # ----------------------------------------------------
 
-      # +++                                                                +++ #
-      # ++++++                                                          ++++++ #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # IV. COMPUTE INITIAL PARAMETER SETS FOLLOWING ERSAN & GHACHEM (2022)  + #
-      # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-      # ++++++                                                          ++++++ #
-      # +++                                                                +++ #
+    eb <- max(c(avb[1], avb[5], 0))
+    if (eb == 0) eb <- buyrates[1]
 
+    es <- max(c(avs[1], avs[3], 0))
+    if (es == 0) es <- sellrates[1]
 
-      # Distribute the parambox into three variables avb, avs and days.
-      # avb: average buys, avs: average sells, and days: number of days.
-      # -----------------------------------------------------------------------
-      avb <- unlist(parambox[, 1])
-      avs <- unlist(parambox[, 2])
-      days <- unlist(parambox[, 3])
+    # The remaining parameters can take the value zero
+    # ----------------------------------------------------
 
+    db <- max(c(avb[2] - eb, avb[6] - eb, 0))
+    ds <- max(c(avs[2] - es, avs[4] - es, 0))
 
-      # Compute 'empirical' values for alpha (a), delta (d) theta (t)
-      # and theta' (tp) and min_avb (min_avs) the minimum average buys (sells)
-      # ------------------------------------------------------------------------
-      a <- sum(days[3:6]) / sum(days)
-      d <- sum(days[c(5, 6)]) / sum(days[3:6])
-      t <- days[2] / sum(days[1:2])
-      tp <- sum(days[c(4, 6)]) / sum(days[3:6])
+    mub <- max(c(avb[3] - eb, avb[4] - eb - db, 0))
+    mus <- max(c(avs[5] - es, avs[6] - es - ds, 0))
 
-      params <- c(a, d, t, tp)
-      params[is.na(params)] <- 0
+    # Store the initial set in a vector
+    # ----------------------------------------------------
 
-      # Generation of parameters - See Ersan and Ghachem (2022)
-      # ------------------------------------------------------------------------
-      eb <- c(avb[1], avb[5])
-      eb <- max(eb[eb > 0], 0)
-      if (eb == 0) eb <- buyrates[1]
-
-      es <- c(avs[1], avs[3])
-      es <- max(es[es > 0], 0)
-      if (es == 0) es <- sellrates[1]
-
-      db <- c(avb[2] - eb, avb[6] - eb,
-              ifelse(avb[4] * avb[3] > 0, avb[4] - avb[3], 0))
-      db <- max(db[db > 0], 0)
-
-      ds <- c(avs[2] - es, avs[4] - es,
-              ifelse(avb[6] * avb[5] > 0, avb[6] - avb[5], 0))
-      ds <- max(ds[ds >= 0], 0)
-
-      mub <- c(avb[4] - eb - db, avb[3] - eb)
-      mub <- max(mub[mub > 0], 0)
-
-      mus <- c(avs[6] - es - ds, avs[5] - es)
-      mus <- max(mus[mus > 0], 0)
-
-      xparams <- c(params, eb, es, mub, mus, db, ds)
-      xparams[is.nan(xparams)] <- 0
-
-      # Exclude initial parameter sets where:
-      # [1] one or more probability parameters are negative
-      # [2] one or more rate parameters are non-positive
-      # [3] mub = 0, and delta != 1. If delta != 1, then there are
-      # positive information days, so we can estimate a positive mub.
-      # [4] mus = 0, and delta != 0. If delta != 0, then there are
-      # negative information days, so we can estimate a positive mus.
-      # [5] db = ds = 0, while either theta or thetap is different from zero
-      # ------------------------------------------------------------------------
-      invalid <- (any(xparams[1:4] < 0)) |
-        (floor(xparams[7]) == 0 & xparams[2] != 1) |
-        (floor(xparams[8]) == 0 & xparams[2] != 0) |
-        (min(floor(xparams[9:10])) == 0 & sum(xparams[3:4]) != 0)
-
-      if (!invalid) {
-        if (xparams[1] == 1) xparams[3] <- 0.5
-        initials <- rbind(initials, xparams)
-      }
+    xparams <- c(params, eb, es, mub, mus, db, ds)
+    xparams[is.nan(xparams)] <- 0
 
-    } # for (buyfirst in c(TRUE, FALSE))
+    # Exclude initial parameter sets where:
+    # [1] one or more probability parameters are negative
+    # [2] eps.b or eps.s is zero
+    # [3] one or more rate parameters are non-positive
+    # [4] mub = 0, and delta != 1. If delta != 1, then there are
+    # positive information days, so we can estimate a positive mub.
+    # [5] mus = 0, and delta != 0. If delta != 0, then there are
+    # negative information days, so we can estimate a positive mus.
+    # [6] db = ds = 0, while either theta or thetap is different from zero
+    # ------------------------------------------------------------------------
+    invalid <- any(xparams[1:4] < 0) | any(xparams[5:6] == 0) |
+      (floor(xparams[7]) == 0 & xparams[2] != 1) |
+      (floor(xparams[8]) == 0 & xparams[2] != 0) |
+      (min(floor(xparams[9:10])) == 0 & sum(xparams[3:4]) != 0)
+
+    if (!invalid) {
+      if (xparams[1] == 1) xparams[3] <- 0.5
+      initials <- rbind(initials, xparams)
+    }
 
   } # for (k in seq_len(nrow(partitions)))
 
   # +++                                                                    +++ #
   # ++++++                                                              ++++++ #
   # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
-  # V. EVENTUALLY ADJUST INITIAL PARAMETER SETS USING 'RESTRICTED'           + #
+  # IV. EVENTUALLY ADJUST INITIAL PARAMETER SETS USING 'RESTRICTED'           + #
   # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
   # ++++++                                                              ++++++ #
   # +++                                                                    +++ #